npm - @workbench-ai/workbench-core - Versions diffs - 0.0.46 → 0.0.48 - Mend

@workbench-ai/workbench-core 0.0.46 → 0.0.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (39)

package/dist/execution-events.d.ts +2 -2
package/dist/execution-events.d.ts.map +1 -1
package/dist/execution-events.js +3 -3
package/dist/{execution-phases.d.ts → execution-evidence.d.ts} +8 -7
package/dist/execution-evidence.d.ts.map +1 -0
package/dist/{execution-phases.js → execution-evidence.js} +91 -51
package/dist/execution-graph.js +1 -2
package/dist/execution-jobs.js +1 -1
package/dist/execution-outputs.d.ts.map +1 -1
package/dist/execution-outputs.js +5 -10
package/dist/execution-runtime-types.d.ts +7 -3
package/dist/execution-runtime-types.d.ts.map +1 -1
package/dist/execution-traces.d.ts +11 -1
package/dist/execution-traces.d.ts.map +1 -1
package/dist/execution-traces.js +305 -2
package/dist/generic-spec.d.ts +8 -3
package/dist/generic-spec.d.ts.map +1 -1
package/dist/generic-spec.js +26 -37
package/dist/index.d.ts +22 -11
package/dist/index.d.ts.map +1 -1
package/dist/index.js +888 -218
package/dist/runtime-dockerfile.d.ts +14 -0
package/dist/runtime-dockerfile.d.ts.map +1 -0
package/dist/runtime-dockerfile.js +65 -0
package/dist/sandbox-backends/docker.d.ts.map +1 -1
package/dist/sandbox-backends/docker.js +9 -12
package/dist/sandbox-backends/index.d.ts.map +1 -1
package/dist/sandbox-backends/index.js +2 -1
package/dist/sandbox-inputs.d.ts.map +1 -1
package/dist/sandbox-inputs.js +1 -0
package/dist/sandbox-plane.d.ts +1 -0
package/dist/sandbox-plane.d.ts.map +1 -1
package/dist/sandbox-plane.js +12 -22
package/dist/trace-files.d.ts +2 -2
package/dist/trace-files.d.ts.map +1 -1
package/dist/trace-files.js +4 -4
package/package.json +3 -3
package/worker/sandbox-adapter-runner.cjs +22 -13
package/dist/execution-phases.d.ts.map +0 -1

package/dist/index.js CHANGED

@@ -1,28 +1,30 @@
-import { createHash } from "node:crypto";
+import { createHash, randomBytes } from "node:crypto";
 import os from "node:os";
 import path from "node:path";
+import { fileURLToPath } from "node:url";
 import YAML from "yaml";
-import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
-import { BENCHMARK_SPEC_FILE, engineCaseEnginePrivateFiles, engineCaseFilesForRuntimeInput, engineCaseSubjectVisibleFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchSubjectManifestPath, } from "./generic-spec.js";
+import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, parseWorkbenchAdapterManifest, readWorkbenchAdapterOperationResult, WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV, WORKBENCH_RUNTIME_CONTROL_URL_ENV, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
+import { BENCHMARK_SPEC_FILE, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchSubjectManifestPath, } from "./generic-spec.js";
 import { attachSandboxMetadataToJob, createWorkbenchSandboxFileStore, isSurfaceSnapshotFile, readWorkbenchExecutionSpec, } from "./sandbox-inputs.js";
-import { asRuntimeRecord, importNodeModule, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
-import { executeValidatedSandboxExecution, } from "./sandbox-plane.js";
+import { asRuntimeRecord, importNodeModule, isJsonPayload, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
+import { createWorkbenchExecutionCapability, createWorkbenchSandboxAllocation, collectExecutionCapabilityScopeIssues, collectSandboxAllocationScopeIssues, collectSandboxHandleScopeIssues, assertSandboxBackendSupportsNetworkPolicy, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
 import { createSandboxBackendPlaneForProvider, } from "./sandbox-backends/index.js";
 import { applyWorkbenchSubjectPatch } from "./subject-patch.js";
 import { assignUsageRole, completeUsageSummary, mergeUsageSummaries, normalizeUsageSummary, usageStats, } from "./execution-usage.js";
-import { traceFilePaths, workbenchTracePhaseDirectory, } from "./trace-files.js";
+import { traceFilePaths, workbenchTraceExecutionDirectory, } from "./trace-files.js";
 import { engineCaseForCase, } from "./execution-jobs.js";
-import { createWorkbenchExecutionEventPublisher, publishCommandPhaseEvent, } from "./execution-events.js";
-import { readWorkbenchExecutionPurpose } from "./execution-phases.js";
+import { createWorkbenchExecutionEventPublisher, publishCommandStepEvent, } from "./execution-events.js";
+import { readWorkbenchExecutionPurpose } from "./execution-evidence.js";
 import { adapterAuthEnv, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
-export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCaseEnginePrivateFiles, engineCaseFilesForRuntimeInput, engineCaseSubjectVisibleFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchSubjectManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
-export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
+export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchSubjectManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, runtimeNetwork, runtimeResources, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
+export { composeRuntimeDockerfileWithAdapterInstallers, } from "./runtime-dockerfile.js";
+export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
 export { adapterAuthEnv, createWorkbenchAdapterAuthBundle, defaultWorkbenchAdapterAuthStoreRoot, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, parseWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
 export { asRuntimeRecord, importNodeModule, nodeBuiltin, normalizeWorkbenchWorkerId, normalizeRuntimeRegistry, quoteShellArg, resolveDockerRuntimeImageRef, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
 export { assignUsageRole, extractExecutionUsageFromTrace, mergeUsageSummaries, } from "./execution-usage.js";
 export { createWorkbenchProgressStdoutParser, publishWorkbenchProgressStdoutEnvelope, } from "./execution-events.js";
 export { resolveSandboxTemplateImage, } from "./sandbox-backends/template-images.js";
-export { readOutputTraceFiles, workbenchTracePhaseDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
+export { readOutputTraceFiles, workbenchTraceExecutionDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
 export { assertWorkbenchAdapterOperationSupport, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterOperationIssues, collectWorkbenchAdapterOperationRequirements, ensureWorkbenchAdapterOutputDir, WORKBENCH_ADAPTER_RESULT_FILE, normalizeWorkbenchAdapterOperationRequest, normalizeWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, writeWorkbenchAdapterOperationResult, } from "@workbench-ai/workbench-protocol";
 export { applyWorkbenchSubjectPatch, } from "./subject-patch.js";
 export { createWorkbenchSandboxFileStore, createSandboxAdapterRequest, executionResultFromCompletedSandboxJob, materializeWorkbenchSandboxInput, readWorkbenchExecutionSpec, sanitizeWorkbenchExecutionJobForSandbox, } from "./sandbox-inputs.js";
@@ -31,8 +33,8 @@ export { createBaselineSubjectExecution, createBaselineSubjectJob, createWorkben
 export { addCapacity, capacityFits, runWorkbenchExecutionDag, subtractCapacity, workbenchJobDependencies, workbenchJobHostCost, workbenchJobResources, } from "./execution-scheduler.js";
 export { assertWorkbenchExecutionIsolation, collectWorkbenchExecutionIsolationIssues, validateWorkbenchExecutionOutputPayloads, } from "./execution-outputs.js";
 export { collectSandboxAllocationScopeIssues, collectExecutionCapabilityScopeIssues, collectSandboxHandleScopeIssues, createWorkbenchSandboxAllocation, createWorkbenchSandboxExecutionMetadata, createWorkbenchExecutionCapability, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
-export { buildSubjectCasePhaseRefs, buildWorkbenchTracePhases, isWorkbenchPhaseActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-phases.js";
-export { finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, } from "./execution-traces.js";
+export { buildSubjectCaseExecutionRefs, buildWorkbenchExecutionEvidence, isWorkbenchExecutionActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-evidence.js";
+export { buildWorkbenchTraceSessionsFromFiles, combineWorkbenchTraceSessions, finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, readWorkbenchExecutionTraceFiles, traceSessionLabel, } from "./execution-traces.js";
 export { DOCKER_SANDBOX_BACKEND, assertSandboxHostHealthForProvider, createDockerSandboxBackendDescriptor, createDockerSandboxPlane, resolveWorkbenchSandboxProviderName, sandboxProviderAdmissionForResources, sandboxProviderDefaultMaxConcurrentJobs, sandboxProviderLeaseScope, sandboxHostHealthExpectationForProvider, } from "./sandbox-backends/index.js";
 export const DEFAULT_ENVIRONMENT_VERSIONS = [
     {
@@ -142,7 +144,7 @@ export const DEFAULT_ENVIRONMENTS = [
     {
         id: "env_libreoffice_agent",
         name: "LibreOffice + Agent",
-        description: "Agent runtime with soffice and Python libraries for spreadsheet-heavy skill and rubric evaluations.",
+        description: "Agent runtime with soffice and Python libraries for spreadsheet-heavy evaluations.",
         currentVersionId: "envv_libreoffice_agent",
         builtIn: true,
         createdAt: "2026-04-29T00:00:00.000Z",
@@ -278,30 +280,36 @@ function adapterProtocolCommandSpec(adapter, operation, manifests = []) {
     return {
         use: "command",
         command: manifest ? workbenchAdapterOperationCommand(manifest, operation) : adapterCommandName(adapter.use),
+        executor: manifest ? workbenchAdapterOperationExecutor(manifest, operation) : "sandbox",
     };
 }
-function protocolPhaseForExecution(execution, manifests) {
-    const role = executionPurposeRole(execution.purpose);
-    const operation = execution.purpose === "improve" ? "optimizer.improve" : "subject.run";
+function protocolStepForExecution(execution, manifests) {
+    if (execution.purpose !== "improve") {
+        throw new Error(`Protocol execution step only supports improve executions, not ${execution.purpose}.`);
+    }
+    const operation = "optimizer.improve";
     const command = adapterProtocolCommandSpec(execution.adapter, operation, manifests);
     return {
-        kind: role,
+        kind: "optimizer",
         label: execution.purpose,
         operation,
+        executor: command.executor,
         adapter: execution.adapter,
         command: command.command,
     };
 }
-function attemptPhasesForExecution(execution, spec, manifests) {
+function attemptStepsForExecution(execution, spec, manifests) {
     void spec;
-    const enginePhase = {
+    const command = adapterProtocolCommandSpec(execution.adapter, "engine.run", manifests);
+    const engineStep = {
         kind: "engine",
         label: "engine",
         operation: "engine.run",
+        executor: command.executor,
         adapter: execution.adapter,
-        command: adapterProtocolCommandSpec(execution.adapter, "engine.run", manifests).command,
+        command: command.command,
     };
-    return [enginePhase];
+    return [engineStep];
 }
 function adapterConfigRecord(adapter, manifests = []) {
     const config = cloneJsonRecord(jsonRecord(adapter.with));
@@ -411,7 +419,10 @@ export function materializeWorkbenchRunResult(args) {
             .sort((left, right) => compareSampleOutputs(left.output, right.output));
         const outputJobIds = new Set(outputs.flatMap(({ jobs }) => jobs.map((job) => job.id)));
         const completedSampleKeys = new Set(outputs
-            .map(({ output }) => evaluationSampleGroupKeyFromOutput(output))
+            .flatMap(({ jobs, output }) => [
+            evaluationSampleGroupKeyFromOutput(output),
+            ...jobs.map(evaluationSampleGroupKeyFromJob),
+        ])
             .filter((key) => key !== null));
         const errorSampleJobs = [
             ...subjectJobs.filter((job) => job.status === "failed"),
@@ -422,12 +433,13 @@ export function materializeWorkbenchRunResult(args) {
             ...outputs.map(({ jobs, output }) => withJobUsage(output.sample, completed, jobs[0])),
             ...errorSamples,
         ].sort((left, right) => left.index - right.index || left.id.localeCompare(right.id));
-        const evalRecord = createEvaluationRecord(subjectId, samples);
+        const subjectName = normalizedSubjectDisplayName(args.spec.subject.name);
+        const evalRecord = createEvaluationRecord(subjectId, subjectName, samples);
         const usage = mergeUsageSummaries([
             subjectRevision.usage,
             ...samples.map((sample) => sample.usage),
         ]);
-        const metrics = evaluationMeanMetrics(createEvaluationRecord(subjectId, samples));
+        const metrics = evaluationMeanMetrics(evalRecord);
         const attemptIndex = subjectRevision.attemptIndex;
         const evaluationTraces = [
             ...outputs.flatMap(({ output }) => output.traces),
@@ -457,6 +469,7 @@ export function materializeWorkbenchRunResult(args) {
         }
         const record = {
             id: subjectId,
+            ...(subjectName ? { name: subjectName } : {}),
             ordinal: args.existingSubjectCount + subjects.length,
             benchmarkFingerprint: args.benchmarkFingerprint,
             subjectFingerprint: args.subjectFingerprint ?? materializedSubjectFingerprint(args.spec, subjectRevision.files),
@@ -472,7 +485,7 @@ export function materializeWorkbenchRunResult(args) {
             meta,
         };
         subjects.push(record);
-        evaluations.push(createEvaluationResultRecord({
+        evaluations.push(createEvaluationScorecard({
             runId: args.runId,
             benchmarkFingerprint: args.benchmarkFingerprint,
             createdAt: args.startedAt,
@@ -528,6 +541,8 @@ function materializedSubjectFingerprint(spec, files) {
     hash.update("workbench-subject-v1\0");
     hash.update("materialized\0runner\0");
     hash.update(JSON.stringify(spec.run));
+    hash.update("prepare");
+    hash.update(JSON.stringify(spec.subject.prepare ?? null));
     for (const file of filterSubjectSourceFiles(files).slice().sort((left, right) => left.path.localeCompare(right.path))) {
         hash.update("\0file\0");
         hash.update(file.path);
@@ -547,14 +562,15 @@ function materializedSubjectFiles(args) {
     }
     return [...byPath.values()].sort((left, right) => left.path.localeCompare(right.path));
 }
-function createEvaluationResultRecord(args) {
+function createEvaluationScorecard(args) {
     const evaluation = args.evaluation;
     return {
-        id: evaluationResultId(args.runId, args.subject.id),
+        id: evaluationScorecardId(args.runId, args.subject.id),
         runId: args.runId,
         benchmarkFingerprint: args.benchmarkFingerprint,
         subjectFingerprint: args.subject.subjectFingerprint,
         subjectId: args.subject.id,
+        ...(args.subject.name ? { subjectName: args.subject.name } : {}),
         createdAt: args.createdAt,
         updatedAt: evaluation.finishedAt ?? args.createdAt,
         status: evaluation.status,
@@ -568,7 +584,7 @@ function createEvaluationResultRecord(args) {
         evaluation,
     };
 }
-function evaluationResultId(runId, subjectId) {
+export function evaluationScorecardId(runId, subjectId) {
     const runPart = runId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
     const subjectPart = subjectId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
     return `eval_${runPart}_${subjectPart}`;
@@ -584,7 +600,7 @@ export function isWorkbenchInternalOutputPath(filePath) {
         normalized === "sandbox-environment.json" ||
         normalized === "sandbox_error.log" ||
         normalized === "exit_code" ||
-        /^[a-z-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
+        /^[a-z_-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
 }
 export function createSubjectRevisionTraceInputFiles(args) {
     const files = [];
@@ -620,6 +636,23 @@ export function createSubjectRevisionTraceInputFiles(args) {
     }, null, 2)}\n`));
     return dedupeSurfaceFiles(files);
 }
+export function createSubjectEvaluationTraceInputFiles(args) {
+    const subject = args.subject;
+    if (!subject?.eval && !subject?.metrics) {
+        return [];
+    }
+    const filePath = normalizeRelativePath(args.path ?? `base-subject/${subject.id}/evaluation.json`);
+    const payload = {
+        kind: "subject_evaluation",
+        subjectId: subject.id,
+        status: subject.status,
+        metrics: subject.metrics ?? null,
+        fileChanges: subject.fileChanges,
+        eval: subject.eval ?? null,
+        prompt: subject.prompt ?? null,
+    };
+    return [textSurfaceFile(filePath, `${JSON.stringify(payload, null, 2)}\n`)];
+}
 function isTerminalExecutionJob(job) {
     return job.kind === "execute" && (job.status === "succeeded" ||
         job.status === "failed" ||
@@ -866,16 +899,14 @@ export function createSubjectFilePreview(args) {
     };
 }
 export function createCaseReview(args) {
-    const preferredSampleIndex = uniquePhaseSampleIndex(args.phases ?? []);
-    const sampleMatchesCase = (sample) => sample.id === args.caseId ||
-        sample.id.startsWith(`${args.caseId}__`) ||
-        (sample.cases ?? []).some((entry) => entry.id === args.caseId || entry.id.startsWith(`${args.caseId}__`));
+    const preferredSampleIndex = uniqueExecutionSampleIndex(args.executions ?? []);
+    const sampleMatchesCase = (sample) => (sample.cases ?? []).some((entry) => entry.id === args.caseId);
     const samples = args.subject.eval?.samples ?? [];
     const sampleResult = samples.find((sample) => typeof preferredSampleIndex === "number" &&
         sample.index === preferredSampleIndex &&
         sampleMatchesCase(sample)) ?? samples.find(sampleMatchesCase);
-    const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId || entry.id.startsWith(`${args.caseId}__`));
-    if (!sampleResult && (args.phases?.length ?? 0) > 0) {
+    const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId);
+    if (!sampleResult && (args.executions?.length ?? 0) > 0) {
         return {
             subjectId: args.subject.id,
             caseId: args.caseId,
@@ -884,7 +915,7 @@ export function createCaseReview(args) {
                 ? { sampleIndex: preferredSampleIndex }
                 : {}),
             metrics: {},
-            phases: args.phases ?? [],
+            executions: args.executions ?? [],
             criteria_results: [],
         };
     }
@@ -893,28 +924,21 @@ export function createCaseReview(args) {
     }
     const durationMs = typeof caseResult?.durationMs === "number"
         ? caseResult.durationMs
-        : sampleResult?.cases?.length === 1 &&
-            typeof sampleResult.durationMs === "number"
-            ? sampleResult.durationMs
-            : !caseResult && typeof sampleResult.durationMs === "number"
-                ? sampleResult.durationMs
-                : undefined;
-    const sampleStatus = sampleResult.status === "planned" ? undefined : sampleResult.status;
-    const status = caseResult?.status ?? sampleStatus;
+        : undefined;
     return {
         subjectId: args.subject.id,
-        caseId: caseResult?.id ?? sampleResult.id,
+        caseId: caseResult?.id ?? args.caseId,
         caseLabel: caseResult?.label ?? args.caseId,
         sampleId: sampleResult.id,
         sampleIndex: sampleResult.index,
-        ...(status ? { status } : {}),
-        metrics: caseResult?.metrics ?? sampleResult.metrics ?? {},
+        ...(caseResult?.status ? { status: caseResult.status } : {}),
+        metrics: caseResult?.metrics ?? {},
         ...(typeof durationMs === "number" ? { durationMs } : {}),
         ...(caseResult?.source ? { source: caseResult.source } : {}),
-        ...((caseResult?.feedback ?? sampleResult.feedback) !== undefined
-            ? { feedback: caseResult?.feedback ?? sampleResult.feedback }
+        ...(caseResult?.feedback !== undefined
+            ? { feedback: caseResult.feedback }
             : {}),
-        phases: args.phases ?? [],
+        executions: args.executions ?? [],
         criteria_results: (caseResult?.criteria ?? []).map((criterion) => ({
             criterion_id: criterion.criterion_id,
             pass: criterion.pass,
@@ -924,9 +948,9 @@ export function createCaseReview(args) {
         })),
     };
 }
-function uniquePhaseSampleIndex(phases) {
-    const sampleIndices = new Set(phases
-        .map((phase) => phase.sampleIndex)
+function uniqueExecutionSampleIndex(executions) {
+    const sampleIndices = new Set(executions
+        .map((execution) => execution.sampleIndex)
         .filter((index) => typeof index === "number"));
     if (sampleIndices.size !== 1) {
         return null;
@@ -951,6 +975,7 @@ function parseAuthoredWorkbenchSourceSpec(source) {
             name: resolved.subject.name,
             description: resolved.subject.description,
             files: { path: resolved.subject.files.path },
+            ...(resolved.subject.prepare ? { prepare: { ...resolved.subject.prepare } } : {}),
             run: runSpecFromInvocation(resolved.run),
         },
         ...(resolved.optimizer
@@ -1101,11 +1126,18 @@ export async function executeWorkbenchExecutionJob(args, options) {
         const runtimeArgs = adapterAuthProfiles.length > 0
             ? { ...args, adapterAuthProfiles }
             : args;
-        const executionForSandbox = readWorkbenchExecutionSpec(runtimeArgs.job);
+        const executionForRuntime = readWorkbenchExecutionSpec(runtimeArgs.job);
+        const executor = workbenchExecutionExecutorForRuntimeInput(runtimeArgs);
+        if (executor === "host") {
+            return await withWorkbenchRuntimeControlServer(runtimeArgs, options, startedAt, async (adapterRuntimeEnv) => executeAdapterInCurrentRuntime({
+                ...runtimeArgs,
+                adapterRuntimeEnv,
+            }, executionForRuntime, startedAt, createWorkbenchExecutionCapability(executionForRuntime, { now: startedAt })));
+        }
         const fileStore = createWorkbenchSandboxFileStore(runtimeArgs);
         const planeFactory = options.createSandboxPlaneForProvider ?? createSandboxBackendPlaneForProvider;
         const plane = planeFactory(options.sandboxProvider, runtimeArgs, startedAt, fileStore);
-        const validated = await executeValidatedSandboxExecution(plane, executionForSandbox, {
+        const validated = await executeValidatedSandboxExecution(plane, executionForRuntime, {
             now: startedAt,
             runnerId: resolveWorkbenchWorkerId([
                 process.env.WORKBENCH_WORKER_ID,
@@ -1121,6 +1153,215 @@ export async function executeWorkbenchExecutionJob(args, options) {
         return failWorkbenchRunJob(args.job, startedAt, error);
     }
 }
+export function workbenchExecutionExecutorForRuntimeInput(args) {
+    if (args.runtimeControlOperation) {
+        return "sandbox";
+    }
+    const execution = readWorkbenchExecutionSpec(args.job);
+    const operation = adapterOperationForExecutionPurpose(execution.purpose);
+    if (!operation) {
+        return "sandbox";
+    }
+    const manifest = args.adapterManifests?.find((entry) => entry.id === execution.adapter.use);
+    return manifest ? workbenchAdapterOperationExecutor(manifest, operation) : "sandbox";
+}
+function adapterOperationForExecutionPurpose(purpose) {
+    if (purpose === "improve") {
+        return "optimizer.improve";
+    }
+    if (purpose === "attempt") {
+        return "engine.run";
+    }
+    return null;
+}
+const RUNTIME_CONTROL_MAX_BODY_BYTES = 512 * 1024 * 1024;
+async function withWorkbenchRuntimeControlServer(args, options, startedAt, run) {
+    const [{ createServer }] = await Promise.all([
+        importNodeModule(nodeBuiltin("http")),
+    ]);
+    const token = randomBytes(24).toString("base64url");
+    const server = createServer((request, response) => {
+        void handleWorkbenchRuntimeControlHttpRequest({
+            request,
+            response,
+            token,
+            args,
+            options,
+            startedAt,
+        });
+    });
+    const url = await new Promise((resolve, reject) => {
+        server.once("error", reject);
+        server.listen(0, "127.0.0.1", () => {
+            server.off("error", reject);
+            const address = server.address();
+            if (!address || typeof address === "string") {
+                reject(new Error("Workbench runtime-control server did not expose a local TCP address."));
+                return;
+            }
+            resolve(`http://127.0.0.1:${address.port}`);
+        });
+    });
+    try {
+        return await run({
+            [WORKBENCH_RUNTIME_CONTROL_URL_ENV]: url,
+            [WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV]: token,
+        });
+    }
+    finally {
+        await new Promise((resolve) => server.close(() => resolve()));
+    }
+}
+async function handleWorkbenchRuntimeControlHttpRequest(args) {
+    const { request, response } = args;
+    try {
+        if (request.method !== "POST" || request.url !== "/v1/operation-sequence") {
+            writeRuntimeControlJson(response, 404, { error: "Unknown Workbench runtime-control endpoint." });
+            return;
+        }
+        if (request.headers.authorization !== `Bearer ${args.token}`) {
+            writeRuntimeControlJson(response, 401, { error: "Workbench runtime-control token is invalid." });
+            return;
+        }
+        const parsed = JSON.parse(await readRuntimeControlBody(request));
+        const controlRequest = normalizeRuntimeControlOperationSequenceRequest(parsed);
+        const result = await executeRuntimeControlOperationSequenceInSandbox(args.args, args.options, args.startedAt, controlRequest);
+        writeRuntimeControlJson(response, 200, result);
+    }
+    catch (error) {
+        writeRuntimeControlJson(response, 500, {
+            error: error instanceof Error ? error.stack ?? error.message : String(error),
+        });
+    }
+}
+function writeRuntimeControlJson(response, statusCode, payload) {
+    response.statusCode = statusCode;
+    response.setHeader("content-type", "application/json");
+    response.end(`${JSON.stringify(payload, null, 2)}\n`);
+}
+function readRuntimeControlBody(request) {
+    return new Promise((resolve, reject) => {
+        const chunks = [];
+        let size = 0;
+        request.on("data", (chunk) => {
+            size += chunk.length;
+            if (size > RUNTIME_CONTROL_MAX_BODY_BYTES) {
+                reject(new Error("Workbench runtime-control request body is too large."));
+                request.destroy();
+                return;
+            }
+            chunks.push(chunk);
+        });
+        request.on("error", reject);
+        request.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
+    });
+}
+function normalizeRuntimeControlOperationSequenceRequest(value) {
+    if (!value || typeof value !== "object" || Array.isArray(value)) {
+        throw new Error("Workbench runtime-control operation sequence request must be an object.");
+    }
+    const record = value;
+    if (!Array.isArray(record.operations) || record.operations.length === 0) {
+        throw new Error("Workbench runtime-control operation sequence requires at least one operation.");
+    }
+    const inputs = normalizeRuntimeControlInputs(record.inputs);
+    return {
+        ...(inputs ? { inputs } : {}),
+        operations: record.operations.map((entry, index) => normalizeRuntimeControlOperation(entry, `operations[${index}]`)),
+        ...(typeof record.prepare === "boolean" ? { prepare: record.prepare } : {}),
+        ...(typeof record.collectWorkspace === "boolean" ? { collectWorkspace: record.collectWorkspace } : {}),
+    };
+}
+function normalizeRuntimeControlInputs(value) {
+    if (value === undefined) {
+        return undefined;
+    }
+    if (!value || typeof value !== "object" || Array.isArray(value)) {
+        throw new Error("Workbench runtime-control inputs must be an object.");
+    }
+    const record = value;
+    const inputs = {};
+    if (hasOwn(record, "subject")) {
+        inputs.subject = normalizeRuntimeControlFiles(record.subject, "inputs.subject");
+    }
+    if (hasOwn(record, "case")) {
+        inputs.case = normalizeRuntimeControlFiles(record.case, "inputs.case");
+    }
+    if (hasOwn(record, "enginePrivate")) {
+        inputs.enginePrivate = normalizeRuntimeControlFiles(record.enginePrivate, "inputs.enginePrivate");
+    }
+    if (hasOwn(record, "traces")) {
+        inputs.traces = normalizeRuntimeControlFiles(record.traces, "inputs.traces");
+    }
+    if (hasOwn(record, "workspace")) {
+        inputs.workspace = normalizeRuntimeControlFiles(record.workspace, "inputs.workspace");
+    }
+    if (hasOwn(record, "output")) {
+        inputs.output = normalizeRuntimeControlFiles(record.output, "inputs.output");
+    }
+    return inputs;
+}
+function normalizeRuntimeControlFiles(value, label) {
+    if (value === undefined) {
+        return [];
+    }
+    if (!Array.isArray(value)) {
+        throw new Error(`Workbench runtime-control ${label} must be an array.`);
+    }
+    return value.map((entry, index) => {
+        if (!isSurfaceSnapshotFile(entry)) {
+            throw new Error(`Workbench runtime-control ${label}[${index}] must be a surface snapshot file.`);
+        }
+        return { ...entry, path: normalizeRelativePath(entry.path) };
+    });
+}
+function hasOwn(value, key) {
+    return Object.prototype.hasOwnProperty.call(value, key);
+}
+function normalizeRuntimeControlOperation(value, label) {
+    if (!value || typeof value !== "object" || Array.isArray(value)) {
+        throw new Error(`Workbench runtime-control ${label} must be an object.`);
+    }
+    const record = value;
+    const operation = record.operation;
+    if (operation !== "engine.resolve" &&
+        operation !== "engine.run" &&
+        operation !== "subject.run" &&
+        operation !== "optimizer.improve") {
+        throw new Error(`Workbench runtime-control ${label}.operation is invalid.`);
+    }
+    const invocation = record.invocation;
+    if (!invocation || typeof invocation !== "object" || Array.isArray(invocation)) {
+        throw new Error(`Workbench runtime-control ${label}.invocation must be an object.`);
+    }
+    const invocationRecord = invocation;
+    if (typeof invocationRecord.use !== "string" || invocationRecord.use.length === 0) {
+        throw new Error(`Workbench runtime-control ${label}.invocation.use is required.`);
+    }
+    const withConfig = invocationRecord.with === undefined
+        ? {}
+        : isJsonPayload(invocationRecord.with)
+            ? invocationRecord.with
+            : null;
+    if (withConfig === null) {
+        throw new Error(`Workbench runtime-control ${label}.invocation.with must be JSON.`);
+    }
+    if (invocationRecord.auth !== undefined && !isJsonPayload(invocationRecord.auth)) {
+        throw new Error(`Workbench runtime-control ${label}.invocation.auth must be JSON.`);
+    }
+    return {
+        operation,
+        invocation: {
+            use: invocationRecord.use,
+            with: withConfig,
+            ...(invocationRecord.auth !== undefined ? { auth: invocationRecord.auth } : {}),
+            ...(typeof invocationRecord.command === "string" && invocationRecord.command.trim()
+                ? { command: invocationRecord.command }
+                : {}),
+        },
+        ...(typeof record.label === "string" && record.label.trim() ? { label: record.label } : {}),
+    };
+}
 async function explicitAdapterAuthProfilesForExecution(execution, args, loadLocalAdapterProfiles) {
     const required = requiredAdapterAuthTargetsForExecution(execution, args);
     if (required.length === 0) {
@@ -1155,7 +1396,7 @@ function adapterAuthTargetKey(target) {
 export function workbenchExecutionPurpose(job) { // Exported wrapper: returns whatever readWorkbenchExecutionPurpose yields for `job`.
     return readWorkbenchExecutionPurpose(job);
 }
-export async function executeAdapterInCurrentSandboxRuntime(args, execution, startedAt, capability) {
+export async function executeAdapterInCurrentRuntime(args, execution, startedAt, capability) {
     const eventPublisher = createWorkbenchExecutionEventPublisher({
         projectId: args.job.projectId,
         runId: args.job.runId,
@@ -1174,10 +1415,10 @@ export async function executeAdapterInCurrentSandboxRuntime(args, execution, sta
     };
     try {
         if (execution.purpose === "improve") {
-            return await executeSubjectRevisionExecutionInSandbox(runtimeInput, execution, startedAt, capability, eventPublisher);
+            return await executeSubjectRevisionExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
         }
         if (execution.purpose === "attempt") {
-            return await executeAttemptExecutionInSandbox(runtimeInput, execution, startedAt, capability, eventPublisher);
+            return await executeAttemptExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
         }
         throw new Error(`Unsupported execution purpose ${execution.purpose}.`);
     }
@@ -1274,7 +1515,7 @@ function adapterAuthRequest(bundles, root, currentAdapterId) {
     }
     return entries;
 }
-function adapterAuthRequestForPhase(args, adapterId) {
+function adapterAuthRequestForStep(args, adapterId) {
     const profiles = (args.adapterAuthProfiles ?? [])
         .map((bundle) => sanitizeWorkbenchAdapterAuthBundle(bundle));
     if (profiles.length === 0) {
@@ -1295,12 +1536,19 @@ function adapterAuthProfilesForExecution(execution, args) {
 }
 function requiredAdapterAuthTargetsForExecution(execution, args) { // Returns the normalized adapter-auth targets collected for this execution's adapter invocations.
     const manifests = args.adapterManifests ?? []; // Absent manifests are treated as an empty list.
-    return collectWorkbenchAdapterAuthRequirements(adapterInvocationsForExecution(execution, args.spec), manifests)
+    return collectWorkbenchAdapterAuthRequirements(adapterInvocationsForExecution(execution, args), manifests) // The whole args object is passed (not just args.spec) because the helper also reads args.runtimeControlOperation.
         .map((target) => normalizeWorkbenchAdapterAuthTarget(target));
 }
-function adapterInvocationsForExecution(execution, spec) {
+function adapterInvocationsForExecution(execution, args) { // Lists the adapter invocations relevant to this execution (used above to collect auth requirements).
+    if (args.runtimeControlOperation) { // A runtime-control request takes precedence: only its operations' invocations are returned.
+        return uniqueAdapterInvocations(args.runtimeControlOperation.operations.map((operation) => ({
+            use: operation.invocation.use,
+            with: operation.invocation.with ?? {}, // A missing `with` becomes an empty object.
+            ...(operation.invocation.auth !== undefined ? { auth: operation.invocation.auth } : {}), // `auth` is copied only when defined; `command` is not carried over.
+        })));
+    }
     if (execution.purpose === "attempt") {
-        return uniqueAdapterInvocations([execution.adapter, spec.run]);
+        return uniqueAdapterInvocations([execution.adapter, args.spec.run]); // Attempts involve both the execution adapter and spec.run.
     }
     return [execution.adapter]; // Every other purpose involves the execution adapter only.
 }
@@ -1341,7 +1589,7 @@ function completedJobFromSandboxResult(fallbackJob, startedAt, result) {
     }
     return attachSandboxMetadataToJob(failWorkbenchRunJob(fallbackJob, result.startedAt || startedAt, result.error ?? `Sandbox execution ${result.status}.`, result.finishedAt), asRuntimeRecord(result.metadata).sandbox);
 }
-async function executeSubjectRevisionExecutionInSandbox(args, execution, startedAt, capability, eventPublisher) {
+async function executeSubjectRevisionExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
     const { workload, result } = await runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher);
     if (result.error || (result.exitCode ?? 0) !== 0) {
         return failWorkbenchRunJob(args.job, startedAt, result.error ?? `Adapter ${execution.adapter.use} exited with status ${result.exitCode}.`, result.finishedAt, result);
@@ -1382,7 +1630,7 @@ async function executeSubjectRevisionExecutionInSandbox(args, execution, started
         },
     };
 }
-async function executeAttemptExecutionInSandbox(args, execution, startedAt, capability, eventPublisher) {
+async function executeAttemptExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
     const workload = createWorkbenchRunWorkload({
         job: args.job,
         spec: args.spec,
@@ -1391,7 +1639,7 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
         engineCases: args.engineCases,
         traceFiles: args.traceFiles,
     });
-    const workloadResult = await runHostedCommandExecutionPhases(args, workload, attemptPhasesForExecution(execution, args.spec, args.adapterManifests), startedAt, {
+    const workloadResult = await runHostedCommandExecutionSteps(args, workload, attemptStepsForExecution(execution, args.spec, args.adapterManifests), startedAt, {
         capability,
         eventPublisher,
     });
@@ -1405,10 +1653,7 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
         return failWorkbenchRunJob(args.job, startedAt, "Attempt engine must return a workbench-result result with a finite numeric score.", workloadResult.finishedAt, workloadResult);
     }
     const finishedAt = workloadResult.finishedAt ?? new Date().toISOString();
-    const usage = mergeUsageSummaries([
-        workloadResult.usage,
-        engineResult.usage,
-    ]);
+    const usage = attemptUsageSummary(workloadResult.usage, engineResult.usage);
     const sample = evaluateSample({
         subjectId: workload.subjectId,
         files: workloadResult.files,
@@ -1453,6 +1698,282 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
         },
     };
 }
+export async function executeRuntimeControlOperationSequenceInCurrentRuntime(args, execution, startedAt, capability) { // Runs args.runtimeControlOperation's operations as hosted command steps and returns args.job with status, timestamps and output filled in.
+    void execution; // Evaluated and discarded; this parameter is not otherwise used (the spec is re-read from args.job below).
+    void capability; // Evaluated and discarded; this parameter is not otherwise used in this function.
+    if (!args.runtimeControlOperation) {
+        throw new Error("Runtime-control operation sequence is missing from the sandbox request.");
+    }
+    const childExecution = readWorkbenchExecutionSpec(args.job);
+    const workload = createWorkbenchRunWorkload({
+        job: args.job,
+        spec: args.spec,
+        baseFiles: args.baseFiles,
+        engineResolveFiles: args.engineResolveFiles,
+        engineCases: args.engineCases,
+        traceFiles: args.traceFiles,
+    });
+    const runtimeArgs = { ...args }; // Shallow copy so the delete below does not mutate the caller's args object.
+    delete runtimeArgs.adapterRuntimeEnv; // adapterRuntimeEnv is withheld from both auth materialization and step execution.
+    const adapterAuth = await materializeSandboxAdapterAuth(runtimeArgs, childExecution);
+    let result;
+    try {
+        result = await runHostedCommandExecutionSteps({
+            ...runtimeArgs,
+            ...(adapterAuth.root ? { adapterAuthRoot: adapterAuth.root } : {}), // Only set when an auth root was produced.
+            ...(Object.keys(adapterAuth.env).length > 0 // Only set when the materialized auth env has at least one key.
+                ? { adapterAuthEnv: adapterAuth.env }
+                : {}),
+        }, workload, args.runtimeControlOperation.operations.map((operation, index) => runtimeControlStepForOperation(operation, index, args.adapterManifests)), startedAt, { // One step per operation, in request order.
+            runSubjectPrepare: args.runtimeControlOperation.prepare ?? false, // Subject prepare runs only when the request sets `prepare`.
+            workspaceFiles: args.runtimeControlOperation.inputs?.workspace ?? [],
+            outputFiles: args.runtimeControlOperation.inputs?.output ?? [],
+            collectWorkspace: args.runtimeControlOperation.collectWorkspace ?? false,
+        });
+    }
+    finally {
+        if (adapterAuth.cleanup) {
+            await adapterAuth.cleanup().catch(() => undefined); // Best-effort: a rejected cleanup is ignored so it cannot mask the step outcome.
+        }
+    }
+    const finishedAt = result.finishedAt ?? new Date().toISOString(); // Falls back to the current time when the result has no finishedAt.
+    const failed = Boolean(result.error) || (result.exitCode ?? 0) !== 0; // Failed when an error is present or the exit code is non-zero; a missing exit code counts as 0.
+    return {
+        ...args.job,
+        status: failed ? "failed" : "succeeded",
+        attempt: Math.max(1, args.job.attempt), // Recorded attempt number is raised to at least 1.
+        startedAt,
+        finishedAt,
+        updatedAt: finishedAt,
+        ...(failed ? { error: result.error ?? `Runtime-control operation sequence exited with status ${result.exitCode}.` } : {}), // `error` is present only on failure.
+        output: runtimeControlJobOutput(result, !failed),
+    };
+}
+async function executeRuntimeControlOperationSequenceInSandbox(args, options, startedAt, request) { // Builds a child job for `request`, runs it through a sandbox plane, and returns the normalized runtime-control result.
+    const childArgs = createRuntimeControlSandboxInput(args, request);
+    const execution = readWorkbenchExecutionSpec(childArgs.job); // Execution spec of the child job, not of the parent.
+    const fileStore = createWorkbenchSandboxFileStore(childArgs);
+    const planeFactory = options.createSandboxPlaneForProvider ?? createSandboxBackendPlaneForProvider; // Caller-supplied factory wins over the default.
+    const plane = planeFactory(options.sandboxProvider, childArgs, startedAt, fileStore);
+    assertSandboxBackendSupportsNetworkPolicy(plane.backend, execution);
+    const sandboxOptions = {
+        now: startedAt,
+        runnerId: resolveWorkbenchWorkerId([ // NOTE(review): presumably picks the first usable candidate and otherwise "local-runner" — confirm in resolveWorkbenchWorkerId.
+            process.env.WORKBENCH_WORKER_ID,
+            process.env.EC2_INSTANCE_ID,
+            os.hostname(),
+            process.env.HOSTNAME,
+        ], "local-runner"),
+        fileStore,
+    };
+    const inputs = await fileStore.materializeInputs(execution);
+    const environment = plane.prepareEnvironment // Uses the plane's prepareEnvironment hook when it exists; otherwise a descriptor built from the backend name and the execution's sandbox kind/ref.
+        ? await plane.prepareEnvironment(execution, sandboxOptions)
+        : {
+            backend: plane.backend.name,
+            kind: execution.sandbox.kind,
+            ref: execution.sandbox.ref,
+        };
+    const allocation = createWorkbenchSandboxAllocation(execution, {
+        backend: plane.backend.name,
+        runnerId: sandboxOptions.runnerId,
+        now: startedAt,
+    });
+    const capability = createWorkbenchExecutionCapability(execution, { now: startedAt });
+    assertRuntimeControlScope("Runtime-control sandbox allocation", collectSandboxAllocationScopeIssues(allocation, execution, { now: startedAt })); // Throws if any allocation scope issues are reported.
+    assertRuntimeControlScope("Runtime-control execution capability", collectExecutionCapabilityScopeIssues(capability, execution, { now: startedAt })); // Throws if any capability scope issues are reported.
+    const sandbox = await plane.createSandbox({
+        execution,
+        environment,
+        allocation,
+        capability,
+        inputs,
+    }, sandboxOptions);
+    assertRuntimeControlScope("Runtime-control sandbox handle", collectSandboxHandleScopeIssues(sandbox, allocation, execution)); // NOTE(review): a throw here happens before the try/finally, so destroySandbox is not called for this sandbox.
+    let result;
+    try {
+        result = await plane.exec({
+            execution,
+            environment,
+            sandbox,
+            allocation,
+            capability,
+            inputs,
+        }, sandboxOptions);
+    }
+    finally {
+        await plane.destroySandbox(sandbox, sandboxOptions); // Sandbox is destroyed whether exec resolved or threw.
+    }
+    const completedJob = completedJobFromSandboxResult(childArgs.job, startedAt, result);
+    return runtimeControlResultFromCompletedJob(completedJob);
+}
+function createRuntimeControlSandboxInput(args, request) { // Derives the sandbox input (child job, files, engine case) for a runtime-control request from the parent job's args.
+    const parentExecution = readWorkbenchExecutionSpec(args.job);
+    const parentWorkload = createWorkbenchRunWorkload({
+        job: args.job,
+        spec: args.spec,
+        baseFiles: args.baseFiles,
+        engineResolveFiles: args.engineResolveFiles,
+        engineCases: args.engineCases,
+        traceFiles: args.traceFiles,
+    });
+    const nonce = runtimeControlNonce();
+    const childExecutionId = `${parentExecution.id}:runtime:${nonce}`; // Child ids are the parent ids suffixed with ":runtime:<nonce>".
+    const childJobId = `${args.job.id}:runtime:${nonce}`;
+    const parentInput = asRuntimeRecord(args.job.input);
+    const publicFiles = runtimeControlInputFiles(request.inputs, "case", parentWorkload.engineCase ? engineCasePublicFiles(parentWorkload.engineCase) : []); // Each file group comes from request.inputs when that key is present, else from the parent workload.
+    const privateFiles = runtimeControlInputFiles(request.inputs, "enginePrivate", parentWorkload.engineCase ? engineCasePrivateFiles(parentWorkload.engineCase) : []);
+    const subjectFiles = runtimeControlInputFiles(request.inputs, "subject", parentWorkload.subjectFiles);
+    const traceFiles = runtimeControlInputFiles(request.inputs, "traces", parentWorkload.traceFiles);
+    const adapter = request.operations[request.operations.length - 1]?.invocation; // Invocation of the last operation; undefined when the request has no operations.
+    const childExecution = {
+        ...parentExecution,
+        id: childExecutionId,
+        outputs: [], // The child execution declares no outputs.
+        adapter: adapter // Last operation's invocation becomes the child adapter; otherwise the parent's adapter is kept.
+            ? {
+                use: adapter.use,
+                with: adapter.with ?? {},
+                ...(adapter.auth !== undefined ? { auth: adapter.auth } : {}),
+            }
+            : parentExecution.adapter,
+        metadata: {
+            ...asRuntimeRecord(parentExecution.metadata),
+            runtimeControl: true, // Marks the child execution as a runtime-control execution.
+            caseId: parentWorkload.caseId,
+        },
+    };
+    const engineCase = {
+        id: parentWorkload.caseId,
+        case: parentWorkload.engineCaseSpec ?? { // Without a parent case spec, a minimal one is built from the parent prompt.
+            version: 3,
+            prompt: parentWorkload.prompt,
+        },
+        files: {
+            public: publicFiles,
+            private: privateFiles,
+        },
+    };
+    const childJob = {
+        ...args.job,
+        id: childJobId,
+        input: {
+            ...parentInput,
+            execution: childExecution,
+            caseId: parentWorkload.caseId,
+        },
+    };
+    const childArgs = {
+        ...args,
+        job: childJob,
+        baseFiles: subjectFiles,
+        engineResolveFiles: [...publicFiles, ...privateFiles],
+        engineCases: [engineCase], // The child sees exactly one engine case.
+        traceFiles,
+        runtimeControlOperation: request,
+    };
+    delete childArgs.adapterRuntimeEnv; // Removed from the copy only; the caller's args object is not mutated.
+    delete childArgs.workspaceRoot; // Without workspaceRoot the child does not reuse the parent's workspace directory (see createRuntimeWorkspaceRoot).
+    return childArgs;
+}
+function runtimeControlInputFiles(inputs, key, fallback) { // Returns cloned files for `key`: the request's own entry when the key is present on `inputs`, otherwise `fallback`.
+    if (inputs && Object.prototype.hasOwnProperty.call(inputs, key)) { // Presence of the own key decides, so an explicit null/undefined value yields [] instead of the fallback.
+        return cloneSurfaceFiles(inputs[key] ?? []);
+    }
+    return cloneSurfaceFiles(fallback);
+}
+function runtimeControlStepForOperation(operation, index, manifests = []) { // Converts one runtime-control operation into the step object consumed by runHostedCommandExecutionSteps.
+    const command = operation.invocation.command?.trim() // A non-blank explicit command (trimmed) wins; otherwise the command comes from adapterProtocolCommandSpec.
+        || adapterProtocolCommandSpec({
+            use: operation.invocation.use,
+            with: operation.invocation.with ?? {},
+            ...(operation.invocation.auth !== undefined ? { auth: operation.invocation.auth } : {}),
+        }, operation.operation, manifests).command;
+    return {
+        kind: operation.operation === "subject.run" // "subject.run" -> subject, "optimizer.improve" -> optimizer, anything else -> engine.
+            ? "subject"
+            : operation.operation === "optimizer.improve"
+                ? "optimizer"
+                : "engine",
+        label: operation.label ?? `${operation.operation.replace(".", "_")}_${index + 1}`, // Default label: operation name with its first "." replaced by "_", plus the 1-based position.
+        operation: operation.operation,
+        executor: "sandbox", // Runtime-control steps always use the "sandbox" executor.
+        adapter: {
+            use: operation.invocation.use,
+            with: operation.invocation.with ?? {},
+            ...(operation.invocation.auth !== undefined ? { auth: operation.invocation.auth } : {}),
+        },
+        command,
+    };
+}
+function runtimeControlResultFromCompletedJob(job) { // Normalizes a completed job's output into a runtime-control result.
+    return normalizeRuntimeControlResultOutput(asRuntimeRecord(job.output), job.status === "succeeded", job.error); // `ok` requires status "succeeded"; job.error is the fallback error text.
+}
+function runtimeControlJobOutput(result, ok) { // Builds the job `output` payload from a workload result by passing a candidate object through the shared normalizer.
+    return normalizeRuntimeControlResultOutput({
+        ok,
+        files: result.files,
+        fileChanges: result.fileChanges,
+        ...(result.operationResults ? { operationResults: result.operationResults } : {}), // Optional fields are copied only when truthy (or, for summary/feedback, not undefined).
+        ...(result.workspaceFiles ? { workspaceFiles: result.workspaceFiles } : {}),
+        ...(result.result ? { result: result.result } : {}),
+        ...(result.usage ? { usage: result.usage } : {}),
+        ...(result.summary !== undefined ? { summary: result.summary } : {}),
+        ...(result.feedback !== undefined ? { feedback: result.feedback } : {}),
+        ...(result.error ? { error: result.error } : {}),
+    }, ok, result.error);
+}
+function normalizeRuntimeControlResultOutput(output, ok, fallbackError) { // Produces a runtime-control result from a loosely-typed `output` record, dropping fields that fail their shape checks.
+    const files = Array.isArray(output.files) // Non-array -> []; entries rejected by isSurfaceSnapshotFile are dropped.
+        ? output.files.filter(isSurfaceSnapshotFile)
+        : [];
+    const workspaceFiles = Array.isArray(output.workspaceFiles) // Stays undefined when not an array, so the key is omitted from the result.
+        ? output.workspaceFiles.filter(isSurfaceSnapshotFile)
+        : undefined;
+    const operationResults = Array.isArray(output.operationResults) // Non-array -> []; entries rejected by isWorkbenchAdapterOperationResult are dropped.
+        ? output.operationResults.filter(isWorkbenchAdapterOperationResult)
+        : [];
+    return {
+        ok: ok && output.ok !== false, // Succeeds only if the caller's `ok` holds and the output does not explicitly say ok: false.
+        files,
+        fileChanges: Array.isArray(output.fileChanges) // String entries only; when not an array, defaults to the paths of the accepted files.
+            ? output.fileChanges.filter((entry) => typeof entry === "string")
+            : files.map((file) => file.path),
+        operationResults,
+        ...(workspaceFiles ? { workspaceFiles } : {}),
+        ...(output.result && typeof output.result === "object" && !Array.isArray(output.result) // `result` and `usage` are kept only when they are non-array objects.
+            ? { result: output.result }
+            : {}),
+        ...(output.usage && typeof output.usage === "object" && !Array.isArray(output.usage)
+            ? { usage: output.usage }
+            : {}),
+        ...(typeof output.summary === "string" ? { summary: output.summary } : {}),
+        ...(output.feedback !== undefined && isJsonPayload(output.feedback) ? { feedback: output.feedback } : {}),
+        ...(typeof output.error === "string" ? { error: output.error } : fallbackError ? { error: fallbackError } : {}), // A string output.error wins; otherwise a truthy fallbackError is used; otherwise no error key.
+    };
+}
+function isWorkbenchAdapterOperationResult(value) { // Shape check: a non-array object with the v1 adapter-result protocol tag and one of the four known operation names.
+    if (!value || typeof value !== "object" || Array.isArray(value)) { // Rejects null/primitives and arrays.
+        return false;
+    }
+    const record = value;
+    return record.protocol === "workbench.adapter-result.v1" && // Only `protocol` and `operation` are inspected; other fields are not validated here.
+        (record.operation === "engine.resolve" ||
+            record.operation === "engine.run" ||
+            record.operation === "subject.run" ||
+            record.operation === "optimizer.improve");
+}
+function cloneSurfaceFiles(files) { // Returns a new array of shallow file copies whose `path` is passed through normalizeRelativePath.
+    return files.map((file) => ({ ...file, path: normalizeRelativePath(file.path) })); // Shallow spread: nested values are shared with the input objects.
+}
+function runtimeControlNonce() { // Random suffix used to build child job/execution ids.
+    return randomBytes(6).toString("hex"); // NOTE(review): assuming randomBytes is node:crypto's, this is 6 random bytes rendered as 12 hex characters — confirm the import.
+}
+function assertRuntimeControlScope(label, issues) { // Throws when `issues` is non-empty; returns undefined otherwise.
+    if (issues.length > 0) {
+        throw new Error(`${label} failed validation:\n${issues.join("\n")}`); // Message lists every issue, one per line, under the given label.
+    }
+}
 async function runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher) {
     const workload = createWorkbenchRunWorkload({
         job: args.job,
@@ -1462,13 +1983,13 @@ async function runHostedProtocolExecutionResult(args, execution, startedAt, capa
         engineCases: args.engineCases,
         traceFiles: args.traceFiles,
     });
-    const result = await runHostedCommandExecutionPhases(args, workload, [protocolPhaseForExecution(execution, args.adapterManifests)], startedAt, {
+    const result = await runHostedCommandExecutionSteps(args, workload, [protocolStepForExecution(execution, args.adapterManifests)], startedAt, {
         capability,
         eventPublisher,
     });
     return { workload, result };
 }
-async function runHostedCommandExecutionPhases(args, workload, phases, startedAt, options = {}) {
+async function runHostedCommandExecutionSteps(args, workload, steps, startedAt, options = {}) {
     const [{ execFile }, fs, os, path, { promisify }] = await Promise.all([
         importNodeModule(nodeBuiltin("child_process")),
         importNodeModule(nodeBuiltin("fs/promises")),
@@ -1489,9 +2010,22 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
     const workspace = await createRuntimeWorkspaceRoot(args, fs, os, path, "workbench-execution-sandbox-");
     try {
         await stageWorkbenchRunWorkload(workspace.root, workload);
+        if (options.workspaceFiles && options.workspaceFiles.length > 0) {
+            await stageInitialWorkspaceFiles(workspace.root, options.workspaceFiles);
+        }
+        if (options.outputFiles && options.outputFiles.length > 0) {
+            await writeSurfaceFiles(outputDir(workspace.root), options.outputFiles);
+        }
+        const execution = readWorkbenchExecutionSpec(workload.job);
+        const hostAdapterIds = new Set(steps.flatMap((step) => step.executor === "host"
+            ? [step.adapter?.use ?? execution.adapter.use]
+            : []));
+        const hostAdapterRoots = hostAdapterIds.size > 0
+            ? await materializeHostAdapterRoots(workspace.root, args.adapterFiles ?? [], hostAdapterIds)
+            : new Map();
         let exitCode = 0;
         let runtimeError;
-        const phaseResults = [];
+        const operationResults = [];
         try {
             if (!environmentVersion) {
                 throw new Error("environment is required for adapter command executions.");
@@ -1503,49 +2037,64 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
                     network: environmentVersion.spec.network,
                 }, null, 2)}\n`);
             }
-            const phaseTimeoutMs = environmentVersion
+            const stepTimeoutMs = environmentVersion
                 ? environmentVersionTimeoutMs(environmentVersion)
                 : 5 * 60 * 1000;
-            const execution = readWorkbenchExecutionSpec(workload.job);
-            for (const phase of phases) {
-                await resetHostedWorkloadPhaseOutput(workspace.root, phase);
-                if (phase.kind === "engine" && execution.purpose === "attempt") {
-                    await stageAttemptScoringInputs(workspace.root, workload);
+            const shouldRunSubjectPrepare = options.runSubjectPrepare ?? steps.some((step) => step.executor === "sandbox");
+            if (shouldRunSubjectPrepare) {
+                await runSubjectPrepareCommand({
+                    root: workspace.root,
+                    workload,
+                    execution,
+                    execFileAsync,
+                    timeoutMs: stepTimeoutMs,
+                    eventPublisher: options.eventPublisher,
+                });
+            }
+            let enginePrivateStaged = false;
+            for (const step of steps) {
+                if (step.kind === "engine" && !enginePrivateStaged) {
+                    await stageWorkbenchEnginePrivateFiles(workspace.root, workload);
+                    enginePrivateStaged = true;
                 }
-                const adapterRequestPath = await writeWorkbenchAdapterRequest(workspace.root, workload, execution, phase, adapterAuthRequestForPhase(args, phase.adapter?.use ?? execution.adapter.use), args.adapterManifests);
-                const phaseRole = phaseEventRole(phase);
-                await publishCommandPhaseEvent(options.eventPublisher, {
-                    phase: phase.label,
+                await resetHostedWorkloadStepOutput(workspace.root);
+                const adapterRequestPath = await writeWorkbenchAdapterRequest(workspace.root, workload, execution, step, adapterAuthRequestForStep(args, step.adapter?.use ?? execution.adapter.use), args.adapterManifests);
+                const stepRole = stepEventRole(step);
+                await publishCommandStepEvent(options.eventPublisher, {
+                    step: step.label,
                     status: "started",
-                    ...(phaseRole ? { role: phaseRole } : {}),
+                    ...(stepRole ? { role: stepRole } : {}),
                 });
                 try {
-                    if (!phase.command) {
-                        throw new Error(`Adapter phase ${phase.label} is missing a command.`);
+                    if (!step.command) {
+                        throw new Error(`Adapter step ${step.label} is missing a command.`);
                     }
-                    const command = createHostedWorkloadShellCommand(workspace.root, phase.command, phase.label, phase.okExitCodes);
+                    const adapterRoot = step.executor === "host"
+                        ? hostAdapterRoots.get(step.adapter?.use ?? execution.adapter.use)
+                        : undefined;
+                    const command = createHostedWorkloadShellCommand(workspace.root, step.command, step.label, step.okExitCodes);
                     await execFileAsync("sh", ["-c", command], {
-                        cwd: workspace.root,
-                        env: createHostedWorkloadPhaseEnv(workspace.root, adapterRequestPath, args.adapterAuthEnv),
+                        cwd: adapterRoot ?? workspace.root,
+                        env: createHostedWorkloadAdapterEnv(workspace.root, adapterRequestPath, args.adapterAuthEnv, adapterRoot ? { adapterRoot } : undefined, args.adapterRuntimeEnv),
                         maxBuffer: 10 * 1024 * 1024,
-                        timeout: phaseTimeoutMs,
+                        timeout: stepTimeoutMs,
                     });
-                    const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), phase.operation);
-                    assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${phase.adapter?.use ?? execution.adapter.use} ${phase.operation}`);
-                    phaseResults.push(operationResult);
-                    await publishCommandPhaseEvent(options.eventPublisher, {
-                        phase: phase.label,
+                    const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), step.operation);
+                    assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${step.adapter?.use ?? execution.adapter.use} ${step.operation}`);
+                    operationResults.push(operationResult);
+                    await publishCommandStepEvent(options.eventPublisher, {
+                        step: step.label,
                         status: "succeeded",
-                        ...(phaseRole ? { role: phaseRole } : {}),
+                        ...(stepRole ? { role: stepRole } : {}),
                     });
                 }
                 catch (error) {
-                    await publishCommandPhaseEvent(options.eventPublisher, {
-                        phase: phase.label,
+                    await publishCommandStepEvent(options.eventPublisher, {
+                        step: step.label,
                         status: "failed",
                         exitCode: readExitCode(error),
                         error: error instanceof Error ? error.message : String(error),
-                        ...(phaseRole ? { role: phaseRole } : {}),
+                        ...(stepRole ? { role: stepRole } : {}),
                     });
                     throw error;
                 }
@@ -1569,16 +2118,56 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
                 startedAt,
             });
         }
-        return await readWorkbenchRunWorkloadResult(workspace.root, workload, {
+        const result = await readWorkbenchRunWorkloadResult(workspace.root, workload, {
             exitCode,
             startedAt,
-            phaseResults,
+            operationResults,
         });
+        if (options.collectWorkspace) {
+            result.workspaceFiles = await readMutableWorkspaceSnapshotFiles(workspace.root);
+        }
+        return result;
     }
     finally {
         await workspace.cleanup();
     }
 }
+async function runSubjectPrepareCommand(args) { // Runs the spec's optional subject prepare command via `sh -c`, publishing started/succeeded/failed step events.
+    const command = args.workload.spec.subject.prepare?.command;
+    if (!command) { // No prepare command configured: nothing runs and no events are published.
+        return;
+    }
+    const role = args.execution.purpose === "improve" ? "optimizer" : "runner"; // Events are tagged "optimizer" for improve executions and "runner" for everything else.
+    await publishCommandStepEvent(args.eventPublisher, {
+        step: "subject_prepare",
+        status: "started",
+        role,
+    });
+    try {
+        const shellCommand = createHostedWorkloadShellCommand(args.root, command, "subject_prepare");
+        await args.execFileAsync("sh", ["-c", shellCommand], {
+            cwd: args.root,
+            env: createHostedWorkloadPrepareEnv(args.root), // Env is derived from the workspace root only; no adapter request or auth env is passed in.
+            maxBuffer: 10 * 1024 * 1024, // 10 MiB output buffer limit.
+            timeout: args.timeoutMs,
+        });
+        await publishCommandStepEvent(args.eventPublisher, {
+            step: "subject_prepare",
+            status: "succeeded",
+            role,
+        });
+    }
+    catch (error) { // Also entered if publishing the "succeeded" event throws, since that call sits inside the try.
+        await publishCommandStepEvent(args.eventPublisher, {
+            step: "subject_prepare",
+            status: "failed",
+            exitCode: readExitCode(error),
+            error: error instanceof Error ? error.message : String(error),
+            role,
+        });
+        throw new Error(`Subject prepare command failed: ${error instanceof Error ? error.message : String(error)}`); // Rethrown as a fresh Error carrying only the message; the original error object is not attached.
+    }
+}
 async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
     if (args.workspaceRoot) {
         await fs.mkdir(args.workspaceRoot, { recursive: true });
@@ -1614,19 +2203,22 @@ async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
         },
     };
 }
-function phaseEventRole(phase) {
-    if (phase.kind === "optimizer") {
+function stepEventRole(step) { // Maps a step's `kind` to the role reported on its command-step events.
+    if (step.kind === "optimizer") {
         return "optimizer";
     }
-    if (phase.kind === "runner") {
+    if (step.kind === "subject") { // "subject" steps are reported under the "runner" role.
         return "runner";
     }
-    if (phase.kind === "engine") {
+    if (step.kind === "engine") {
         return "engine";
     }
     return undefined; // Any other kind has no role; the step loop then omits the `role` field from its events.
 }
 function adapterOperationUsageSummary(result) {
+    if (hasExplicitUsageRole(result.usage)) {
+        return completeUsageSummary(result.usage);
+    }
     if (result.operation === "optimizer.improve") {
         return assignUsageRole("optimizer", result.usage);
     }
@@ -1638,11 +2230,16 @@ function adapterOperationUsageSummary(result) {
     }
     return result.usage;
 }
-function executionPurposeRole(purpose) {
-    if (purpose === "improve") {
-        return "optimizer";
-    }
-    return "runner";
+function attemptUsageSummary(workloadUsage, resultUsage) { // Merges an attempt's workload usage with the engine result's usage, using the latter only when needed.
+    const normalizedWorkloadUsage = completeUsageSummary(workloadUsage);
+    const legacyEngineUsage = normalizedWorkloadUsage?.engine // When the workload usage already has a truthy `engine` entry, the result-level usage is left out of the merge.
+        ? undefined
+        : assignUsageRole("engine", resultUsage);
+    return mergeUsageSummaries([normalizedWorkloadUsage, legacyEngineUsage]);
+}
+function hasExplicitUsageRole(usage) { // True when the completed usage summary has a truthy optimizer, runner or engine entry.
+    const normalized = completeUsageSummary(usage);
+    return Boolean(normalized?.optimizer || normalized?.runner || normalized?.engine); // Optional chaining tolerates completeUsageSummary returning null/undefined.
 }
 function createSubjectPatchFromResult(result, spec) {
     if (result.subjectPatch) {
@@ -1720,47 +2317,103 @@ export async function stageWorkbenchRunWorkload(root, workload) {
         fs
             .rm(runtimePrivateDir(root), { recursive: true, force: true })
             .catch(() => undefined),
-        fs
-            .rm(runtimeLogsDir(root), { recursive: true, force: true })
-            .catch(() => undefined),
     ]);
     await fs.mkdir(inputDir(root), { recursive: true });
     await fs.mkdir(outputDir(root), { recursive: true });
     if (purpose === "attempt") {
-        assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
         await fs.mkdir(subjectDir(root), { recursive: true });
         await fs.mkdir(caseDir(root), { recursive: true });
-        await fs.mkdir(runtimeLogsAgentDir(root), { recursive: true });
-        await fs.mkdir(runtimeLogsVerifierDir(root), { recursive: true });
         const engineCase = requireWorkloadEngineCase(workload, "Attempt staging");
         await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
-        await writeSurfaceFiles(caseDir(root), engineCaseSubjectVisibleFiles(engineCase));
-        await writeSurfaceFiles(root, workload.subjectFiles);
+        await writeSurfaceFiles(caseDir(root), engineCasePublicFiles(engineCase));
         return;
     }
     if (purpose === "improve") {
-        assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
         await fs.mkdir(subjectDir(root), { recursive: true });
         await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
-        await writeSurfaceFiles(root, workload.subjectFiles);
         await fs.mkdir(tracesDir(root), { recursive: true });
         await writeSurfaceFiles(tracesDir(root), workload.traceFiles);
     }
 }
-async function stageAttemptScoringInputs(root, workload) {
+async function stageWorkbenchEnginePrivateFiles(root, workload) {
+    if (readWorkloadExecutionPurpose(workload) !== "attempt") {
+        return;
+    }
     const fs = await importNodeModule(nodeBuiltin("fs/promises"));
-    const engineCase = requireWorkloadEngineCase(workload, "Attempt scoring");
-    await Promise.all([
-        fs
-            .rm(runtimeEnginePrivateDir(root), { recursive: true, force: true })
-            .catch(() => undefined),
-        fs
-            .rm(runtimeLogsVerifierDir(root), { recursive: true, force: true })
-            .catch(() => undefined),
-    ]);
     await fs.mkdir(runtimeEnginePrivateDir(root), { recursive: true });
-    await fs.mkdir(runtimeLogsVerifierDir(root), { recursive: true });
-    await writeSurfaceFiles(runtimeEnginePrivateDir(root), engineCaseEnginePrivateFiles(engineCase));
+    await writeSurfaceFiles(runtimeEnginePrivateDir(root), engineCasePrivateFiles(requireWorkloadEngineCase(workload, "Engine-private staging")));
+}
+async function stageInitialWorkspaceFiles(root, files) {
+    await writeSurfaceFiles(root, files.filter((file) => isMutableWorkspaceSnapshotPath(file.path)));
+}
+async function readMutableWorkspaceSnapshotFiles(root) {
+    return (await readSurfaceFiles(root))
+        .filter((file) => isMutableWorkspaceSnapshotPath(file.path))
+        .sort((left, right) => left.path.localeCompare(right.path));
+}
+function isMutableWorkspaceSnapshotPath(filePath) {
+    const normalized = normalizeRelativePath(filePath);
+    return Boolean(normalized &&
+        !normalized.startsWith("../") &&
+        normalized !== "input" &&
+        !normalized.startsWith("input/") &&
+        normalized !== "private" &&
+        !normalized.startsWith("private/") &&
+        normalized !== "output" &&
+        !normalized.startsWith("output/") &&
+        normalized !== ".workbench" &&
+        !normalized.startsWith(".workbench/"));
+}
+async function materializeHostAdapterRoots(root, adapterFiles, adapterIds) {
+    if (adapterFiles.length === 0 || adapterIds.size === 0) {
+        return new Map();
+    }
+    const fs = await importNodeModule(nodeBuiltin("fs/promises"));
+    const path = await importNodeModule(nodeBuiltin("path"));
+    const sourceRoots = hostAdapterSourceRoots(adapterFiles, adapterIds);
+    const roots = new Map();
+    for (const [adapterId, sourceRoot] of sourceRoots) {
+        const targetRoot = path.join(root, ".workbench", "adapters", adapterId);
+        const files = adapterFiles.flatMap((file) => {
+            const relativePath = adapterFilePathWithinRoot(file.path, sourceRoot);
+            return relativePath === null
+                ? []
+                : [{ ...file, path: relativePath }];
+        });
+        await fs.rm(targetRoot, { recursive: true, force: true }).catch(() => undefined);
+        await fs.mkdir(targetRoot, { recursive: true });
+        await writeSurfaceFiles(targetRoot, files);
+        roots.set(adapterId, await fs.realpath(targetRoot));
+    }
+    return roots;
+}
+function hostAdapterSourceRoots(adapterFiles, adapterIds) {
+    const roots = new Map();
+    for (const file of adapterFiles) {
+        const normalized = normalizeRelativePath(file.path);
+        if (!normalized.endsWith("workbench.adapter.yaml")) {
+            continue;
+        }
+        const manifest = parseWorkbenchAdapterManifest(file.content);
+        if (!adapterIds.has(manifest.id)) {
+            continue;
+        }
+        const sourceRoot = normalized === "workbench.adapter.yaml"
+            ? ""
+            : normalized.slice(0, -"workbench.adapter.yaml".length).replace(/\/+$/u, "");
+        roots.set(manifest.id, sourceRoot);
+    }
+    return roots;
+}
+function adapterFilePathWithinRoot(filePath, sourceRoot) {
+    const normalized = normalizeRelativePath(filePath);
+    if (!sourceRoot) {
+        return normalized;
+    }
+    if (!normalized.startsWith(`${sourceRoot}/`)) {
+        return null;
+    }
+    return normalized.slice(sourceRoot.length + 1);
 }
 async function readHostedRunFailureResult(root, workload, options) {
     const traceFiles = await readRuntimeTraceFiles(root, workload);
@@ -1788,16 +2441,16 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
     const primaryOperation = purpose === "improve"
         ? "optimizer.improve"
         : "engine.run";
-    const primaryResult = [...(options.phaseResults ?? [])]
+    const primaryResult = [...(options.operationResults ?? [])]
         .reverse()
         .find((result) => result.operation === primaryOperation);
     const resultPayload = jsonRecord(primaryResult?.value);
     const usage = mergeUsageSummaries([
         options.usage,
-        ...(options.phaseResults ?? []).map(adapterOperationUsageSummary),
+        ...(options.operationResults ?? []).map(adapterOperationUsageSummary),
     ]);
-    const metrics = normalizeRewardMetrics(resultPayload.metrics);
-    const cases = normalizeRewardCases(resultPayload.cases);
+    const metrics = normalizeResultMetrics(resultPayload.metrics);
+    const cases = normalizeResultCases(resultPayload.cases);
     const includeResultScoring = purpose === "attempt";
     const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
     const subjectPatch = purpose === "improve" ? primaryResult?.value : undefined;
@@ -1809,6 +2462,7 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
     return {
         files,
         fileChanges: declaredChanges,
+        ...(options.operationResults ? { operationResults: [...options.operationResults] } : {}),
         ...(subjectPatch ? { subjectPatch } : {}),
         ...(engineResult ? { result: engineResult } : {}),
         ...(includeResultScoring && metrics ? { metrics } : {}),
@@ -1835,10 +2489,10 @@ async function readRuntimeTraceFiles(root, workload) {
     const path = await importNodeModule(nodeBuiltin("path"));
     const traceRoot = path.join(outputDir(root), ".workbench", "traces", workload.job.id);
     const purpose = readWorkloadExecutionPurpose(workload);
-    const outputTraceRoot = workbenchTracePhaseDirectory({
+    const outputTraceRoot = workbenchTraceExecutionDirectory({
         sequence: 1,
         runId: workload.job.runId,
-        phase: purpose,
+        purpose,
     });
     return (await readSurfaceFiles(traceRoot)).map((file) => ({
         ...file,
@@ -1868,13 +2522,13 @@ function createHostedWorkloadShellCommand(root, command, prefix = "", okExitCode
         'exit "$status"',
     ].join("; ");
 }
-async function resetHostedWorkloadPhaseOutput(root, _phase) {
+async function resetHostedWorkloadStepOutput(root) {
     const fs = await importNodeModule(nodeBuiltin("fs/promises"));
     await fs
         .rm(workbenchAdapterOperationResultPath(outputDir(root)), { force: true })
         .catch(() => undefined);
 }
-async function writeWorkbenchAdapterRequest(root, workload, execution, phase, auth, manifests) {
+async function writeWorkbenchAdapterRequest(root, workload, execution, step, auth, manifests) {
     const [fs, path] = await Promise.all([
         importNodeModule(nodeBuiltin("fs/promises")),
         importNodeModule(nodeBuiltin("path")),
@@ -1882,13 +2536,13 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
     const requestPath = path.join(root, ".workbench", "request.json");
     await fs.mkdir(path.dirname(requestPath), { recursive: true });
     const casePrompt = workload.engineCaseSpec?.prompt;
-    const adapter = phase.adapter ?? execution.adapter;
+    const adapter = step.adapter ?? execution.adapter;
     const subjectCommand = adapterProtocolCommandSpec(workload.spec.run, "subject.run", manifests).command;
     await fs.writeFile(requestPath, `${JSON.stringify({
         protocol: "workbench.adapter.v3",
         id: execution.id,
         jobId: workload.job.id,
-        operation: phase.operation,
+        operation: step.operation,
         invocation: {
             use: adapter.use,
             with: adapterConfigRecord(adapter, manifests),
@@ -1903,6 +2557,7 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
             subject: {
                 id: workload.subjectId,
                 path: workload.spec.subject.files.path,
+                ...(workload.spec.subject.prepare ? { prepare: { ...workload.spec.subject.prepare } } : {}),
                 run: {
                     ...workload.spec.run,
                     command: subjectCommand,
@@ -1923,14 +2578,12 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
         },
         paths: {
             workspace: root,
-            cwd: root,
             output: outputDir(root),
             result: workbenchAdapterOperationResultPath(outputDir(root)),
             subject: subjectDir(root),
             ...(workload.engineCaseSpec ? { case: caseDir(root) } : {}),
             traces: tracesDir(root),
-            ...(phase.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
-            logs: runtimeLogsDir(root),
+            ...(step.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
         },
     }, null, 2)}\n`);
     return requestPath;
@@ -1945,7 +2598,29 @@ function requireOptimizerEdits(spec) {
     }
     return edits;
 }
-function createHostedWorkloadPhaseEnv(root, adapterRequestPath, adapterEnv = {}) {
+function createHostedWorkloadAdapterEnv(root, adapterRequestPath, adapterEnv = {}, options = {}, runtimeEnv = {}) {
+    const env = createHostedWorkloadBaseEnv();
+    env.WORKBENCH_ADAPTER_REQUEST = adapterRequestPath;
+    env.WORKBENCH_OUTPUT = outputDir(root);
+    env.WORKBENCH_RESULT = workbenchAdapterOperationResultPath(outputDir(root));
+    if (options.adapterRoot) {
+        env.WORKBENCH_ADAPTER_ROOT = options.adapterRoot;
+        env.WORKBENCH_WORKSPACE_ROOT = root;
+        env.PATH = [
+            `${options.adapterRoot}/node_modules/.bin`,
+            env.PATH,
+        ].filter(Boolean).join(":");
+    }
+    Object.assign(env, adapterEnv);
+    Object.assign(env, runtimeEnv);
+    return env;
+}
+function createHostedWorkloadPrepareEnv(root) {
+    const env = createHostedWorkloadBaseEnv();
+    env.WORKBENCH_OUTPUT = outputDir(root);
+    return env;
+}
+function createHostedWorkloadBaseEnv() {
     const env = {};
     for (const [key, value] of Object.entries(process.env)) {
         if (typeof value === "string") {
@@ -1957,20 +2632,52 @@ function createHostedWorkloadPhaseEnv(root, adapterRequestPath, adapterEnv = {})
             delete env[key];
         }
     }
-    const runtimeBins = [
+    const runtimeBins = uniquePathEntries([
+        ...nodeModuleBinDirsForAncestors(process.cwd()),
+        ...nodeModuleBinDirsForAncestors(path.dirname(fileURLToPath(import.meta.url))),
+        "/app/node_modules/.bin",
         "/workbench-runtime/node_modules/.bin",
         "/workbench-runtime/products/workbench/node_modules/.bin",
-    ].join(":");
-    const systemBins = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin";
-    env.PATH = process.env.PATH
-        ? `${systemBins}:${runtimeBins}:${process.env.PATH}`
-        : `${systemBins}:${runtimeBins}`;
-    env.WORKBENCH_ADAPTER_REQUEST = adapterRequestPath;
-    env.WORKBENCH_OUTPUT = outputDir(root);
-    env.WORKBENCH_RESULT = workbenchAdapterOperationResultPath(outputDir(root));
-    Object.assign(env, adapterEnv);
+    ]);
+    env.PATH = uniquePathEntries([
+        path.dirname(process.execPath),
+        "/usr/local/sbin",
+        "/usr/local/bin",
+        "/usr/sbin",
+        "/usr/bin",
+        "/sbin",
+        "/bin",
+        ...runtimeBins,
+        ...(process.env.PATH ? process.env.PATH.split(path.delimiter) : []),
+    ]).join(path.delimiter);
     return env;
 }
+function nodeModuleBinDirsForAncestors(start) {
+    const dirs = [];
+    let current = path.resolve(start);
+    for (let depth = 0; depth < 12; depth += 1) {
+        dirs.push(path.join(current, "node_modules", ".bin"));
+        const parent = path.dirname(current);
+        if (parent === current) {
+            break;
+        }
+        current = parent;
+    }
+    return dirs;
+}
+function uniquePathEntries(entries) {
+    const seen = new Set();
+    const output = [];
+    for (const entry of entries) {
+        const trimmed = entry.trim();
+        if (!trimmed || seen.has(trimmed)) {
+            continue;
+        }
+        seen.add(trimmed);
+        output.push(trimmed);
+    }
+    return output;
+}
 function readWorkloadExecutionPurpose(workload) {
     const purpose = workbenchExecutionPurpose(workload.job);
     if (purpose === "improve" || purpose === "attempt") {
@@ -2005,35 +2712,6 @@ function runtimePrivateDir(root) {
 function runtimeEnginePrivateDir(root) {
     return `${runtimePrivateDir(root)}/engine`;
 }
-function runtimeLogsDir(root) {
-    return `${root}/logs`;
-}
-function runtimeLogsAgentDir(root) {
-    return `${runtimeLogsDir(root)}/agent`;
-}
-function runtimeLogsVerifierDir(root) {
-    return `${runtimeLogsDir(root)}/verifier`;
-}
-function assertMutableWorkspaceFiles(files, label) {
-    const reserved = files
-        .map((file) => normalizeRelativePath(file.path))
-        .filter(isRuntimeReservedWorkspacePath);
-    if (reserved.length > 0) {
-        throw new Error(`${label} cannot target runtime-reserved workspace paths: ${reserved.join(", ")}.`);
-    }
-}
-function isRuntimeReservedWorkspacePath(normalizedPath) {
-    return normalizedPath === ".workbench" ||
-        normalizedPath.startsWith(".workbench/") ||
-        normalizedPath === "input" ||
-        normalizedPath.startsWith("input/") ||
-        normalizedPath === "output" ||
-        normalizedPath.startsWith("output/") ||
-        normalizedPath === "logs" ||
-        normalizedPath.startsWith("logs/") ||
-        normalizedPath === "private" ||
-        normalizedPath.startsWith("private/");
-}
 async function writeSurfaceFiles(root, files) {
     const fs = await importNodeModule(nodeBuiltin("fs/promises"));
     const path = await importNodeModule(nodeBuiltin("path"));
@@ -2097,7 +2775,7 @@ function encodeSurfaceSnapshotContent(body, utf8Decoder) {
         };
     }
 }
-function normalizeRewardMetrics(value) {
+function normalizeResultMetrics(value) {
     if (!value || typeof value !== "object" || Array.isArray(value)) {
         return undefined;
     }
@@ -2109,7 +2787,7 @@ function normalizeRewardMetrics(value) {
     }
     return Object.keys(metrics).length > 0 ? metrics : undefined;
 }
-function normalizeRewardCases(value) {
+function normalizeResultCases(value) {
     if (!Array.isArray(value)) {
         return undefined;
     }
@@ -2122,7 +2800,7 @@ function normalizeRewardCases(value) {
         if (!id) {
             return [];
         }
-        const metrics = normalizeRewardMetrics(record.metrics) ?? {};
+        const metrics = normalizeResultMetrics(record.metrics) ?? {};
         const status = record.status === "completed" || record.status === "error"
             ? record.status
             : undefined;
@@ -2146,9 +2824,7 @@ function normalizeRewardCases(value) {
                     : undefined;
                 const pass = typeof criterionRecord.pass === "boolean"
                     ? criterionRecord.pass
-                    : score !== undefined
-                        ? score >= 0.5
-                        : undefined;
+                    : undefined;
                 if (!criterionId || score === undefined || pass === undefined) {
                     return [];
                 }
@@ -2261,13 +2937,13 @@ function evaluateSample(args) {
     if (typeof sampleScore !== "number" || !Number.isFinite(sampleScore)) {
         throw new Error("Evaluation sample requires an engine result with a finite numeric score.");
     }
-    const cases = args.workload.cases?.length ? args.workload.cases : undefined;
     const metrics = args.workload.metrics ?? {
         score: sampleScore,
     };
     if (metrics.score === undefined) {
         metrics.score = sampleScore;
     }
+    const cases = args.workload.cases?.length ? args.workload.cases : undefined;
     const feedback = {
         ...(args.workload.summary !== undefined
             ? { summary: args.workload.summary }
@@ -2295,7 +2971,7 @@ function evaluateSample(args) {
         feedback,
     };
 }
-function normalizeSampleJobOutput(value, fallbackFiles = []) {
+function normalizeSampleJobOutput(value) {
     if (!value || typeof value !== "object" || Array.isArray(value)) {
         return null;
     }
@@ -2314,9 +2990,6 @@ function normalizeSampleJobOutput(value, fallbackFiles = []) {
         !Number.isFinite(record.attemptIndex)) {
         return null;
     }
-    const sampleFiles = files.length > 0
-        ? files
-        : fallbackFiles.map((file) => ({ ...file }));
     return {
         subjectId: record.subjectId,
         attemptIndex: record.attemptIndex,
@@ -2324,10 +2997,10 @@ function normalizeSampleJobOutput(value, fallbackFiles = []) {
         fileChanges: Array.isArray(record.fileChanges)
             ? record.fileChanges.filter((entry) => typeof entry === "string")
             : [],
-        files: sampleFiles,
+        files,
         traces: Array.isArray(record.traces)
             ? record.traces.filter((entry) => typeof entry === "string")
-            : traceFilePaths(sampleFiles),
+            : traceFilePaths(files),
     };
 }
 function normalizeEvaluationSampleOutputs(args) {
@@ -2498,8 +3171,16 @@ function compareSampleOutputs(left, right) {
     }
     return left.sample.id.localeCompare(right.sample.id);
 }
-function createEvaluationRecord(subjectId, rawSamples) {
-    const samples = mergeEvaluationSampleRecords(rawSamples);
+function createEvaluationRecord(subjectId, subjectName, rawSamples) {
+    const samples = mergeEvaluationSampleRecords(rawSamples).map((sample) => subjectName
+        ? {
+            ...sample,
+            subject: {
+                ...sample.subject,
+                label: subjectName,
+            },
+        }
+        : sample);
     const startedAt = minTimestamp(samples.flatMap((sample) => (sample.startedAt ? [sample.startedAt] : [])));
     const finishedAt = maxTimestamp(samples.flatMap((sample) => (sample.finishedAt ? [sample.finishedAt] : [])));
     const durationValues = samples.flatMap((sample) => typeof sample.durationMs === "number" ? [sample.durationMs] : []);
@@ -2513,6 +3194,7 @@ function createEvaluationRecord(subjectId, rawSamples) {
         subject: {
             id: subjectId,
             kind: "subject",
+            ...(subjectName ? { label: subjectName } : {}),
         },
         status: samples.length > 0 && completedSampleCount === samples.length
             ? "completed"
@@ -2533,6 +3215,10 @@ function createEvaluationRecord(subjectId, rawSamples) {
         samples,
     };
 }
+function normalizedSubjectDisplayName(value) {
+    const normalized = value?.trim();
+    return normalized ? normalized : null;
+}
 function aggregateSampleMetrics(samples) {
     const metricNames = new Set(samples.flatMap((sample) => Object.keys(sample.metrics ?? {})));
     if (metricNames.size === 0) {
@@ -2563,14 +3249,14 @@ function mergeEvaluationSampleRecords(samples) {
 function mergeEvaluationSampleGroup(group) {
     const first = group[0];
     if (group.length === 1) {
-        return normalizeSingleCaseDurations(first);
+        return first;
     }
     const startedAt = minTimestamp(group.flatMap((sample) => (sample.startedAt ? [sample.startedAt] : [])));
     const finishedAt = maxTimestamp(group.flatMap((sample) => (sample.finishedAt ? [sample.finishedAt] : [])));
     const durationMs = startedAt && finishedAt
         ? Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt))
         : undefined;
-    const cases = group.flatMap((sample) => normalizeCaseDurations(sample));
+    const cases = group.flatMap((sample) => sample.cases ?? []);
     const metrics = aggregateSampleGroupMetrics(group);
     const usage = mergeUsageSummaries(group.map((sample) => sample.usage));
     const errors = group.flatMap((sample) => sample.error ? [sample.error] : []);
@@ -2588,22 +3274,6 @@ function mergeEvaluationSampleGroup(group) {
         ...(cases.length > 0 ? { cases } : {}),
     };
 }
-function normalizeSingleCaseDurations(sample) {
-    if (!sample.cases) {
-        return sample;
-    }
-    const cases = normalizeCaseDurations(sample);
-    return cases.length === sample.cases.length
-        ? { ...sample, cases }
-        : sample;
-}
-function normalizeCaseDurations(sample) {
-    return (sample.cases ?? []).map((caseResult) => (typeof caseResult.durationMs === "number" ||
-        sample.cases?.length !== 1 ||
-        typeof sample.durationMs !== "number"
-        ? caseResult
-        : { ...caseResult, durationMs: sample.durationMs }));
-}
 function aggregateSampleGroupMetrics(group) {
     const metricNames = new Set(group.flatMap((sample) => Object.keys(sample.metrics ?? {})));
     if (metricNames.size === 0) {