@workbench-ai/workbench-core 0.0.46 → 0.0.47

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39)
  1. package/dist/execution-events.d.ts +2 -2
  2. package/dist/execution-events.d.ts.map +1 -1
  3. package/dist/execution-events.js +3 -3
  4. package/dist/{execution-phases.d.ts → execution-evidence.d.ts} +8 -7
  5. package/dist/execution-evidence.d.ts.map +1 -0
  6. package/dist/{execution-phases.js → execution-evidence.js} +91 -51
  7. package/dist/execution-graph.js +1 -2
  8. package/dist/execution-jobs.js +1 -1
  9. package/dist/execution-outputs.d.ts.map +1 -1
  10. package/dist/execution-outputs.js +5 -10
  11. package/dist/execution-runtime-types.d.ts +7 -3
  12. package/dist/execution-runtime-types.d.ts.map +1 -1
  13. package/dist/execution-traces.d.ts +11 -1
  14. package/dist/execution-traces.d.ts.map +1 -1
  15. package/dist/execution-traces.js +305 -2
  16. package/dist/generic-spec.d.ts +8 -3
  17. package/dist/generic-spec.d.ts.map +1 -1
  18. package/dist/generic-spec.js +26 -37
  19. package/dist/index.d.ts +22 -11
  20. package/dist/index.d.ts.map +1 -1
  21. package/dist/index.js +868 -214
  22. package/dist/runtime-dockerfile.d.ts +14 -0
  23. package/dist/runtime-dockerfile.d.ts.map +1 -0
  24. package/dist/runtime-dockerfile.js +65 -0
  25. package/dist/sandbox-backends/docker.d.ts.map +1 -1
  26. package/dist/sandbox-backends/docker.js +9 -12
  27. package/dist/sandbox-backends/index.d.ts.map +1 -1
  28. package/dist/sandbox-backends/index.js +2 -1
  29. package/dist/sandbox-inputs.d.ts.map +1 -1
  30. package/dist/sandbox-inputs.js +1 -0
  31. package/dist/sandbox-plane.d.ts +1 -0
  32. package/dist/sandbox-plane.d.ts.map +1 -1
  33. package/dist/sandbox-plane.js +12 -22
  34. package/dist/trace-files.d.ts +2 -2
  35. package/dist/trace-files.d.ts.map +1 -1
  36. package/dist/trace-files.js +4 -4
  37. package/package.json +3 -3
  38. package/worker/sandbox-adapter-runner.cjs +22 -13
  39. package/dist/execution-phases.d.ts.map +0 -1
package/dist/index.js CHANGED
@@ -1,28 +1,30 @@
1
- import { createHash } from "node:crypto";
1
+ import { createHash, randomBytes } from "node:crypto";
2
2
  import os from "node:os";
3
3
  import path from "node:path";
4
+ import { fileURLToPath } from "node:url";
4
5
  import YAML from "yaml";
5
- import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
6
- import { BENCHMARK_SPEC_FILE, engineCaseEnginePrivateFiles, engineCaseFilesForRuntimeInput, engineCaseSubjectVisibleFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchSubjectManifestPath, } from "./generic-spec.js";
6
+ import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, parseWorkbenchAdapterManifest, readWorkbenchAdapterOperationResult, WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV, WORKBENCH_RUNTIME_CONTROL_URL_ENV, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
7
+ import { BENCHMARK_SPEC_FILE, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchSubjectManifestPath, } from "./generic-spec.js";
7
8
  import { attachSandboxMetadataToJob, createWorkbenchSandboxFileStore, isSurfaceSnapshotFile, readWorkbenchExecutionSpec, } from "./sandbox-inputs.js";
8
- import { asRuntimeRecord, importNodeModule, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
9
- import { executeValidatedSandboxExecution, } from "./sandbox-plane.js";
9
+ import { asRuntimeRecord, importNodeModule, isJsonPayload, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
10
+ import { createWorkbenchExecutionCapability, createWorkbenchSandboxAllocation, collectExecutionCapabilityScopeIssues, collectSandboxAllocationScopeIssues, collectSandboxHandleScopeIssues, assertSandboxBackendSupportsNetworkPolicy, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
10
11
  import { createSandboxBackendPlaneForProvider, } from "./sandbox-backends/index.js";
11
12
  import { applyWorkbenchSubjectPatch } from "./subject-patch.js";
12
13
  import { assignUsageRole, completeUsageSummary, mergeUsageSummaries, normalizeUsageSummary, usageStats, } from "./execution-usage.js";
13
- import { traceFilePaths, workbenchTracePhaseDirectory, } from "./trace-files.js";
14
+ import { traceFilePaths, workbenchTraceExecutionDirectory, } from "./trace-files.js";
14
15
  import { engineCaseForCase, } from "./execution-jobs.js";
15
- import { createWorkbenchExecutionEventPublisher, publishCommandPhaseEvent, } from "./execution-events.js";
16
- import { readWorkbenchExecutionPurpose } from "./execution-phases.js";
16
+ import { createWorkbenchExecutionEventPublisher, publishCommandStepEvent, } from "./execution-events.js";
17
+ import { readWorkbenchExecutionPurpose } from "./execution-evidence.js";
17
18
  import { adapterAuthEnv, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
18
- export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCaseEnginePrivateFiles, engineCaseFilesForRuntimeInput, engineCaseSubjectVisibleFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchSubjectManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
19
- export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
19
+ export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchSubjectManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, runtimeNetwork, runtimeResources, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
20
+ export { composeRuntimeDockerfileWithAdapterInstallers, } from "./runtime-dockerfile.js";
21
+ export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
20
22
  export { adapterAuthEnv, createWorkbenchAdapterAuthBundle, defaultWorkbenchAdapterAuthStoreRoot, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, parseWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
21
23
  export { asRuntimeRecord, importNodeModule, nodeBuiltin, normalizeWorkbenchWorkerId, normalizeRuntimeRegistry, quoteShellArg, resolveDockerRuntimeImageRef, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
22
24
  export { assignUsageRole, extractExecutionUsageFromTrace, mergeUsageSummaries, } from "./execution-usage.js";
23
25
  export { createWorkbenchProgressStdoutParser, publishWorkbenchProgressStdoutEnvelope, } from "./execution-events.js";
24
26
  export { resolveSandboxTemplateImage, } from "./sandbox-backends/template-images.js";
25
- export { readOutputTraceFiles, workbenchTracePhaseDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
27
+ export { readOutputTraceFiles, workbenchTraceExecutionDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
26
28
  export { assertWorkbenchAdapterOperationSupport, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterOperationIssues, collectWorkbenchAdapterOperationRequirements, ensureWorkbenchAdapterOutputDir, WORKBENCH_ADAPTER_RESULT_FILE, normalizeWorkbenchAdapterOperationRequest, normalizeWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, writeWorkbenchAdapterOperationResult, } from "@workbench-ai/workbench-protocol";
27
29
  export { applyWorkbenchSubjectPatch, } from "./subject-patch.js";
28
30
  export { createWorkbenchSandboxFileStore, createSandboxAdapterRequest, executionResultFromCompletedSandboxJob, materializeWorkbenchSandboxInput, readWorkbenchExecutionSpec, sanitizeWorkbenchExecutionJobForSandbox, } from "./sandbox-inputs.js";
@@ -31,8 +33,8 @@ export { createBaselineSubjectExecution, createBaselineSubjectJob, createWorkben
31
33
  export { addCapacity, capacityFits, runWorkbenchExecutionDag, subtractCapacity, workbenchJobDependencies, workbenchJobHostCost, workbenchJobResources, } from "./execution-scheduler.js";
32
34
  export { assertWorkbenchExecutionIsolation, collectWorkbenchExecutionIsolationIssues, validateWorkbenchExecutionOutputPayloads, } from "./execution-outputs.js";
33
35
  export { collectSandboxAllocationScopeIssues, collectExecutionCapabilityScopeIssues, collectSandboxHandleScopeIssues, createWorkbenchSandboxAllocation, createWorkbenchSandboxExecutionMetadata, createWorkbenchExecutionCapability, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
34
- export { buildSubjectCasePhaseRefs, buildWorkbenchTracePhases, isWorkbenchPhaseActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-phases.js";
35
- export { finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, } from "./execution-traces.js";
36
+ export { buildSubjectCaseExecutionRefs, buildWorkbenchExecutionEvidence, isWorkbenchExecutionActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-evidence.js";
37
+ export { buildWorkbenchTraceSessionsFromFiles, combineWorkbenchTraceSessions, finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, readWorkbenchExecutionTraceFiles, traceSessionLabel, } from "./execution-traces.js";
36
38
  export { DOCKER_SANDBOX_BACKEND, assertSandboxHostHealthForProvider, createDockerSandboxBackendDescriptor, createDockerSandboxPlane, resolveWorkbenchSandboxProviderName, sandboxProviderAdmissionForResources, sandboxProviderDefaultMaxConcurrentJobs, sandboxProviderLeaseScope, sandboxHostHealthExpectationForProvider, } from "./sandbox-backends/index.js";
37
39
  export const DEFAULT_ENVIRONMENT_VERSIONS = [
38
40
  {
@@ -142,7 +144,7 @@ export const DEFAULT_ENVIRONMENTS = [
142
144
  {
143
145
  id: "env_libreoffice_agent",
144
146
  name: "LibreOffice + Agent",
145
- description: "Agent runtime with soffice and Python libraries for spreadsheet-heavy skill and rubric evaluations.",
147
+ description: "Agent runtime with soffice and Python libraries for spreadsheet-heavy evaluations.",
146
148
  currentVersionId: "envv_libreoffice_agent",
147
149
  builtIn: true,
148
150
  createdAt: "2026-04-29T00:00:00.000Z",
@@ -278,30 +280,36 @@ function adapterProtocolCommandSpec(adapter, operation, manifests = []) {
278
280
  return {
279
281
  use: "command",
280
282
  command: manifest ? workbenchAdapterOperationCommand(manifest, operation) : adapterCommandName(adapter.use),
283
+ executor: manifest ? workbenchAdapterOperationExecutor(manifest, operation) : "sandbox",
281
284
  };
282
285
  }
283
- function protocolPhaseForExecution(execution, manifests) {
284
- const role = executionPurposeRole(execution.purpose);
285
- const operation = execution.purpose === "improve" ? "optimizer.improve" : "subject.run";
286
+ function protocolStepForExecution(execution, manifests) {
287
+ if (execution.purpose !== "improve") {
288
+ throw new Error(`Protocol execution step only supports improve executions, not ${execution.purpose}.`);
289
+ }
290
+ const operation = "optimizer.improve";
286
291
  const command = adapterProtocolCommandSpec(execution.adapter, operation, manifests);
287
292
  return {
288
- kind: role,
293
+ kind: "optimizer",
289
294
  label: execution.purpose,
290
295
  operation,
296
+ executor: command.executor,
291
297
  adapter: execution.adapter,
292
298
  command: command.command,
293
299
  };
294
300
  }
295
- function attemptPhasesForExecution(execution, spec, manifests) {
301
+ function attemptStepsForExecution(execution, spec, manifests) {
296
302
  void spec;
297
- const enginePhase = {
303
+ const command = adapterProtocolCommandSpec(execution.adapter, "engine.run", manifests);
304
+ const engineStep = {
298
305
  kind: "engine",
299
306
  label: "engine",
300
307
  operation: "engine.run",
308
+ executor: command.executor,
301
309
  adapter: execution.adapter,
302
- command: adapterProtocolCommandSpec(execution.adapter, "engine.run", manifests).command,
310
+ command: command.command,
303
311
  };
304
- return [enginePhase];
312
+ return [engineStep];
305
313
  }
306
314
  function adapterConfigRecord(adapter, manifests = []) {
307
315
  const config = cloneJsonRecord(jsonRecord(adapter.with));
@@ -411,7 +419,10 @@ export function materializeWorkbenchRunResult(args) {
411
419
  .sort((left, right) => compareSampleOutputs(left.output, right.output));
412
420
  const outputJobIds = new Set(outputs.flatMap(({ jobs }) => jobs.map((job) => job.id)));
413
421
  const completedSampleKeys = new Set(outputs
414
- .map(({ output }) => evaluationSampleGroupKeyFromOutput(output))
422
+ .flatMap(({ jobs, output }) => [
423
+ evaluationSampleGroupKeyFromOutput(output),
424
+ ...jobs.map(evaluationSampleGroupKeyFromJob),
425
+ ])
415
426
  .filter((key) => key !== null));
416
427
  const errorSampleJobs = [
417
428
  ...subjectJobs.filter((job) => job.status === "failed"),
@@ -472,7 +483,7 @@ export function materializeWorkbenchRunResult(args) {
472
483
  meta,
473
484
  };
474
485
  subjects.push(record);
475
- evaluations.push(createEvaluationResultRecord({
486
+ evaluations.push(createEvaluationScorecard({
476
487
  runId: args.runId,
477
488
  benchmarkFingerprint: args.benchmarkFingerprint,
478
489
  createdAt: args.startedAt,
@@ -528,6 +539,8 @@ function materializedSubjectFingerprint(spec, files) {
528
539
  hash.update("workbench-subject-v1\0");
529
540
  hash.update("materialized\0runner\0");
530
541
  hash.update(JSON.stringify(spec.run));
542
+ hash.update("prepare");
543
+ hash.update(JSON.stringify(spec.subject.prepare ?? null));
531
544
  for (const file of filterSubjectSourceFiles(files).slice().sort((left, right) => left.path.localeCompare(right.path))) {
532
545
  hash.update("\0file\0");
533
546
  hash.update(file.path);
@@ -547,10 +560,10 @@ function materializedSubjectFiles(args) {
547
560
  }
548
561
  return [...byPath.values()].sort((left, right) => left.path.localeCompare(right.path));
549
562
  }
550
- function createEvaluationResultRecord(args) {
563
+ function createEvaluationScorecard(args) {
551
564
  const evaluation = args.evaluation;
552
565
  return {
553
- id: evaluationResultId(args.runId, args.subject.id),
566
+ id: evaluationScorecardId(args.runId, args.subject.id),
554
567
  runId: args.runId,
555
568
  benchmarkFingerprint: args.benchmarkFingerprint,
556
569
  subjectFingerprint: args.subject.subjectFingerprint,
@@ -568,7 +581,7 @@ function createEvaluationResultRecord(args) {
568
581
  evaluation,
569
582
  };
570
583
  }
571
- function evaluationResultId(runId, subjectId) {
584
+ export function evaluationScorecardId(runId, subjectId) {
572
585
  const runPart = runId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
573
586
  const subjectPart = subjectId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
574
587
  return `eval_${runPart}_${subjectPart}`;
@@ -584,7 +597,7 @@ export function isWorkbenchInternalOutputPath(filePath) {
584
597
  normalized === "sandbox-environment.json" ||
585
598
  normalized === "sandbox_error.log" ||
586
599
  normalized === "exit_code" ||
587
- /^[a-z-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
600
+ /^[a-z_-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
588
601
  }
589
602
  export function createSubjectRevisionTraceInputFiles(args) {
590
603
  const files = [];
@@ -620,6 +633,23 @@ export function createSubjectRevisionTraceInputFiles(args) {
620
633
  }, null, 2)}\n`));
621
634
  return dedupeSurfaceFiles(files);
622
635
  }
636
+ export function createSubjectEvaluationTraceInputFiles(args) {
637
+ const subject = args.subject;
638
+ if (!subject?.eval && !subject?.metrics) {
639
+ return [];
640
+ }
641
+ const filePath = normalizeRelativePath(args.path ?? `base-subject/${subject.id}/evaluation.json`);
642
+ const payload = {
643
+ kind: "subject_evaluation",
644
+ subjectId: subject.id,
645
+ status: subject.status,
646
+ metrics: subject.metrics ?? null,
647
+ fileChanges: subject.fileChanges,
648
+ eval: subject.eval ?? null,
649
+ prompt: subject.prompt ?? null,
650
+ };
651
+ return [textSurfaceFile(filePath, `${JSON.stringify(payload, null, 2)}\n`)];
652
+ }
623
653
  function isTerminalExecutionJob(job) {
624
654
  return job.kind === "execute" && (job.status === "succeeded" ||
625
655
  job.status === "failed" ||
@@ -866,16 +896,14 @@ export function createSubjectFilePreview(args) {
866
896
  };
867
897
  }
868
898
  export function createCaseReview(args) {
869
- const preferredSampleIndex = uniquePhaseSampleIndex(args.phases ?? []);
870
- const sampleMatchesCase = (sample) => sample.id === args.caseId ||
871
- sample.id.startsWith(`${args.caseId}__`) ||
872
- (sample.cases ?? []).some((entry) => entry.id === args.caseId || entry.id.startsWith(`${args.caseId}__`));
899
+ const preferredSampleIndex = uniqueExecutionSampleIndex(args.executions ?? []);
900
+ const sampleMatchesCase = (sample) => (sample.cases ?? []).some((entry) => entry.id === args.caseId);
873
901
  const samples = args.subject.eval?.samples ?? [];
874
902
  const sampleResult = samples.find((sample) => typeof preferredSampleIndex === "number" &&
875
903
  sample.index === preferredSampleIndex &&
876
904
  sampleMatchesCase(sample)) ?? samples.find(sampleMatchesCase);
877
- const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId || entry.id.startsWith(`${args.caseId}__`));
878
- if (!sampleResult && (args.phases?.length ?? 0) > 0) {
905
+ const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId);
906
+ if (!sampleResult && (args.executions?.length ?? 0) > 0) {
879
907
  return {
880
908
  subjectId: args.subject.id,
881
909
  caseId: args.caseId,
@@ -884,7 +912,7 @@ export function createCaseReview(args) {
884
912
  ? { sampleIndex: preferredSampleIndex }
885
913
  : {}),
886
914
  metrics: {},
887
- phases: args.phases ?? [],
915
+ executions: args.executions ?? [],
888
916
  criteria_results: [],
889
917
  };
890
918
  }
@@ -893,28 +921,21 @@ export function createCaseReview(args) {
893
921
  }
894
922
  const durationMs = typeof caseResult?.durationMs === "number"
895
923
  ? caseResult.durationMs
896
- : sampleResult?.cases?.length === 1 &&
897
- typeof sampleResult.durationMs === "number"
898
- ? sampleResult.durationMs
899
- : !caseResult && typeof sampleResult.durationMs === "number"
900
- ? sampleResult.durationMs
901
- : undefined;
902
- const sampleStatus = sampleResult.status === "planned" ? undefined : sampleResult.status;
903
- const status = caseResult?.status ?? sampleStatus;
924
+ : undefined;
904
925
  return {
905
926
  subjectId: args.subject.id,
906
- caseId: caseResult?.id ?? sampleResult.id,
927
+ caseId: caseResult?.id ?? args.caseId,
907
928
  caseLabel: caseResult?.label ?? args.caseId,
908
929
  sampleId: sampleResult.id,
909
930
  sampleIndex: sampleResult.index,
910
- ...(status ? { status } : {}),
911
- metrics: caseResult?.metrics ?? sampleResult.metrics ?? {},
931
+ ...(caseResult?.status ? { status: caseResult.status } : {}),
932
+ metrics: caseResult?.metrics ?? {},
912
933
  ...(typeof durationMs === "number" ? { durationMs } : {}),
913
934
  ...(caseResult?.source ? { source: caseResult.source } : {}),
914
- ...((caseResult?.feedback ?? sampleResult.feedback) !== undefined
915
- ? { feedback: caseResult?.feedback ?? sampleResult.feedback }
935
+ ...(caseResult?.feedback !== undefined
936
+ ? { feedback: caseResult.feedback }
916
937
  : {}),
917
- phases: args.phases ?? [],
938
+ executions: args.executions ?? [],
918
939
  criteria_results: (caseResult?.criteria ?? []).map((criterion) => ({
919
940
  criterion_id: criterion.criterion_id,
920
941
  pass: criterion.pass,
@@ -924,9 +945,9 @@ export function createCaseReview(args) {
924
945
  })),
925
946
  };
926
947
  }
927
- function uniquePhaseSampleIndex(phases) {
928
- const sampleIndices = new Set(phases
929
- .map((phase) => phase.sampleIndex)
948
+ function uniqueExecutionSampleIndex(executions) {
949
+ const sampleIndices = new Set(executions
950
+ .map((execution) => execution.sampleIndex)
930
951
  .filter((index) => typeof index === "number"));
931
952
  if (sampleIndices.size !== 1) {
932
953
  return null;
@@ -951,6 +972,7 @@ function parseAuthoredWorkbenchSourceSpec(source) {
951
972
  name: resolved.subject.name,
952
973
  description: resolved.subject.description,
953
974
  files: { path: resolved.subject.files.path },
975
+ ...(resolved.subject.prepare ? { prepare: { ...resolved.subject.prepare } } : {}),
954
976
  run: runSpecFromInvocation(resolved.run),
955
977
  },
956
978
  ...(resolved.optimizer
@@ -1101,11 +1123,18 @@ export async function executeWorkbenchExecutionJob(args, options) {
1101
1123
  const runtimeArgs = adapterAuthProfiles.length > 0
1102
1124
  ? { ...args, adapterAuthProfiles }
1103
1125
  : args;
1104
- const executionForSandbox = readWorkbenchExecutionSpec(runtimeArgs.job);
1126
+ const executionForRuntime = readWorkbenchExecutionSpec(runtimeArgs.job);
1127
+ const executor = workbenchExecutionExecutorForRuntimeInput(runtimeArgs);
1128
+ if (executor === "host") {
1129
+ return await withWorkbenchRuntimeControlServer(runtimeArgs, options, startedAt, async (adapterRuntimeEnv) => executeAdapterInCurrentRuntime({
1130
+ ...runtimeArgs,
1131
+ adapterRuntimeEnv,
1132
+ }, executionForRuntime, startedAt, createWorkbenchExecutionCapability(executionForRuntime, { now: startedAt })));
1133
+ }
1105
1134
  const fileStore = createWorkbenchSandboxFileStore(runtimeArgs);
1106
1135
  const planeFactory = options.createSandboxPlaneForProvider ?? createSandboxBackendPlaneForProvider;
1107
1136
  const plane = planeFactory(options.sandboxProvider, runtimeArgs, startedAt, fileStore);
1108
- const validated = await executeValidatedSandboxExecution(plane, executionForSandbox, {
1137
+ const validated = await executeValidatedSandboxExecution(plane, executionForRuntime, {
1109
1138
  now: startedAt,
1110
1139
  runnerId: resolveWorkbenchWorkerId([
1111
1140
  process.env.WORKBENCH_WORKER_ID,
@@ -1121,6 +1150,215 @@ export async function executeWorkbenchExecutionJob(args, options) {
1121
1150
  return failWorkbenchRunJob(args.job, startedAt, error);
1122
1151
  }
1123
1152
  }
1153
+ export function workbenchExecutionExecutorForRuntimeInput(args) {
1154
+ if (args.runtimeControlOperation) {
1155
+ return "sandbox";
1156
+ }
1157
+ const execution = readWorkbenchExecutionSpec(args.job);
1158
+ const operation = adapterOperationForExecutionPurpose(execution.purpose);
1159
+ if (!operation) {
1160
+ return "sandbox";
1161
+ }
1162
+ const manifest = args.adapterManifests?.find((entry) => entry.id === execution.adapter.use);
1163
+ return manifest ? workbenchAdapterOperationExecutor(manifest, operation) : "sandbox";
1164
+ }
1165
+ function adapterOperationForExecutionPurpose(purpose) {
1166
+ if (purpose === "improve") {
1167
+ return "optimizer.improve";
1168
+ }
1169
+ if (purpose === "attempt") {
1170
+ return "engine.run";
1171
+ }
1172
+ return null;
1173
+ }
1174
+ const RUNTIME_CONTROL_MAX_BODY_BYTES = 512 * 1024 * 1024;
1175
+ async function withWorkbenchRuntimeControlServer(args, options, startedAt, run) {
1176
+ const [{ createServer }] = await Promise.all([
1177
+ importNodeModule(nodeBuiltin("http")),
1178
+ ]);
1179
+ const token = randomBytes(24).toString("base64url");
1180
+ const server = createServer((request, response) => {
1181
+ void handleWorkbenchRuntimeControlHttpRequest({
1182
+ request,
1183
+ response,
1184
+ token,
1185
+ args,
1186
+ options,
1187
+ startedAt,
1188
+ });
1189
+ });
1190
+ const url = await new Promise((resolve, reject) => {
1191
+ server.once("error", reject);
1192
+ server.listen(0, "127.0.0.1", () => {
1193
+ server.off("error", reject);
1194
+ const address = server.address();
1195
+ if (!address || typeof address === "string") {
1196
+ reject(new Error("Workbench runtime-control server did not expose a local TCP address."));
1197
+ return;
1198
+ }
1199
+ resolve(`http://127.0.0.1:${address.port}`);
1200
+ });
1201
+ });
1202
+ try {
1203
+ return await run({
1204
+ [WORKBENCH_RUNTIME_CONTROL_URL_ENV]: url,
1205
+ [WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV]: token,
1206
+ });
1207
+ }
1208
+ finally {
1209
+ await new Promise((resolve) => server.close(() => resolve()));
1210
+ }
1211
+ }
1212
+ async function handleWorkbenchRuntimeControlHttpRequest(args) {
1213
+ const { request, response } = args;
1214
+ try {
1215
+ if (request.method !== "POST" || request.url !== "/v1/operation-sequence") {
1216
+ writeRuntimeControlJson(response, 404, { error: "Unknown Workbench runtime-control endpoint." });
1217
+ return;
1218
+ }
1219
+ if (request.headers.authorization !== `Bearer ${args.token}`) {
1220
+ writeRuntimeControlJson(response, 401, { error: "Workbench runtime-control token is invalid." });
1221
+ return;
1222
+ }
1223
+ const parsed = JSON.parse(await readRuntimeControlBody(request));
1224
+ const controlRequest = normalizeRuntimeControlOperationSequenceRequest(parsed);
1225
+ const result = await executeRuntimeControlOperationSequenceInSandbox(args.args, args.options, args.startedAt, controlRequest);
1226
+ writeRuntimeControlJson(response, 200, result);
1227
+ }
1228
+ catch (error) {
1229
+ writeRuntimeControlJson(response, 500, {
1230
+ error: error instanceof Error ? error.stack ?? error.message : String(error),
1231
+ });
1232
+ }
1233
+ }
1234
+ function writeRuntimeControlJson(response, statusCode, payload) {
1235
+ response.statusCode = statusCode;
1236
+ response.setHeader("content-type", "application/json");
1237
+ response.end(`${JSON.stringify(payload, null, 2)}\n`);
1238
+ }
1239
+ function readRuntimeControlBody(request) {
1240
+ return new Promise((resolve, reject) => {
1241
+ const chunks = [];
1242
+ let size = 0;
1243
+ request.on("data", (chunk) => {
1244
+ size += chunk.length;
1245
+ if (size > RUNTIME_CONTROL_MAX_BODY_BYTES) {
1246
+ reject(new Error("Workbench runtime-control request body is too large."));
1247
+ request.destroy();
1248
+ return;
1249
+ }
1250
+ chunks.push(chunk);
1251
+ });
1252
+ request.on("error", reject);
1253
+ request.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
1254
+ });
1255
+ }
1256
+ function normalizeRuntimeControlOperationSequenceRequest(value) {
1257
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
1258
+ throw new Error("Workbench runtime-control operation sequence request must be an object.");
1259
+ }
1260
+ const record = value;
1261
+ if (!Array.isArray(record.operations) || record.operations.length === 0) {
1262
+ throw new Error("Workbench runtime-control operation sequence requires at least one operation.");
1263
+ }
1264
+ const inputs = normalizeRuntimeControlInputs(record.inputs);
1265
+ return {
1266
+ ...(inputs ? { inputs } : {}),
1267
+ operations: record.operations.map((entry, index) => normalizeRuntimeControlOperation(entry, `operations[${index}]`)),
1268
+ ...(typeof record.prepare === "boolean" ? { prepare: record.prepare } : {}),
1269
+ ...(typeof record.collectWorkspace === "boolean" ? { collectWorkspace: record.collectWorkspace } : {}),
1270
+ };
1271
+ }
1272
+ function normalizeRuntimeControlInputs(value) {
1273
+ if (value === undefined) {
1274
+ return undefined;
1275
+ }
1276
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
1277
+ throw new Error("Workbench runtime-control inputs must be an object.");
1278
+ }
1279
+ const record = value;
1280
+ const inputs = {};
1281
+ if (hasOwn(record, "subject")) {
1282
+ inputs.subject = normalizeRuntimeControlFiles(record.subject, "inputs.subject");
1283
+ }
1284
+ if (hasOwn(record, "case")) {
1285
+ inputs.case = normalizeRuntimeControlFiles(record.case, "inputs.case");
1286
+ }
1287
+ if (hasOwn(record, "enginePrivate")) {
1288
+ inputs.enginePrivate = normalizeRuntimeControlFiles(record.enginePrivate, "inputs.enginePrivate");
1289
+ }
1290
+ if (hasOwn(record, "traces")) {
1291
+ inputs.traces = normalizeRuntimeControlFiles(record.traces, "inputs.traces");
1292
+ }
1293
+ if (hasOwn(record, "workspace")) {
1294
+ inputs.workspace = normalizeRuntimeControlFiles(record.workspace, "inputs.workspace");
1295
+ }
1296
+ if (hasOwn(record, "output")) {
1297
+ inputs.output = normalizeRuntimeControlFiles(record.output, "inputs.output");
1298
+ }
1299
+ return inputs;
1300
+ }
1301
+ function normalizeRuntimeControlFiles(value, label) {
1302
+ if (value === undefined) {
1303
+ return [];
1304
+ }
1305
+ if (!Array.isArray(value)) {
1306
+ throw new Error(`Workbench runtime-control ${label} must be an array.`);
1307
+ }
1308
+ return value.map((entry, index) => {
1309
+ if (!isSurfaceSnapshotFile(entry)) {
1310
+ throw new Error(`Workbench runtime-control ${label}[${index}] must be a surface snapshot file.`);
1311
+ }
1312
+ return { ...entry, path: normalizeRelativePath(entry.path) };
1313
+ });
1314
+ }
1315
+ function hasOwn(value, key) {
1316
+ return Object.prototype.hasOwnProperty.call(value, key);
1317
+ }
1318
+ function normalizeRuntimeControlOperation(value, label) {
1319
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
1320
+ throw new Error(`Workbench runtime-control ${label} must be an object.`);
1321
+ }
1322
+ const record = value;
1323
+ const operation = record.operation;
1324
+ if (operation !== "engine.resolve" &&
1325
+ operation !== "engine.run" &&
1326
+ operation !== "subject.run" &&
1327
+ operation !== "optimizer.improve") {
1328
+ throw new Error(`Workbench runtime-control ${label}.operation is invalid.`);
1329
+ }
1330
+ const invocation = record.invocation;
1331
+ if (!invocation || typeof invocation !== "object" || Array.isArray(invocation)) {
1332
+ throw new Error(`Workbench runtime-control ${label}.invocation must be an object.`);
1333
+ }
1334
+ const invocationRecord = invocation;
1335
+ if (typeof invocationRecord.use !== "string" || invocationRecord.use.length === 0) {
1336
+ throw new Error(`Workbench runtime-control ${label}.invocation.use is required.`);
1337
+ }
1338
+ const withConfig = invocationRecord.with === undefined
1339
+ ? {}
1340
+ : isJsonPayload(invocationRecord.with)
1341
+ ? invocationRecord.with
1342
+ : null;
1343
+ if (withConfig === null) {
1344
+ throw new Error(`Workbench runtime-control ${label}.invocation.with must be JSON.`);
1345
+ }
1346
+ if (invocationRecord.auth !== undefined && !isJsonPayload(invocationRecord.auth)) {
1347
+ throw new Error(`Workbench runtime-control ${label}.invocation.auth must be JSON.`);
1348
+ }
1349
+ return {
1350
+ operation,
1351
+ invocation: {
1352
+ use: invocationRecord.use,
1353
+ with: withConfig,
1354
+ ...(invocationRecord.auth !== undefined ? { auth: invocationRecord.auth } : {}),
1355
+ ...(typeof invocationRecord.command === "string" && invocationRecord.command.trim()
1356
+ ? { command: invocationRecord.command }
1357
+ : {}),
1358
+ },
1359
+ ...(typeof record.label === "string" && record.label.trim() ? { label: record.label } : {}),
1360
+ };
1361
+ }
1124
1362
  async function explicitAdapterAuthProfilesForExecution(execution, args, loadLocalAdapterProfiles) {
1125
1363
  const required = requiredAdapterAuthTargetsForExecution(execution, args);
1126
1364
  if (required.length === 0) {
@@ -1155,7 +1393,7 @@ function adapterAuthTargetKey(target) {
1155
1393
  export function workbenchExecutionPurpose(job) {
1156
1394
  return readWorkbenchExecutionPurpose(job);
1157
1395
  }
1158
- export async function executeAdapterInCurrentSandboxRuntime(args, execution, startedAt, capability) {
1396
+ export async function executeAdapterInCurrentRuntime(args, execution, startedAt, capability) {
1159
1397
  const eventPublisher = createWorkbenchExecutionEventPublisher({
1160
1398
  projectId: args.job.projectId,
1161
1399
  runId: args.job.runId,
@@ -1174,10 +1412,10 @@ export async function executeAdapterInCurrentSandboxRuntime(args, execution, sta
1174
1412
  };
1175
1413
  try {
1176
1414
  if (execution.purpose === "improve") {
1177
- return await executeSubjectRevisionExecutionInSandbox(runtimeInput, execution, startedAt, capability, eventPublisher);
1415
+ return await executeSubjectRevisionExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
1178
1416
  }
1179
1417
  if (execution.purpose === "attempt") {
1180
- return await executeAttemptExecutionInSandbox(runtimeInput, execution, startedAt, capability, eventPublisher);
1418
+ return await executeAttemptExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
1181
1419
  }
1182
1420
  throw new Error(`Unsupported execution purpose ${execution.purpose}.`);
1183
1421
  }
@@ -1274,7 +1512,7 @@ function adapterAuthRequest(bundles, root, currentAdapterId) {
1274
1512
  }
1275
1513
  return entries;
1276
1514
  }
1277
- function adapterAuthRequestForPhase(args, adapterId) {
1515
+ function adapterAuthRequestForStep(args, adapterId) {
1278
1516
  const profiles = (args.adapterAuthProfiles ?? [])
1279
1517
  .map((bundle) => sanitizeWorkbenchAdapterAuthBundle(bundle));
1280
1518
  if (profiles.length === 0) {
@@ -1295,12 +1533,19 @@ function adapterAuthProfilesForExecution(execution, args) {
1295
1533
  }
1296
1534
  function requiredAdapterAuthTargetsForExecution(execution, args) {
1297
1535
  const manifests = args.adapterManifests ?? [];
1298
- return collectWorkbenchAdapterAuthRequirements(adapterInvocationsForExecution(execution, args.spec), manifests)
1536
+ return collectWorkbenchAdapterAuthRequirements(adapterInvocationsForExecution(execution, args), manifests)
1299
1537
  .map((target) => normalizeWorkbenchAdapterAuthTarget(target));
1300
1538
  }
1301
- function adapterInvocationsForExecution(execution, spec) {
1539
+ function adapterInvocationsForExecution(execution, args) {
1540
+ if (args.runtimeControlOperation) {
1541
+ return uniqueAdapterInvocations(args.runtimeControlOperation.operations.map((operation) => ({
1542
+ use: operation.invocation.use,
1543
+ with: operation.invocation.with ?? {},
1544
+ ...(operation.invocation.auth !== undefined ? { auth: operation.invocation.auth } : {}),
1545
+ })));
1546
+ }
1302
1547
  if (execution.purpose === "attempt") {
1303
- return uniqueAdapterInvocations([execution.adapter, spec.run]);
1548
+ return uniqueAdapterInvocations([execution.adapter, args.spec.run]);
1304
1549
  }
1305
1550
  return [execution.adapter];
1306
1551
  }
@@ -1341,7 +1586,7 @@ function completedJobFromSandboxResult(fallbackJob, startedAt, result) {
1341
1586
  }
1342
1587
  return attachSandboxMetadataToJob(failWorkbenchRunJob(fallbackJob, result.startedAt || startedAt, result.error ?? `Sandbox execution ${result.status}.`, result.finishedAt), asRuntimeRecord(result.metadata).sandbox);
1343
1588
  }
1344
- async function executeSubjectRevisionExecutionInSandbox(args, execution, startedAt, capability, eventPublisher) {
1589
+ async function executeSubjectRevisionExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
1345
1590
  const { workload, result } = await runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher);
1346
1591
  if (result.error || (result.exitCode ?? 0) !== 0) {
1347
1592
  return failWorkbenchRunJob(args.job, startedAt, result.error ?? `Adapter ${execution.adapter.use} exited with status ${result.exitCode}.`, result.finishedAt, result);
@@ -1382,7 +1627,7 @@ async function executeSubjectRevisionExecutionInSandbox(args, execution, started
1382
1627
  },
1383
1628
  };
1384
1629
  }
1385
- async function executeAttemptExecutionInSandbox(args, execution, startedAt, capability, eventPublisher) {
1630
+ async function executeAttemptExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
1386
1631
  const workload = createWorkbenchRunWorkload({
1387
1632
  job: args.job,
1388
1633
  spec: args.spec,
@@ -1391,7 +1636,7 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
1391
1636
  engineCases: args.engineCases,
1392
1637
  traceFiles: args.traceFiles,
1393
1638
  });
1394
- const workloadResult = await runHostedCommandExecutionPhases(args, workload, attemptPhasesForExecution(execution, args.spec, args.adapterManifests), startedAt, {
1639
+ const workloadResult = await runHostedCommandExecutionSteps(args, workload, attemptStepsForExecution(execution, args.spec, args.adapterManifests), startedAt, {
1395
1640
  capability,
1396
1641
  eventPublisher,
1397
1642
  });
@@ -1405,10 +1650,7 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
1405
1650
  return failWorkbenchRunJob(args.job, startedAt, "Attempt engine must return a workbench-result result with a finite numeric score.", workloadResult.finishedAt, workloadResult);
1406
1651
  }
1407
1652
  const finishedAt = workloadResult.finishedAt ?? new Date().toISOString();
1408
- const usage = mergeUsageSummaries([
1409
- workloadResult.usage,
1410
- engineResult.usage,
1411
- ]);
1653
+ const usage = attemptUsageSummary(workloadResult.usage, engineResult.usage);
1412
1654
  const sample = evaluateSample({
1413
1655
  subjectId: workload.subjectId,
1414
1656
  files: workloadResult.files,
@@ -1453,6 +1695,282 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
1453
1695
  },
1454
1696
  };
1455
1697
  }
1698
/**
 * Runs the runtime-control operation sequence attached to `args` in the
 * current runtime and returns the job record updated with the outcome.
 *
 * @param args Runtime input; must carry `runtimeControlOperation`.
 * @param execution Unused here; the execution spec is re-read from `args.job`.
 * @param startedAt Start timestamp recorded on the returned job.
 * @param capability Unused here.
 * @returns A copy of `args.job` with status, timestamps, optional error and
 *   normalized runtime-control output.
 * @throws {Error} When `args.runtimeControlOperation` is missing.
 */
export async function executeRuntimeControlOperationSequenceInCurrentRuntime(args, execution, startedAt, capability) {
    // Both parameters are accepted but intentionally not consulted.
    void execution;
    void capability;
    const sequence = args.runtimeControlOperation;
    if (!sequence) {
        throw new Error("Runtime-control operation sequence is missing from the sandbox request.");
    }
    const childExecution = readWorkbenchExecutionSpec(args.job);
    const { job, spec, baseFiles, engineResolveFiles, engineCases, traceFiles } = args;
    const workload = createWorkbenchRunWorkload({
        job,
        spec,
        baseFiles,
        engineResolveFiles,
        engineCases,
        traceFiles,
    });
    // Work on a copy so the caller's args keep their adapterRuntimeEnv.
    const runtimeArgs = { ...args };
    delete runtimeArgs.adapterRuntimeEnv;
    const adapterAuth = await materializeSandboxAdapterAuth(runtimeArgs, childExecution);
    let result;
    try {
        const stepArgs = { ...runtimeArgs };
        if (adapterAuth.root) {
            stepArgs.adapterAuthRoot = adapterAuth.root;
        }
        if (Object.keys(adapterAuth.env).length > 0) {
            stepArgs.adapterAuthEnv = adapterAuth.env;
        }
        const steps = sequence.operations.map((operation, index) => runtimeControlStepForOperation(operation, index, args.adapterManifests));
        result = await runHostedCommandExecutionSteps(stepArgs, workload, steps, startedAt, {
            runSubjectPrepare: sequence.prepare ?? false,
            workspaceFiles: sequence.inputs?.workspace ?? [],
            outputFiles: sequence.inputs?.output ?? [],
            collectWorkspace: sequence.collectWorkspace ?? false,
        });
    }
    finally {
        // Cleanup is best effort: a failing cleanup must not mask the step outcome.
        if (adapterAuth.cleanup) {
            await adapterAuth.cleanup().catch(() => undefined);
        }
    }
    const finishedAt = result.finishedAt ?? new Date().toISOString();
    const failed = Boolean(result.error) || (result.exitCode ?? 0) !== 0;
    const completedJob = {
        ...args.job,
        status: failed ? "failed" : "succeeded",
        attempt: Math.max(1, args.job.attempt),
        startedAt,
        finishedAt,
        updatedAt: finishedAt,
    };
    if (failed) {
        completedJob.error = result.error ?? `Runtime-control operation sequence exited with status ${result.exitCode}.`;
    }
    completedJob.output = runtimeControlJobOutput(result, !failed);
    return completedJob;
}
1749
/**
 * Runs a runtime-control operation sequence in a freshly created child
 * sandbox and returns the normalized runtime-control result.
 *
 * A child job/execution is derived from `args` and `request`, a sandbox plane
 * is obtained from `options` (or the default backend factory), the allocation
 * and capability are scope-checked, and the sandbox is always destroyed after
 * it has been created.
 *
 * @param args Parent runtime input.
 * @param options Carries `sandboxProvider` and an optional
 *   `createSandboxPlaneForProvider` factory override.
 * @param startedAt Timestamp used as "now" for allocation/capability checks.
 * @param request Runtime-control request (operations, inputs, flags).
 * @returns The normalized runtime-control result of the completed child job.
 * @throws {Error} When a scope validation fails or the plane rejects.
 */
async function executeRuntimeControlOperationSequenceInSandbox(args, options, startedAt, request) {
    const childArgs = createRuntimeControlSandboxInput(args, request);
    const execution = readWorkbenchExecutionSpec(childArgs.job);
    const fileStore = createWorkbenchSandboxFileStore(childArgs);
    const planeFactory = options.createSandboxPlaneForProvider ?? createSandboxBackendPlaneForProvider;
    const plane = planeFactory(options.sandboxProvider, childArgs, startedAt, fileStore);
    assertSandboxBackendSupportsNetworkPolicy(plane.backend, execution);
    const sandboxOptions = {
        now: startedAt,
        runnerId: resolveWorkbenchWorkerId([
            process.env.WORKBENCH_WORKER_ID,
            process.env.EC2_INSTANCE_ID,
            os.hostname(),
            process.env.HOSTNAME,
        ], "local-runner"),
        fileStore,
    };
    const inputs = await fileStore.materializeInputs(execution);
    const environment = plane.prepareEnvironment
        ? await plane.prepareEnvironment(execution, sandboxOptions)
        : {
            backend: plane.backend.name,
            kind: execution.sandbox.kind,
            ref: execution.sandbox.ref,
        };
    const allocation = createWorkbenchSandboxAllocation(execution, {
        backend: plane.backend.name,
        runnerId: sandboxOptions.runnerId,
        now: startedAt,
    });
    const capability = createWorkbenchExecutionCapability(execution, { now: startedAt });
    assertRuntimeControlScope("Runtime-control sandbox allocation", collectSandboxAllocationScopeIssues(allocation, execution, { now: startedAt }));
    assertRuntimeControlScope("Runtime-control execution capability", collectExecutionCapabilityScopeIssues(capability, execution, { now: startedAt }));
    const sandbox = await plane.createSandbox({
        execution,
        environment,
        allocation,
        capability,
        inputs,
    }, sandboxOptions);
    let result;
    try {
        // The handle check runs inside the try so that a sandbox whose handle
        // fails validation is still destroyed instead of being leaked.
        assertRuntimeControlScope("Runtime-control sandbox handle", collectSandboxHandleScopeIssues(sandbox, allocation, execution));
        result = await plane.exec({
            execution,
            environment,
            sandbox,
            allocation,
            capability,
            inputs,
        }, sandboxOptions);
    }
    finally {
        await plane.destroySandbox(sandbox, sandboxOptions);
    }
    const completedJob = completedJobFromSandboxResult(childArgs.job, startedAt, result);
    return runtimeControlResultFromCompletedJob(completedJob);
}
1807
/**
 * Derives the runtime input for a child runtime-control sandbox run from the
 * parent input and the runtime-control request.
 *
 * The child job and execution get ids suffixed with `:runtime:<nonce>`, file
 * sets come from `request.inputs` when the matching key is present and from
 * the parent workload otherwise, and the adapter is taken from the last
 * requested operation (falling back to the parent execution's adapter).
 *
 * @param args Parent runtime input.
 * @param request Runtime-control request.
 * @returns A copy of `args` retargeted at the child job, without
 *   `adapterRuntimeEnv` and `workspaceRoot`.
 */
function createRuntimeControlSandboxInput(args, request) {
    const parentExecution = readWorkbenchExecutionSpec(args.job);
    const parentWorkload = createWorkbenchRunWorkload({
        job: args.job,
        spec: args.spec,
        baseFiles: args.baseFiles,
        engineResolveFiles: args.engineResolveFiles,
        engineCases: args.engineCases,
        traceFiles: args.traceFiles,
    });
    const nonce = runtimeControlNonce();
    const parentInput = asRuntimeRecord(args.job.input);
    const parentCase = parentWorkload.engineCase;
    const caseId = parentWorkload.caseId;
    const publicFiles = runtimeControlInputFiles(request.inputs, "case", parentCase ? engineCasePublicFiles(parentCase) : []);
    const privateFiles = runtimeControlInputFiles(request.inputs, "enginePrivate", parentCase ? engineCasePrivateFiles(parentCase) : []);
    const subjectFiles = runtimeControlInputFiles(request.inputs, "subject", parentWorkload.subjectFiles);
    const traceFiles = runtimeControlInputFiles(request.inputs, "traces", parentWorkload.traceFiles);
    // The child execution's adapter is the invocation of the final operation.
    const lastInvocation = request.operations[request.operations.length - 1]?.invocation;
    let childAdapter = parentExecution.adapter;
    if (lastInvocation) {
        childAdapter = {
            use: lastInvocation.use,
            with: lastInvocation.with ?? {},
        };
        if (lastInvocation.auth !== undefined) {
            childAdapter.auth = lastInvocation.auth;
        }
    }
    const childExecution = {
        ...parentExecution,
        id: `${parentExecution.id}:runtime:${nonce}`,
        outputs: [],
        adapter: childAdapter,
        metadata: {
            ...asRuntimeRecord(parentExecution.metadata),
            runtimeControl: true,
            caseId,
        },
    };
    const childJob = {
        ...args.job,
        id: `${args.job.id}:runtime:${nonce}`,
        input: {
            ...parentInput,
            execution: childExecution,
            caseId,
        },
    };
    const childArgs = {
        ...args,
        job: childJob,
        baseFiles: subjectFiles,
        engineResolveFiles: [...publicFiles, ...privateFiles],
        engineCases: [
            {
                id: caseId,
                case: parentWorkload.engineCaseSpec ?? {
                    version: 3,
                    prompt: parentWorkload.prompt,
                },
                files: {
                    public: publicFiles,
                    private: privateFiles,
                },
            },
        ],
        traceFiles,
        runtimeControlOperation: request,
    };
    // These two parent-only settings are not forwarded to the child input.
    delete childArgs.adapterRuntimeEnv;
    delete childArgs.workspaceRoot;
    return childArgs;
}
1876
/**
 * Picks the file list for one runtime-control input slot.
 *
 * When `inputs` owns `key`, that entry is used (a nullish entry counts as an
 * empty list); otherwise `fallback` is used. The chosen list is returned as a
 * path-normalized copy.
 *
 * @param inputs Optional runtime-control inputs record.
 * @param key Slot name to look up on `inputs`.
 * @param fallback Files used when `inputs` does not own `key`.
 */
function runtimeControlInputFiles(inputs, key, fallback) {
    const hasOverride = Boolean(inputs) && Object.prototype.hasOwnProperty.call(inputs, key);
    const selected = hasOverride ? (inputs[key] ?? []) : fallback;
    return cloneSurfaceFiles(selected);
}
1882
/**
 * Converts one runtime-control operation into a sandbox execution step.
 *
 * An explicit, non-blank `invocation.command` is used (trimmed); otherwise the
 * command is resolved through `adapterProtocolCommandSpec` for the operation.
 *
 * @param operation Runtime-control operation (`operation`, `invocation`, optional `label`).
 * @param index Zero-based position, used to build the default label.
 * @param manifests Adapter manifests forwarded to command resolution.
 * @returns Step descriptor with kind, label, operation, executor, adapter and command.
 */
function runtimeControlStepForOperation(operation, index, manifests = []) {
    const { invocation, operation: operationName } = operation;
    // Build a fresh adapter record on every call so no object is shared.
    const buildAdapter = () => {
        const adapter = {
            use: invocation.use,
            with: invocation.with ?? {},
        };
        if (invocation.auth !== undefined) {
            adapter.auth = invocation.auth;
        }
        return adapter;
    };
    const explicitCommand = invocation.command?.trim();
    const command = explicitCommand
        ? explicitCommand
        : adapterProtocolCommandSpec(buildAdapter(), operationName, manifests).command;
    let kind;
    switch (operationName) {
        case "subject.run":
            kind = "subject";
            break;
        case "optimizer.improve":
            kind = "optimizer";
            break;
        default:
            kind = "engine";
    }
    const defaultLabel = `${operationName.replace(".", "_")}_${index + 1}`;
    return {
        kind,
        label: operation.label ?? defaultLabel,
        operation: operationName,
        executor: "sandbox",
        adapter: buildAdapter(),
        command,
    };
}
1906
/**
 * Builds the normalized runtime-control result from a completed job record:
 * the job output is the payload, `status === "succeeded"` is the success flag
 * and `job.error` is the fallback error.
 */
function runtimeControlResultFromCompletedJob(job) {
    const output = asRuntimeRecord(job.output);
    const succeeded = job.status === "succeeded";
    return normalizeRuntimeControlResultOutput(output, succeeded, job.error);
}
1909
/**
 * Projects a step-execution result onto the runtime-control output shape and
 * normalizes it.
 *
 * `files` and `fileChanges` are always forwarded; the remaining fields are
 * forwarded only when set on `result`.
 *
 * @param result Result of running the hosted command steps.
 * @param ok Whether the sequence is considered successful.
 */
function runtimeControlJobOutput(result, ok) {
    const raw = {
        ok,
        files: result.files,
        fileChanges: result.fileChanges,
    };
    // Forwarded when truthy.
    for (const key of ["operationResults", "workspaceFiles", "result", "usage"]) {
        if (result[key]) {
            raw[key] = result[key];
        }
    }
    // Forwarded whenever defined (falsy values such as "" are kept).
    for (const key of ["summary", "feedback"]) {
        if (result[key] !== undefined) {
            raw[key] = result[key];
        }
    }
    if (result.error) {
        raw.error = result.error;
    }
    return normalizeRuntimeControlResultOutput(raw, ok, result.error);
}
1923
/**
 * Normalizes a loosely typed runtime-control output record.
 *
 * Array fields are filtered down to entries that pass their validators,
 * `fileChanges` defaults to the paths of the accepted files, and optional
 * fields are only copied when they have the expected type.
 *
 * @param output Raw output record.
 * @param ok Success flag from the caller; combined with `output.ok !== false`.
 * @param fallbackError Error used when `output.error` is not a string.
 */
function normalizeRuntimeControlResultOutput(output, ok, fallbackError) {
    const isRecord = (candidate) => Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
    const files = Array.isArray(output.files)
        ? output.files.filter(isSurfaceSnapshotFile)
        : [];
    // Stays undefined (and is omitted) unless the output provided an array.
    const workspaceFiles = Array.isArray(output.workspaceFiles)
        ? output.workspaceFiles.filter(isSurfaceSnapshotFile)
        : undefined;
    const operationResults = Array.isArray(output.operationResults)
        ? output.operationResults.filter(isWorkbenchAdapterOperationResult)
        : [];
    const fileChanges = Array.isArray(output.fileChanges)
        ? output.fileChanges.filter((entry) => typeof entry === "string")
        : files.map((file) => file.path);
    const normalized = {
        ok: ok && output.ok !== false,
        files,
        fileChanges,
        operationResults,
    };
    if (workspaceFiles) {
        normalized.workspaceFiles = workspaceFiles;
    }
    if (isRecord(output.result)) {
        normalized.result = output.result;
    }
    if (isRecord(output.usage)) {
        normalized.usage = output.usage;
    }
    if (typeof output.summary === "string") {
        normalized.summary = output.summary;
    }
    if (output.feedback !== undefined && isJsonPayload(output.feedback)) {
        normalized.feedback = output.feedback;
    }
    if (typeof output.error === "string") {
        normalized.error = output.error;
    }
    else if (fallbackError) {
        normalized.error = fallbackError;
    }
    return normalized;
}
1952
/**
 * Reports whether `value` is a non-array object tagged with the
 * `workbench.adapter-result.v1` protocol and one of the four known operations.
 *
 * @param value Candidate value of any type.
 * @returns {boolean}
 */
function isWorkbenchAdapterOperationResult(value) {
    const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isRecord) {
        return false;
    }
    if (value.protocol !== "workbench.adapter-result.v1") {
        return false;
    }
    switch (value.operation) {
        case "engine.resolve":
        case "engine.run":
        case "subject.run":
        case "optimizer.improve":
            return true;
        default:
            return false;
    }
}
1963
/**
 * Returns shallow copies of the given file records, each with its `path`
 * passed through `normalizeRelativePath`.
 */
function cloneSurfaceFiles(files) {
    const withNormalizedPath = (file) => {
        const copy = { ...file, path: normalizeRelativePath(file.path) };
        return copy;
    };
    return files.map((file) => withNormalizedPath(file));
}
1966
/**
 * Generates the random suffix used for child runtime-control ids:
 * 6 random bytes rendered as 12 lowercase hex characters.
 */
function runtimeControlNonce() {
    const nonceBytes = randomBytes(6);
    return nonceBytes.toString("hex");
}
1969
/**
 * Throws when a scope check reported any issues.
 *
 * @param label Prefix that names the checked object in the error message.
 * @param issues Issue strings; an empty list means the check passed.
 * @throws {Error} Listing every issue on its own line.
 */
function assertRuntimeControlScope(label, issues) {
    if (issues.length <= 0) {
        return;
    }
    const details = issues.join("\n");
    throw new Error(`${label} failed validation:\n${details}`);
}
1456
1974
  async function runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher) {
1457
1975
  const workload = createWorkbenchRunWorkload({
1458
1976
  job: args.job,
@@ -1462,13 +1980,13 @@ async function runHostedProtocolExecutionResult(args, execution, startedAt, capa
1462
1980
  engineCases: args.engineCases,
1463
1981
  traceFiles: args.traceFiles,
1464
1982
  });
1465
- const result = await runHostedCommandExecutionPhases(args, workload, [protocolPhaseForExecution(execution, args.adapterManifests)], startedAt, {
1983
+ const result = await runHostedCommandExecutionSteps(args, workload, [protocolStepForExecution(execution, args.adapterManifests)], startedAt, {
1466
1984
  capability,
1467
1985
  eventPublisher,
1468
1986
  });
1469
1987
  return { workload, result };
1470
1988
  }
1471
- async function runHostedCommandExecutionPhases(args, workload, phases, startedAt, options = {}) {
1989
+ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt, options = {}) {
1472
1990
  const [{ execFile }, fs, os, path, { promisify }] = await Promise.all([
1473
1991
  importNodeModule(nodeBuiltin("child_process")),
1474
1992
  importNodeModule(nodeBuiltin("fs/promises")),
@@ -1489,9 +2007,22 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
1489
2007
  const workspace = await createRuntimeWorkspaceRoot(args, fs, os, path, "workbench-execution-sandbox-");
1490
2008
  try {
1491
2009
  await stageWorkbenchRunWorkload(workspace.root, workload);
2010
+ if (options.workspaceFiles && options.workspaceFiles.length > 0) {
2011
+ await stageInitialWorkspaceFiles(workspace.root, options.workspaceFiles);
2012
+ }
2013
+ if (options.outputFiles && options.outputFiles.length > 0) {
2014
+ await writeSurfaceFiles(outputDir(workspace.root), options.outputFiles);
2015
+ }
2016
+ const execution = readWorkbenchExecutionSpec(workload.job);
2017
+ const hostAdapterIds = new Set(steps.flatMap((step) => step.executor === "host"
2018
+ ? [step.adapter?.use ?? execution.adapter.use]
2019
+ : []));
2020
+ const hostAdapterRoots = hostAdapterIds.size > 0
2021
+ ? await materializeHostAdapterRoots(workspace.root, args.adapterFiles ?? [], hostAdapterIds)
2022
+ : new Map();
1492
2023
  let exitCode = 0;
1493
2024
  let runtimeError;
1494
- const phaseResults = [];
2025
+ const operationResults = [];
1495
2026
  try {
1496
2027
  if (!environmentVersion) {
1497
2028
  throw new Error("environment is required for adapter command executions.");
@@ -1503,49 +2034,64 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
1503
2034
  network: environmentVersion.spec.network,
1504
2035
  }, null, 2)}\n`);
1505
2036
  }
1506
- const phaseTimeoutMs = environmentVersion
2037
+ const stepTimeoutMs = environmentVersion
1507
2038
  ? environmentVersionTimeoutMs(environmentVersion)
1508
2039
  : 5 * 60 * 1000;
1509
- const execution = readWorkbenchExecutionSpec(workload.job);
1510
- for (const phase of phases) {
1511
- await resetHostedWorkloadPhaseOutput(workspace.root, phase);
1512
- if (phase.kind === "engine" && execution.purpose === "attempt") {
1513
- await stageAttemptScoringInputs(workspace.root, workload);
2040
+ const shouldRunSubjectPrepare = options.runSubjectPrepare ?? steps.some((step) => step.executor === "sandbox");
2041
+ if (shouldRunSubjectPrepare) {
2042
+ await runSubjectPrepareCommand({
2043
+ root: workspace.root,
2044
+ workload,
2045
+ execution,
2046
+ execFileAsync,
2047
+ timeoutMs: stepTimeoutMs,
2048
+ eventPublisher: options.eventPublisher,
2049
+ });
2050
+ }
2051
+ let enginePrivateStaged = false;
2052
+ for (const step of steps) {
2053
+ if (step.kind === "engine" && !enginePrivateStaged) {
2054
+ await stageWorkbenchEnginePrivateFiles(workspace.root, workload);
2055
+ enginePrivateStaged = true;
1514
2056
  }
1515
- const adapterRequestPath = await writeWorkbenchAdapterRequest(workspace.root, workload, execution, phase, adapterAuthRequestForPhase(args, phase.adapter?.use ?? execution.adapter.use), args.adapterManifests);
1516
- const phaseRole = phaseEventRole(phase);
1517
- await publishCommandPhaseEvent(options.eventPublisher, {
1518
- phase: phase.label,
2057
+ await resetHostedWorkloadStepOutput(workspace.root);
2058
+ const adapterRequestPath = await writeWorkbenchAdapterRequest(workspace.root, workload, execution, step, adapterAuthRequestForStep(args, step.adapter?.use ?? execution.adapter.use), args.adapterManifests);
2059
+ const stepRole = stepEventRole(step);
2060
+ await publishCommandStepEvent(options.eventPublisher, {
2061
+ step: step.label,
1519
2062
  status: "started",
1520
- ...(phaseRole ? { role: phaseRole } : {}),
2063
+ ...(stepRole ? { role: stepRole } : {}),
1521
2064
  });
1522
2065
  try {
1523
- if (!phase.command) {
1524
- throw new Error(`Adapter phase ${phase.label} is missing a command.`);
2066
+ if (!step.command) {
2067
+ throw new Error(`Adapter step ${step.label} is missing a command.`);
1525
2068
  }
1526
- const command = createHostedWorkloadShellCommand(workspace.root, phase.command, phase.label, phase.okExitCodes);
2069
+ const adapterRoot = step.executor === "host"
2070
+ ? hostAdapterRoots.get(step.adapter?.use ?? execution.adapter.use)
2071
+ : undefined;
2072
+ const command = createHostedWorkloadShellCommand(workspace.root, step.command, step.label, step.okExitCodes);
1527
2073
  await execFileAsync("sh", ["-c", command], {
1528
- cwd: workspace.root,
1529
- env: createHostedWorkloadPhaseEnv(workspace.root, adapterRequestPath, args.adapterAuthEnv),
2074
+ cwd: adapterRoot ?? workspace.root,
2075
+ env: createHostedWorkloadAdapterEnv(workspace.root, adapterRequestPath, args.adapterAuthEnv, adapterRoot ? { adapterRoot } : undefined, args.adapterRuntimeEnv),
1530
2076
  maxBuffer: 10 * 1024 * 1024,
1531
- timeout: phaseTimeoutMs,
2077
+ timeout: stepTimeoutMs,
1532
2078
  });
1533
- const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), phase.operation);
1534
- assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${phase.adapter?.use ?? execution.adapter.use} ${phase.operation}`);
1535
- phaseResults.push(operationResult);
1536
- await publishCommandPhaseEvent(options.eventPublisher, {
1537
- phase: phase.label,
2079
+ const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), step.operation);
2080
+ assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${step.adapter?.use ?? execution.adapter.use} ${step.operation}`);
2081
+ operationResults.push(operationResult);
2082
+ await publishCommandStepEvent(options.eventPublisher, {
2083
+ step: step.label,
1538
2084
  status: "succeeded",
1539
- ...(phaseRole ? { role: phaseRole } : {}),
2085
+ ...(stepRole ? { role: stepRole } : {}),
1540
2086
  });
1541
2087
  }
1542
2088
  catch (error) {
1543
- await publishCommandPhaseEvent(options.eventPublisher, {
1544
- phase: phase.label,
2089
+ await publishCommandStepEvent(options.eventPublisher, {
2090
+ step: step.label,
1545
2091
  status: "failed",
1546
2092
  exitCode: readExitCode(error),
1547
2093
  error: error instanceof Error ? error.message : String(error),
1548
- ...(phaseRole ? { role: phaseRole } : {}),
2094
+ ...(stepRole ? { role: stepRole } : {}),
1549
2095
  });
1550
2096
  throw error;
1551
2097
  }
@@ -1569,16 +2115,56 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
1569
2115
  startedAt,
1570
2116
  });
1571
2117
  }
1572
- return await readWorkbenchRunWorkloadResult(workspace.root, workload, {
2118
+ const result = await readWorkbenchRunWorkloadResult(workspace.root, workload, {
1573
2119
  exitCode,
1574
2120
  startedAt,
1575
- phaseResults,
2121
+ operationResults,
1576
2122
  });
2123
+ if (options.collectWorkspace) {
2124
+ result.workspaceFiles = await readMutableWorkspaceSnapshotFiles(workspace.root);
2125
+ }
2126
+ return result;
1577
2127
  }
1578
2128
  finally {
1579
2129
  await workspace.cleanup();
1580
2130
  }
1581
2131
  }
2132
/**
 * Runs the subject's optional prepare command in the workspace root and
 * publishes started/succeeded/failed step events under the `subject_prepare`
 * step name.
 *
 * Does nothing when the workload spec declares no prepare command.
 *
 * @param args Record with `root`, `workload`, `execution`, `execFileAsync`,
 *   `timeoutMs` and an optional `eventPublisher`.
 * @throws {Error} "Subject prepare command failed: …" when the command (or
 *   event publishing inside the guarded section) fails; the underlying error
 *   is attached as `cause`.
 */
async function runSubjectPrepareCommand(args) {
    const command = args.workload.spec.subject.prepare?.command;
    if (!command) {
        return;
    }
    // Events are attributed to the optimizer for "improve" runs, else to the runner.
    const role = args.execution.purpose === "improve" ? "optimizer" : "runner";
    await publishCommandStepEvent(args.eventPublisher, {
        step: "subject_prepare",
        status: "started",
        role,
    });
    try {
        const shellCommand = createHostedWorkloadShellCommand(args.root, command, "subject_prepare");
        await args.execFileAsync("sh", ["-c", shellCommand], {
            cwd: args.root,
            env: createHostedWorkloadPrepareEnv(args.root),
            maxBuffer: 10 * 1024 * 1024,
            timeout: args.timeoutMs,
        });
        await publishCommandStepEvent(args.eventPublisher, {
            step: "subject_prepare",
            status: "succeeded",
            role,
        });
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        await publishCommandStepEvent(args.eventPublisher, {
            step: "subject_prepare",
            status: "failed",
            exitCode: readExitCode(error),
            error: reason,
            role,
        });
        // Keep the original error reachable (exit code, stderr, stack) instead
        // of flattening it into the message only.
        throw new Error(`Subject prepare command failed: ${reason}`, { cause: error });
    }
}
1582
2168
  async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
1583
2169
  if (args.workspaceRoot) {
1584
2170
  await fs.mkdir(args.workspaceRoot, { recursive: true });
@@ -1614,19 +2200,22 @@ async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
1614
2200
  },
1615
2201
  };
1616
2202
  }
1617
- function phaseEventRole(phase) {
1618
- if (phase.kind === "optimizer") {
2203
+ function stepEventRole(step) {
2204
+ if (step.kind === "optimizer") {
1619
2205
  return "optimizer";
1620
2206
  }
1621
- if (phase.kind === "runner") {
2207
+ if (step.kind === "subject") {
1622
2208
  return "runner";
1623
2209
  }
1624
- if (phase.kind === "engine") {
2210
+ if (step.kind === "engine") {
1625
2211
  return "engine";
1626
2212
  }
1627
2213
  return undefined;
1628
2214
  }
1629
2215
  function adapterOperationUsageSummary(result) {
2216
+ if (hasExplicitUsageRole(result.usage)) {
2217
+ return completeUsageSummary(result.usage);
2218
+ }
1630
2219
  if (result.operation === "optimizer.improve") {
1631
2220
  return assignUsageRole("optimizer", result.usage);
1632
2221
  }
@@ -1638,11 +2227,16 @@ function adapterOperationUsageSummary(result) {
1638
2227
  }
1639
2228
  return result.usage;
1640
2229
  }
1641
- function executionPurposeRole(purpose) {
1642
- if (purpose === "improve") {
1643
- return "optimizer";
1644
- }
1645
- return "runner";
2230
/**
 * Combines workload usage with the engine result's usage for an attempt.
 *
 * When the completed workload usage already has an `engine` entry, the result
 * usage is ignored; otherwise it is assigned to the "engine" role and merged in.
 *
 * @param workloadUsage Usage reported by the workload result.
 * @param resultUsage Usage reported on the engine result.
 */
function attemptUsageSummary(workloadUsage, resultUsage) {
    const workloadSummary = completeUsageSummary(workloadUsage);
    let engineFallback;
    if (!workloadSummary?.engine) {
        engineFallback = assignUsageRole("engine", resultUsage);
    }
    return mergeUsageSummaries([workloadSummary, engineFallback]);
}
2237
/**
 * Reports whether the completed usage summary has a truthy `optimizer`,
 * `runner` or `engine` entry.
 */
function hasExplicitUsageRole(usage) {
    const summary = completeUsageSummary(usage);
    const roles = ["optimizer", "runner", "engine"];
    return roles.some((role) => Boolean(summary?.[role]));
}
1647
2241
  function createSubjectPatchFromResult(result, spec) {
1648
2242
  if (result.subjectPatch) {
@@ -1720,47 +2314,103 @@ export async function stageWorkbenchRunWorkload(root, workload) {
1720
2314
  fs
1721
2315
  .rm(runtimePrivateDir(root), { recursive: true, force: true })
1722
2316
  .catch(() => undefined),
1723
- fs
1724
- .rm(runtimeLogsDir(root), { recursive: true, force: true })
1725
- .catch(() => undefined),
1726
2317
  ]);
1727
2318
  await fs.mkdir(inputDir(root), { recursive: true });
1728
2319
  await fs.mkdir(outputDir(root), { recursive: true });
1729
2320
  if (purpose === "attempt") {
1730
- assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
1731
2321
  await fs.mkdir(subjectDir(root), { recursive: true });
1732
2322
  await fs.mkdir(caseDir(root), { recursive: true });
1733
- await fs.mkdir(runtimeLogsAgentDir(root), { recursive: true });
1734
- await fs.mkdir(runtimeLogsVerifierDir(root), { recursive: true });
1735
2323
  const engineCase = requireWorkloadEngineCase(workload, "Attempt staging");
1736
2324
  await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
1737
- await writeSurfaceFiles(caseDir(root), engineCaseSubjectVisibleFiles(engineCase));
1738
- await writeSurfaceFiles(root, workload.subjectFiles);
2325
+ await writeSurfaceFiles(caseDir(root), engineCasePublicFiles(engineCase));
1739
2326
  return;
1740
2327
  }
1741
2328
  if (purpose === "improve") {
1742
- assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
1743
2329
  await fs.mkdir(subjectDir(root), { recursive: true });
1744
2330
  await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
1745
- await writeSurfaceFiles(root, workload.subjectFiles);
1746
2331
  await fs.mkdir(tracesDir(root), { recursive: true });
1747
2332
  await writeSurfaceFiles(tracesDir(root), workload.traceFiles);
1748
2333
  }
1749
2334
  }
1750
- async function stageAttemptScoringInputs(root, workload) {
2335
+ async function stageWorkbenchEnginePrivateFiles(root, workload) {
2336
+ if (readWorkloadExecutionPurpose(workload) !== "attempt") {
2337
+ return;
2338
+ }
1751
2339
  const fs = await importNodeModule(nodeBuiltin("fs/promises"));
1752
- const engineCase = requireWorkloadEngineCase(workload, "Attempt scoring");
1753
- await Promise.all([
1754
- fs
1755
- .rm(runtimeEnginePrivateDir(root), { recursive: true, force: true })
1756
- .catch(() => undefined),
1757
- fs
1758
- .rm(runtimeLogsVerifierDir(root), { recursive: true, force: true })
1759
- .catch(() => undefined),
1760
- ]);
1761
2340
  await fs.mkdir(runtimeEnginePrivateDir(root), { recursive: true });
1762
- await fs.mkdir(runtimeLogsVerifierDir(root), { recursive: true });
1763
- await writeSurfaceFiles(runtimeEnginePrivateDir(root), engineCaseEnginePrivateFiles(engineCase));
2341
+ await writeSurfaceFiles(runtimeEnginePrivateDir(root), engineCasePrivateFiles(requireWorkloadEngineCase(workload, "Engine-private staging")));
2342
+ }
2343
+ async function stageInitialWorkspaceFiles(root, files) {
2344
+ await writeSurfaceFiles(root, files.filter((file) => isMutableWorkspaceSnapshotPath(file.path)));
2345
+ }
2346
+ async function readMutableWorkspaceSnapshotFiles(root) {
2347
+ return (await readSurfaceFiles(root))
2348
+ .filter((file) => isMutableWorkspaceSnapshotPath(file.path))
2349
+ .sort((left, right) => left.path.localeCompare(right.path));
2350
+ }
2351
+ function isMutableWorkspaceSnapshotPath(filePath) {
2352
+ const normalized = normalizeRelativePath(filePath);
2353
+ return Boolean(normalized &&
2354
+ !normalized.startsWith("../") &&
2355
+ normalized !== "input" &&
2356
+ !normalized.startsWith("input/") &&
2357
+ normalized !== "private" &&
2358
+ !normalized.startsWith("private/") &&
2359
+ normalized !== "output" &&
2360
+ !normalized.startsWith("output/") &&
2361
+ normalized !== ".workbench" &&
2362
+ !normalized.startsWith(".workbench/"));
2363
+ }
2364
+ async function materializeHostAdapterRoots(root, adapterFiles, adapterIds) {
2365
+ if (adapterFiles.length === 0 || adapterIds.size === 0) {
2366
+ return new Map();
2367
+ }
2368
+ const fs = await importNodeModule(nodeBuiltin("fs/promises"));
2369
+ const path = await importNodeModule(nodeBuiltin("path"));
2370
+ const sourceRoots = hostAdapterSourceRoots(adapterFiles, adapterIds);
2371
+ const roots = new Map();
2372
+ for (const [adapterId, sourceRoot] of sourceRoots) {
2373
+ const targetRoot = path.join(root, ".workbench", "adapters", adapterId);
2374
+ const files = adapterFiles.flatMap((file) => {
2375
+ const relativePath = adapterFilePathWithinRoot(file.path, sourceRoot);
2376
+ return relativePath === null
2377
+ ? []
2378
+ : [{ ...file, path: relativePath }];
2379
+ });
2380
+ await fs.rm(targetRoot, { recursive: true, force: true }).catch(() => undefined);
2381
+ await fs.mkdir(targetRoot, { recursive: true });
2382
+ await writeSurfaceFiles(targetRoot, files);
2383
+ roots.set(adapterId, await fs.realpath(targetRoot));
2384
+ }
2385
+ return roots;
2386
+ }
2387
+ function hostAdapterSourceRoots(adapterFiles, adapterIds) {
2388
+ const roots = new Map();
2389
+ for (const file of adapterFiles) {
2390
+ const normalized = normalizeRelativePath(file.path);
2391
+ if (!normalized.endsWith("workbench.adapter.yaml")) {
2392
+ continue;
2393
+ }
2394
+ const manifest = parseWorkbenchAdapterManifest(file.content);
2395
+ if (!adapterIds.has(manifest.id)) {
2396
+ continue;
2397
+ }
2398
+ const sourceRoot = normalized === "workbench.adapter.yaml"
2399
+ ? ""
2400
+ : normalized.slice(0, -"workbench.adapter.yaml".length).replace(/\/+$/u, "");
2401
+ roots.set(manifest.id, sourceRoot);
2402
+ }
2403
+ return roots;
2404
+ }
2405
+ function adapterFilePathWithinRoot(filePath, sourceRoot) {
2406
+ const normalized = normalizeRelativePath(filePath);
2407
+ if (!sourceRoot) {
2408
+ return normalized;
2409
+ }
2410
+ if (!normalized.startsWith(`${sourceRoot}/`)) {
2411
+ return null;
2412
+ }
2413
+ return normalized.slice(sourceRoot.length + 1);
1764
2414
  }
1765
2415
  async function readHostedRunFailureResult(root, workload, options) {
1766
2416
  const traceFiles = await readRuntimeTraceFiles(root, workload);
@@ -1788,16 +2438,16 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
1788
2438
  const primaryOperation = purpose === "improve"
1789
2439
  ? "optimizer.improve"
1790
2440
  : "engine.run";
1791
- const primaryResult = [...(options.phaseResults ?? [])]
2441
+ const primaryResult = [...(options.operationResults ?? [])]
1792
2442
  .reverse()
1793
2443
  .find((result) => result.operation === primaryOperation);
1794
2444
  const resultPayload = jsonRecord(primaryResult?.value);
1795
2445
  const usage = mergeUsageSummaries([
1796
2446
  options.usage,
1797
- ...(options.phaseResults ?? []).map(adapterOperationUsageSummary),
2447
+ ...(options.operationResults ?? []).map(adapterOperationUsageSummary),
1798
2448
  ]);
1799
- const metrics = normalizeRewardMetrics(resultPayload.metrics);
1800
- const cases = normalizeRewardCases(resultPayload.cases);
2449
+ const metrics = normalizeResultMetrics(resultPayload.metrics);
2450
+ const cases = normalizeResultCases(resultPayload.cases);
1801
2451
  const includeResultScoring = purpose === "attempt";
1802
2452
  const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
1803
2453
  const subjectPatch = purpose === "improve" ? primaryResult?.value : undefined;
@@ -1809,6 +2459,7 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
1809
2459
  return {
1810
2460
  files,
1811
2461
  fileChanges: declaredChanges,
2462
+ ...(options.operationResults ? { operationResults: [...options.operationResults] } : {}),
1812
2463
  ...(subjectPatch ? { subjectPatch } : {}),
1813
2464
  ...(engineResult ? { result: engineResult } : {}),
1814
2465
  ...(includeResultScoring && metrics ? { metrics } : {}),
@@ -1835,10 +2486,10 @@ async function readRuntimeTraceFiles(root, workload) {
1835
2486
  const path = await importNodeModule(nodeBuiltin("path"));
1836
2487
  const traceRoot = path.join(outputDir(root), ".workbench", "traces", workload.job.id);
1837
2488
  const purpose = readWorkloadExecutionPurpose(workload);
1838
- const outputTraceRoot = workbenchTracePhaseDirectory({
2489
+ const outputTraceRoot = workbenchTraceExecutionDirectory({
1839
2490
  sequence: 1,
1840
2491
  runId: workload.job.runId,
1841
- phase: purpose,
2492
+ purpose,
1842
2493
  });
1843
2494
  return (await readSurfaceFiles(traceRoot)).map((file) => ({
1844
2495
  ...file,
@@ -1868,13 +2519,13 @@ function createHostedWorkloadShellCommand(root, command, prefix = "", okExitCode
1868
2519
  'exit "$status"',
1869
2520
  ].join("; ");
1870
2521
  }
1871
- async function resetHostedWorkloadPhaseOutput(root, _phase) {
2522
+ async function resetHostedWorkloadStepOutput(root) {
1872
2523
  const fs = await importNodeModule(nodeBuiltin("fs/promises"));
1873
2524
  await fs
1874
2525
  .rm(workbenchAdapterOperationResultPath(outputDir(root)), { force: true })
1875
2526
  .catch(() => undefined);
1876
2527
  }
1877
- async function writeWorkbenchAdapterRequest(root, workload, execution, phase, auth, manifests) {
2528
+ async function writeWorkbenchAdapterRequest(root, workload, execution, step, auth, manifests) {
1878
2529
  const [fs, path] = await Promise.all([
1879
2530
  importNodeModule(nodeBuiltin("fs/promises")),
1880
2531
  importNodeModule(nodeBuiltin("path")),
@@ -1882,13 +2533,13 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
1882
2533
  const requestPath = path.join(root, ".workbench", "request.json");
1883
2534
  await fs.mkdir(path.dirname(requestPath), { recursive: true });
1884
2535
  const casePrompt = workload.engineCaseSpec?.prompt;
1885
- const adapter = phase.adapter ?? execution.adapter;
2536
+ const adapter = step.adapter ?? execution.adapter;
1886
2537
  const subjectCommand = adapterProtocolCommandSpec(workload.spec.run, "subject.run", manifests).command;
1887
2538
  await fs.writeFile(requestPath, `${JSON.stringify({
1888
2539
  protocol: "workbench.adapter.v3",
1889
2540
  id: execution.id,
1890
2541
  jobId: workload.job.id,
1891
- operation: phase.operation,
2542
+ operation: step.operation,
1892
2543
  invocation: {
1893
2544
  use: adapter.use,
1894
2545
  with: adapterConfigRecord(adapter, manifests),
@@ -1903,6 +2554,7 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
1903
2554
  subject: {
1904
2555
  id: workload.subjectId,
1905
2556
  path: workload.spec.subject.files.path,
2557
+ ...(workload.spec.subject.prepare ? { prepare: { ...workload.spec.subject.prepare } } : {}),
1906
2558
  run: {
1907
2559
  ...workload.spec.run,
1908
2560
  command: subjectCommand,
@@ -1923,14 +2575,12 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
1923
2575
  },
1924
2576
  paths: {
1925
2577
  workspace: root,
1926
- cwd: root,
1927
2578
  output: outputDir(root),
1928
2579
  result: workbenchAdapterOperationResultPath(outputDir(root)),
1929
2580
  subject: subjectDir(root),
1930
2581
  ...(workload.engineCaseSpec ? { case: caseDir(root) } : {}),
1931
2582
  traces: tracesDir(root),
1932
- ...(phase.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
1933
- logs: runtimeLogsDir(root),
2583
+ ...(step.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
1934
2584
  },
1935
2585
  }, null, 2)}\n`);
1936
2586
  return requestPath;
@@ -1945,7 +2595,29 @@ function requireOptimizerEdits(spec) {
1945
2595
  }
1946
2596
  return edits;
1947
2597
  }
1948
- function createHostedWorkloadPhaseEnv(root, adapterRequestPath, adapterEnv = {}) {
2598
+ function createHostedWorkloadAdapterEnv(root, adapterRequestPath, adapterEnv = {}, options = {}, runtimeEnv = {}) {
2599
+ const env = createHostedWorkloadBaseEnv();
2600
+ env.WORKBENCH_ADAPTER_REQUEST = adapterRequestPath;
2601
+ env.WORKBENCH_OUTPUT = outputDir(root);
2602
+ env.WORKBENCH_RESULT = workbenchAdapterOperationResultPath(outputDir(root));
2603
+ if (options.adapterRoot) {
2604
+ env.WORKBENCH_ADAPTER_ROOT = options.adapterRoot;
2605
+ env.WORKBENCH_WORKSPACE_ROOT = root;
2606
+ env.PATH = [
2607
+ `${options.adapterRoot}/node_modules/.bin`,
2608
+ env.PATH,
2609
+ ].filter(Boolean).join(":");
2610
+ }
2611
+ Object.assign(env, adapterEnv);
2612
+ Object.assign(env, runtimeEnv);
2613
+ return env;
2614
+ }
2615
+ function createHostedWorkloadPrepareEnv(root) {
2616
+ const env = createHostedWorkloadBaseEnv();
2617
+ env.WORKBENCH_OUTPUT = outputDir(root);
2618
+ return env;
2619
+ }
2620
+ function createHostedWorkloadBaseEnv() {
1949
2621
  const env = {};
1950
2622
  for (const [key, value] of Object.entries(process.env)) {
1951
2623
  if (typeof value === "string") {
@@ -1957,20 +2629,52 @@ function createHostedWorkloadPhaseEnv(root, adapterRequestPath, adapterEnv = {})
1957
2629
  delete env[key];
1958
2630
  }
1959
2631
  }
1960
- const runtimeBins = [
2632
+ const runtimeBins = uniquePathEntries([
2633
+ ...nodeModuleBinDirsForAncestors(process.cwd()),
2634
+ ...nodeModuleBinDirsForAncestors(path.dirname(fileURLToPath(import.meta.url))),
2635
+ "/app/node_modules/.bin",
1961
2636
  "/workbench-runtime/node_modules/.bin",
1962
2637
  "/workbench-runtime/products/workbench/node_modules/.bin",
1963
- ].join(":");
1964
- const systemBins = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin";
1965
- env.PATH = process.env.PATH
1966
- ? `${systemBins}:${runtimeBins}:${process.env.PATH}`
1967
- : `${systemBins}:${runtimeBins}`;
1968
- env.WORKBENCH_ADAPTER_REQUEST = adapterRequestPath;
1969
- env.WORKBENCH_OUTPUT = outputDir(root);
1970
- env.WORKBENCH_RESULT = workbenchAdapterOperationResultPath(outputDir(root));
1971
- Object.assign(env, adapterEnv);
2638
+ ]);
2639
+ env.PATH = uniquePathEntries([
2640
+ path.dirname(process.execPath),
2641
+ "/usr/local/sbin",
2642
+ "/usr/local/bin",
2643
+ "/usr/sbin",
2644
+ "/usr/bin",
2645
+ "/sbin",
2646
+ "/bin",
2647
+ ...runtimeBins,
2648
+ ...(process.env.PATH ? process.env.PATH.split(path.delimiter) : []),
2649
+ ]).join(path.delimiter);
1972
2650
  return env;
1973
2651
  }
2652
+ function nodeModuleBinDirsForAncestors(start) {
2653
+ const dirs = [];
2654
+ let current = path.resolve(start);
2655
+ for (let depth = 0; depth < 12; depth += 1) {
2656
+ dirs.push(path.join(current, "node_modules", ".bin"));
2657
+ const parent = path.dirname(current);
2658
+ if (parent === current) {
2659
+ break;
2660
+ }
2661
+ current = parent;
2662
+ }
2663
+ return dirs;
2664
+ }
2665
+ function uniquePathEntries(entries) {
2666
+ const seen = new Set();
2667
+ const output = [];
2668
+ for (const entry of entries) {
2669
+ const trimmed = entry.trim();
2670
+ if (!trimmed || seen.has(trimmed)) {
2671
+ continue;
2672
+ }
2673
+ seen.add(trimmed);
2674
+ output.push(trimmed);
2675
+ }
2676
+ return output;
2677
+ }
1974
2678
  function readWorkloadExecutionPurpose(workload) {
1975
2679
  const purpose = workbenchExecutionPurpose(workload.job);
1976
2680
  if (purpose === "improve" || purpose === "attempt") {
@@ -2005,35 +2709,6 @@ function runtimePrivateDir(root) {
2005
2709
  function runtimeEnginePrivateDir(root) {
2006
2710
  return `${runtimePrivateDir(root)}/engine`;
2007
2711
  }
2008
- function runtimeLogsDir(root) {
2009
- return `${root}/logs`;
2010
- }
2011
- function runtimeLogsAgentDir(root) {
2012
- return `${runtimeLogsDir(root)}/agent`;
2013
- }
2014
- function runtimeLogsVerifierDir(root) {
2015
- return `${runtimeLogsDir(root)}/verifier`;
2016
- }
2017
- function assertMutableWorkspaceFiles(files, label) {
2018
- const reserved = files
2019
- .map((file) => normalizeRelativePath(file.path))
2020
- .filter(isRuntimeReservedWorkspacePath);
2021
- if (reserved.length > 0) {
2022
- throw new Error(`${label} cannot target runtime-reserved workspace paths: ${reserved.join(", ")}.`);
2023
- }
2024
- }
2025
- function isRuntimeReservedWorkspacePath(normalizedPath) {
2026
- return normalizedPath === ".workbench" ||
2027
- normalizedPath.startsWith(".workbench/") ||
2028
- normalizedPath === "input" ||
2029
- normalizedPath.startsWith("input/") ||
2030
- normalizedPath === "output" ||
2031
- normalizedPath.startsWith("output/") ||
2032
- normalizedPath === "logs" ||
2033
- normalizedPath.startsWith("logs/") ||
2034
- normalizedPath === "private" ||
2035
- normalizedPath.startsWith("private/");
2036
- }
2037
2712
  async function writeSurfaceFiles(root, files) {
2038
2713
  const fs = await importNodeModule(nodeBuiltin("fs/promises"));
2039
2714
  const path = await importNodeModule(nodeBuiltin("path"));
@@ -2097,7 +2772,7 @@ function encodeSurfaceSnapshotContent(body, utf8Decoder) {
2097
2772
  };
2098
2773
  }
2099
2774
  }
2100
- function normalizeRewardMetrics(value) {
2775
+ function normalizeResultMetrics(value) {
2101
2776
  if (!value || typeof value !== "object" || Array.isArray(value)) {
2102
2777
  return undefined;
2103
2778
  }
@@ -2109,7 +2784,7 @@ function normalizeRewardMetrics(value) {
2109
2784
  }
2110
2785
  return Object.keys(metrics).length > 0 ? metrics : undefined;
2111
2786
  }
2112
- function normalizeRewardCases(value) {
2787
+ function normalizeResultCases(value) {
2113
2788
  if (!Array.isArray(value)) {
2114
2789
  return undefined;
2115
2790
  }
@@ -2122,7 +2797,7 @@ function normalizeRewardCases(value) {
2122
2797
  if (!id) {
2123
2798
  return [];
2124
2799
  }
2125
- const metrics = normalizeRewardMetrics(record.metrics) ?? {};
2800
+ const metrics = normalizeResultMetrics(record.metrics) ?? {};
2126
2801
  const status = record.status === "completed" || record.status === "error"
2127
2802
  ? record.status
2128
2803
  : undefined;
@@ -2146,9 +2821,7 @@ function normalizeRewardCases(value) {
2146
2821
  : undefined;
2147
2822
  const pass = typeof criterionRecord.pass === "boolean"
2148
2823
  ? criterionRecord.pass
2149
- : score !== undefined
2150
- ? score >= 0.5
2151
- : undefined;
2824
+ : undefined;
2152
2825
  if (!criterionId || score === undefined || pass === undefined) {
2153
2826
  return [];
2154
2827
  }
@@ -2261,13 +2934,13 @@ function evaluateSample(args) {
2261
2934
  if (typeof sampleScore !== "number" || !Number.isFinite(sampleScore)) {
2262
2935
  throw new Error("Evaluation sample requires an engine result with a finite numeric score.");
2263
2936
  }
2264
- const cases = args.workload.cases?.length ? args.workload.cases : undefined;
2265
2937
  const metrics = args.workload.metrics ?? {
2266
2938
  score: sampleScore,
2267
2939
  };
2268
2940
  if (metrics.score === undefined) {
2269
2941
  metrics.score = sampleScore;
2270
2942
  }
2943
+ const cases = args.workload.cases?.length ? args.workload.cases : undefined;
2271
2944
  const feedback = {
2272
2945
  ...(args.workload.summary !== undefined
2273
2946
  ? { summary: args.workload.summary }
@@ -2295,7 +2968,7 @@ function evaluateSample(args) {
2295
2968
  feedback,
2296
2969
  };
2297
2970
  }
2298
- function normalizeSampleJobOutput(value, fallbackFiles = []) {
2971
+ function normalizeSampleJobOutput(value) {
2299
2972
  if (!value || typeof value !== "object" || Array.isArray(value)) {
2300
2973
  return null;
2301
2974
  }
@@ -2314,9 +2987,6 @@ function normalizeSampleJobOutput(value, fallbackFiles = []) {
2314
2987
  !Number.isFinite(record.attemptIndex)) {
2315
2988
  return null;
2316
2989
  }
2317
- const sampleFiles = files.length > 0
2318
- ? files
2319
- : fallbackFiles.map((file) => ({ ...file }));
2320
2990
  return {
2321
2991
  subjectId: record.subjectId,
2322
2992
  attemptIndex: record.attemptIndex,
@@ -2324,10 +2994,10 @@ function normalizeSampleJobOutput(value, fallbackFiles = []) {
2324
2994
  fileChanges: Array.isArray(record.fileChanges)
2325
2995
  ? record.fileChanges.filter((entry) => typeof entry === "string")
2326
2996
  : [],
2327
- files: sampleFiles,
2997
+ files,
2328
2998
  traces: Array.isArray(record.traces)
2329
2999
  ? record.traces.filter((entry) => typeof entry === "string")
2330
- : traceFilePaths(sampleFiles),
3000
+ : traceFilePaths(files),
2331
3001
  };
2332
3002
  }
2333
3003
  function normalizeEvaluationSampleOutputs(args) {
@@ -2563,14 +3233,14 @@ function mergeEvaluationSampleRecords(samples) {
2563
3233
  function mergeEvaluationSampleGroup(group) {
2564
3234
  const first = group[0];
2565
3235
  if (group.length === 1) {
2566
- return normalizeSingleCaseDurations(first);
3236
+ return first;
2567
3237
  }
2568
3238
  const startedAt = minTimestamp(group.flatMap((sample) => (sample.startedAt ? [sample.startedAt] : [])));
2569
3239
  const finishedAt = maxTimestamp(group.flatMap((sample) => (sample.finishedAt ? [sample.finishedAt] : [])));
2570
3240
  const durationMs = startedAt && finishedAt
2571
3241
  ? Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt))
2572
3242
  : undefined;
2573
- const cases = group.flatMap((sample) => normalizeCaseDurations(sample));
3243
+ const cases = group.flatMap((sample) => sample.cases ?? []);
2574
3244
  const metrics = aggregateSampleGroupMetrics(group);
2575
3245
  const usage = mergeUsageSummaries(group.map((sample) => sample.usage));
2576
3246
  const errors = group.flatMap((sample) => sample.error ? [sample.error] : []);
@@ -2588,22 +3258,6 @@ function mergeEvaluationSampleGroup(group) {
2588
3258
  ...(cases.length > 0 ? { cases } : {}),
2589
3259
  };
2590
3260
  }
2591
- function normalizeSingleCaseDurations(sample) {
2592
- if (!sample.cases) {
2593
- return sample;
2594
- }
2595
- const cases = normalizeCaseDurations(sample);
2596
- return cases.length === sample.cases.length
2597
- ? { ...sample, cases }
2598
- : sample;
2599
- }
2600
- function normalizeCaseDurations(sample) {
2601
- return (sample.cases ?? []).map((caseResult) => (typeof caseResult.durationMs === "number" ||
2602
- sample.cases?.length !== 1 ||
2603
- typeof sample.durationMs !== "number"
2604
- ? caseResult
2605
- : { ...caseResult, durationMs: sample.durationMs }));
2606
- }
2607
3261
  function aggregateSampleGroupMetrics(group) {
2608
3262
  const metricNames = new Set(group.flatMap((sample) => Object.keys(sample.metrics ?? {})));
2609
3263
  if (metricNames.size === 0) {