@workbench-ai/workbench-core 0.0.46 → 0.0.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (39) hide show
  1. package/dist/execution-events.d.ts +2 -2
  2. package/dist/execution-events.d.ts.map +1 -1
  3. package/dist/execution-events.js +3 -3
  4. package/dist/{execution-phases.d.ts → execution-evidence.d.ts} +8 -7
  5. package/dist/execution-evidence.d.ts.map +1 -0
  6. package/dist/{execution-phases.js → execution-evidence.js} +91 -51
  7. package/dist/execution-graph.js +1 -2
  8. package/dist/execution-jobs.js +1 -1
  9. package/dist/execution-outputs.d.ts.map +1 -1
  10. package/dist/execution-outputs.js +5 -10
  11. package/dist/execution-runtime-types.d.ts +7 -3
  12. package/dist/execution-runtime-types.d.ts.map +1 -1
  13. package/dist/execution-traces.d.ts +11 -1
  14. package/dist/execution-traces.d.ts.map +1 -1
  15. package/dist/execution-traces.js +305 -2
  16. package/dist/generic-spec.d.ts +8 -3
  17. package/dist/generic-spec.d.ts.map +1 -1
  18. package/dist/generic-spec.js +26 -37
  19. package/dist/index.d.ts +22 -11
  20. package/dist/index.d.ts.map +1 -1
  21. package/dist/index.js +888 -218
  22. package/dist/runtime-dockerfile.d.ts +14 -0
  23. package/dist/runtime-dockerfile.d.ts.map +1 -0
  24. package/dist/runtime-dockerfile.js +65 -0
  25. package/dist/sandbox-backends/docker.d.ts.map +1 -1
  26. package/dist/sandbox-backends/docker.js +9 -12
  27. package/dist/sandbox-backends/index.d.ts.map +1 -1
  28. package/dist/sandbox-backends/index.js +2 -1
  29. package/dist/sandbox-inputs.d.ts.map +1 -1
  30. package/dist/sandbox-inputs.js +1 -0
  31. package/dist/sandbox-plane.d.ts +1 -0
  32. package/dist/sandbox-plane.d.ts.map +1 -1
  33. package/dist/sandbox-plane.js +12 -22
  34. package/dist/trace-files.d.ts +2 -2
  35. package/dist/trace-files.d.ts.map +1 -1
  36. package/dist/trace-files.js +4 -4
  37. package/package.json +3 -3
  38. package/worker/sandbox-adapter-runner.cjs +22 -13
  39. package/dist/execution-phases.d.ts.map +0 -1
package/dist/index.js CHANGED
@@ -1,28 +1,30 @@
1
- import { createHash } from "node:crypto";
1
+ import { createHash, randomBytes } from "node:crypto";
2
2
  import os from "node:os";
3
3
  import path from "node:path";
4
+ import { fileURLToPath } from "node:url";
4
5
  import YAML from "yaml";
5
- import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
6
- import { BENCHMARK_SPEC_FILE, engineCaseEnginePrivateFiles, engineCaseFilesForRuntimeInput, engineCaseSubjectVisibleFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchSubjectManifestPath, } from "./generic-spec.js";
6
+ import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, parseWorkbenchAdapterManifest, readWorkbenchAdapterOperationResult, WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV, WORKBENCH_RUNTIME_CONTROL_URL_ENV, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
7
+ import { BENCHMARK_SPEC_FILE, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchSubjectManifestPath, } from "./generic-spec.js";
7
8
  import { attachSandboxMetadataToJob, createWorkbenchSandboxFileStore, isSurfaceSnapshotFile, readWorkbenchExecutionSpec, } from "./sandbox-inputs.js";
8
- import { asRuntimeRecord, importNodeModule, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
9
- import { executeValidatedSandboxExecution, } from "./sandbox-plane.js";
9
+ import { asRuntimeRecord, importNodeModule, isJsonPayload, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
10
+ import { createWorkbenchExecutionCapability, createWorkbenchSandboxAllocation, collectExecutionCapabilityScopeIssues, collectSandboxAllocationScopeIssues, collectSandboxHandleScopeIssues, assertSandboxBackendSupportsNetworkPolicy, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
10
11
  import { createSandboxBackendPlaneForProvider, } from "./sandbox-backends/index.js";
11
12
  import { applyWorkbenchSubjectPatch } from "./subject-patch.js";
12
13
  import { assignUsageRole, completeUsageSummary, mergeUsageSummaries, normalizeUsageSummary, usageStats, } from "./execution-usage.js";
13
- import { traceFilePaths, workbenchTracePhaseDirectory, } from "./trace-files.js";
14
+ import { traceFilePaths, workbenchTraceExecutionDirectory, } from "./trace-files.js";
14
15
  import { engineCaseForCase, } from "./execution-jobs.js";
15
- import { createWorkbenchExecutionEventPublisher, publishCommandPhaseEvent, } from "./execution-events.js";
16
- import { readWorkbenchExecutionPurpose } from "./execution-phases.js";
16
+ import { createWorkbenchExecutionEventPublisher, publishCommandStepEvent, } from "./execution-events.js";
17
+ import { readWorkbenchExecutionPurpose } from "./execution-evidence.js";
17
18
  import { adapterAuthEnv, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
18
- export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCaseEnginePrivateFiles, engineCaseFilesForRuntimeInput, engineCaseSubjectVisibleFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchSubjectManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
19
- export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
19
+ export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchSubjectManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, runtimeNetwork, runtimeResources, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
20
+ export { composeRuntimeDockerfileWithAdapterInstallers, } from "./runtime-dockerfile.js";
21
+ export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
20
22
  export { adapterAuthEnv, createWorkbenchAdapterAuthBundle, defaultWorkbenchAdapterAuthStoreRoot, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, parseWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
21
23
  export { asRuntimeRecord, importNodeModule, nodeBuiltin, normalizeWorkbenchWorkerId, normalizeRuntimeRegistry, quoteShellArg, resolveDockerRuntimeImageRef, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
22
24
  export { assignUsageRole, extractExecutionUsageFromTrace, mergeUsageSummaries, } from "./execution-usage.js";
23
25
  export { createWorkbenchProgressStdoutParser, publishWorkbenchProgressStdoutEnvelope, } from "./execution-events.js";
24
26
  export { resolveSandboxTemplateImage, } from "./sandbox-backends/template-images.js";
25
- export { readOutputTraceFiles, workbenchTracePhaseDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
27
+ export { readOutputTraceFiles, workbenchTraceExecutionDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
26
28
  export { assertWorkbenchAdapterOperationSupport, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterOperationIssues, collectWorkbenchAdapterOperationRequirements, ensureWorkbenchAdapterOutputDir, WORKBENCH_ADAPTER_RESULT_FILE, normalizeWorkbenchAdapterOperationRequest, normalizeWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, writeWorkbenchAdapterOperationResult, } from "@workbench-ai/workbench-protocol";
27
29
  export { applyWorkbenchSubjectPatch, } from "./subject-patch.js";
28
30
  export { createWorkbenchSandboxFileStore, createSandboxAdapterRequest, executionResultFromCompletedSandboxJob, materializeWorkbenchSandboxInput, readWorkbenchExecutionSpec, sanitizeWorkbenchExecutionJobForSandbox, } from "./sandbox-inputs.js";
@@ -31,8 +33,8 @@ export { createBaselineSubjectExecution, createBaselineSubjectJob, createWorkben
31
33
  export { addCapacity, capacityFits, runWorkbenchExecutionDag, subtractCapacity, workbenchJobDependencies, workbenchJobHostCost, workbenchJobResources, } from "./execution-scheduler.js";
32
34
  export { assertWorkbenchExecutionIsolation, collectWorkbenchExecutionIsolationIssues, validateWorkbenchExecutionOutputPayloads, } from "./execution-outputs.js";
33
35
  export { collectSandboxAllocationScopeIssues, collectExecutionCapabilityScopeIssues, collectSandboxHandleScopeIssues, createWorkbenchSandboxAllocation, createWorkbenchSandboxExecutionMetadata, createWorkbenchExecutionCapability, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
34
- export { buildSubjectCasePhaseRefs, buildWorkbenchTracePhases, isWorkbenchPhaseActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-phases.js";
35
- export { finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, } from "./execution-traces.js";
36
+ export { buildSubjectCaseExecutionRefs, buildWorkbenchExecutionEvidence, isWorkbenchExecutionActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-evidence.js";
37
+ export { buildWorkbenchTraceSessionsFromFiles, combineWorkbenchTraceSessions, finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, readWorkbenchExecutionTraceFiles, traceSessionLabel, } from "./execution-traces.js";
36
38
  export { DOCKER_SANDBOX_BACKEND, assertSandboxHostHealthForProvider, createDockerSandboxBackendDescriptor, createDockerSandboxPlane, resolveWorkbenchSandboxProviderName, sandboxProviderAdmissionForResources, sandboxProviderDefaultMaxConcurrentJobs, sandboxProviderLeaseScope, sandboxHostHealthExpectationForProvider, } from "./sandbox-backends/index.js";
37
39
  export const DEFAULT_ENVIRONMENT_VERSIONS = [
38
40
  {
@@ -142,7 +144,7 @@ export const DEFAULT_ENVIRONMENTS = [
142
144
  {
143
145
  id: "env_libreoffice_agent",
144
146
  name: "LibreOffice + Agent",
145
- description: "Agent runtime with soffice and Python libraries for spreadsheet-heavy skill and rubric evaluations.",
147
+ description: "Agent runtime with soffice and Python libraries for spreadsheet-heavy evaluations.",
146
148
  currentVersionId: "envv_libreoffice_agent",
147
149
  builtIn: true,
148
150
  createdAt: "2026-04-29T00:00:00.000Z",
@@ -278,30 +280,36 @@ function adapterProtocolCommandSpec(adapter, operation, manifests = []) {
278
280
  return {
279
281
  use: "command",
280
282
  command: manifest ? workbenchAdapterOperationCommand(manifest, operation) : adapterCommandName(adapter.use),
283
+ executor: manifest ? workbenchAdapterOperationExecutor(manifest, operation) : "sandbox",
281
284
  };
282
285
  }
283
- function protocolPhaseForExecution(execution, manifests) {
284
- const role = executionPurposeRole(execution.purpose);
285
- const operation = execution.purpose === "improve" ? "optimizer.improve" : "subject.run";
286
+ function protocolStepForExecution(execution, manifests) {
287
+ if (execution.purpose !== "improve") {
288
+ throw new Error(`Protocol execution step only supports improve executions, not ${execution.purpose}.`);
289
+ }
290
+ const operation = "optimizer.improve";
286
291
  const command = adapterProtocolCommandSpec(execution.adapter, operation, manifests);
287
292
  return {
288
- kind: role,
293
+ kind: "optimizer",
289
294
  label: execution.purpose,
290
295
  operation,
296
+ executor: command.executor,
291
297
  adapter: execution.adapter,
292
298
  command: command.command,
293
299
  };
294
300
  }
295
- function attemptPhasesForExecution(execution, spec, manifests) {
301
+ function attemptStepsForExecution(execution, spec, manifests) {
296
302
  void spec;
297
- const enginePhase = {
303
+ const command = adapterProtocolCommandSpec(execution.adapter, "engine.run", manifests);
304
+ const engineStep = {
298
305
  kind: "engine",
299
306
  label: "engine",
300
307
  operation: "engine.run",
308
+ executor: command.executor,
301
309
  adapter: execution.adapter,
302
- command: adapterProtocolCommandSpec(execution.adapter, "engine.run", manifests).command,
310
+ command: command.command,
303
311
  };
304
- return [enginePhase];
312
+ return [engineStep];
305
313
  }
306
314
  function adapterConfigRecord(adapter, manifests = []) {
307
315
  const config = cloneJsonRecord(jsonRecord(adapter.with));
@@ -411,7 +419,10 @@ export function materializeWorkbenchRunResult(args) {
411
419
  .sort((left, right) => compareSampleOutputs(left.output, right.output));
412
420
  const outputJobIds = new Set(outputs.flatMap(({ jobs }) => jobs.map((job) => job.id)));
413
421
  const completedSampleKeys = new Set(outputs
414
- .map(({ output }) => evaluationSampleGroupKeyFromOutput(output))
422
+ .flatMap(({ jobs, output }) => [
423
+ evaluationSampleGroupKeyFromOutput(output),
424
+ ...jobs.map(evaluationSampleGroupKeyFromJob),
425
+ ])
415
426
  .filter((key) => key !== null));
416
427
  const errorSampleJobs = [
417
428
  ...subjectJobs.filter((job) => job.status === "failed"),
@@ -422,12 +433,13 @@ export function materializeWorkbenchRunResult(args) {
422
433
  ...outputs.map(({ jobs, output }) => withJobUsage(output.sample, completed, jobs[0])),
423
434
  ...errorSamples,
424
435
  ].sort((left, right) => left.index - right.index || left.id.localeCompare(right.id));
425
- const evalRecord = createEvaluationRecord(subjectId, samples);
436
+ const subjectName = normalizedSubjectDisplayName(args.spec.subject.name);
437
+ const evalRecord = createEvaluationRecord(subjectId, subjectName, samples);
426
438
  const usage = mergeUsageSummaries([
427
439
  subjectRevision.usage,
428
440
  ...samples.map((sample) => sample.usage),
429
441
  ]);
430
- const metrics = evaluationMeanMetrics(createEvaluationRecord(subjectId, samples));
442
+ const metrics = evaluationMeanMetrics(evalRecord);
431
443
  const attemptIndex = subjectRevision.attemptIndex;
432
444
  const evaluationTraces = [
433
445
  ...outputs.flatMap(({ output }) => output.traces),
@@ -457,6 +469,7 @@ export function materializeWorkbenchRunResult(args) {
457
469
  }
458
470
  const record = {
459
471
  id: subjectId,
472
+ ...(subjectName ? { name: subjectName } : {}),
460
473
  ordinal: args.existingSubjectCount + subjects.length,
461
474
  benchmarkFingerprint: args.benchmarkFingerprint,
462
475
  subjectFingerprint: args.subjectFingerprint ?? materializedSubjectFingerprint(args.spec, subjectRevision.files),
@@ -472,7 +485,7 @@ export function materializeWorkbenchRunResult(args) {
472
485
  meta,
473
486
  };
474
487
  subjects.push(record);
475
- evaluations.push(createEvaluationResultRecord({
488
+ evaluations.push(createEvaluationScorecard({
476
489
  runId: args.runId,
477
490
  benchmarkFingerprint: args.benchmarkFingerprint,
478
491
  createdAt: args.startedAt,
@@ -528,6 +541,8 @@ function materializedSubjectFingerprint(spec, files) {
528
541
  hash.update("workbench-subject-v1\0");
529
542
  hash.update("materialized\0runner\0");
530
543
  hash.update(JSON.stringify(spec.run));
544
+ hash.update("prepare");
545
+ hash.update(JSON.stringify(spec.subject.prepare ?? null));
531
546
  for (const file of filterSubjectSourceFiles(files).slice().sort((left, right) => left.path.localeCompare(right.path))) {
532
547
  hash.update("\0file\0");
533
548
  hash.update(file.path);
@@ -547,14 +562,15 @@ function materializedSubjectFiles(args) {
547
562
  }
548
563
  return [...byPath.values()].sort((left, right) => left.path.localeCompare(right.path));
549
564
  }
550
- function createEvaluationResultRecord(args) {
565
+ function createEvaluationScorecard(args) {
551
566
  const evaluation = args.evaluation;
552
567
  return {
553
- id: evaluationResultId(args.runId, args.subject.id),
568
+ id: evaluationScorecardId(args.runId, args.subject.id),
554
569
  runId: args.runId,
555
570
  benchmarkFingerprint: args.benchmarkFingerprint,
556
571
  subjectFingerprint: args.subject.subjectFingerprint,
557
572
  subjectId: args.subject.id,
573
+ ...(args.subject.name ? { subjectName: args.subject.name } : {}),
558
574
  createdAt: args.createdAt,
559
575
  updatedAt: evaluation.finishedAt ?? args.createdAt,
560
576
  status: evaluation.status,
@@ -568,7 +584,7 @@ function createEvaluationResultRecord(args) {
568
584
  evaluation,
569
585
  };
570
586
  }
571
- function evaluationResultId(runId, subjectId) {
587
+ export function evaluationScorecardId(runId, subjectId) {
572
588
  const runPart = runId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
573
589
  const subjectPart = subjectId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
574
590
  return `eval_${runPart}_${subjectPart}`;
@@ -584,7 +600,7 @@ export function isWorkbenchInternalOutputPath(filePath) {
584
600
  normalized === "sandbox-environment.json" ||
585
601
  normalized === "sandbox_error.log" ||
586
602
  normalized === "exit_code" ||
587
- /^[a-z-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
603
+ /^[a-z_-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
588
604
  }
589
605
  export function createSubjectRevisionTraceInputFiles(args) {
590
606
  const files = [];
@@ -620,6 +636,23 @@ export function createSubjectRevisionTraceInputFiles(args) {
620
636
  }, null, 2)}\n`));
621
637
  return dedupeSurfaceFiles(files);
622
638
  }
639
+ export function createSubjectEvaluationTraceInputFiles(args) {
640
+ const subject = args.subject;
641
+ if (!subject?.eval && !subject?.metrics) {
642
+ return [];
643
+ }
644
+ const filePath = normalizeRelativePath(args.path ?? `base-subject/${subject.id}/evaluation.json`);
645
+ const payload = {
646
+ kind: "subject_evaluation",
647
+ subjectId: subject.id,
648
+ status: subject.status,
649
+ metrics: subject.metrics ?? null,
650
+ fileChanges: subject.fileChanges,
651
+ eval: subject.eval ?? null,
652
+ prompt: subject.prompt ?? null,
653
+ };
654
+ return [textSurfaceFile(filePath, `${JSON.stringify(payload, null, 2)}\n`)];
655
+ }
623
656
  function isTerminalExecutionJob(job) {
624
657
  return job.kind === "execute" && (job.status === "succeeded" ||
625
658
  job.status === "failed" ||
@@ -866,16 +899,14 @@ export function createSubjectFilePreview(args) {
866
899
  };
867
900
  }
868
901
  export function createCaseReview(args) {
869
- const preferredSampleIndex = uniquePhaseSampleIndex(args.phases ?? []);
870
- const sampleMatchesCase = (sample) => sample.id === args.caseId ||
871
- sample.id.startsWith(`${args.caseId}__`) ||
872
- (sample.cases ?? []).some((entry) => entry.id === args.caseId || entry.id.startsWith(`${args.caseId}__`));
902
+ const preferredSampleIndex = uniqueExecutionSampleIndex(args.executions ?? []);
903
+ const sampleMatchesCase = (sample) => (sample.cases ?? []).some((entry) => entry.id === args.caseId);
873
904
  const samples = args.subject.eval?.samples ?? [];
874
905
  const sampleResult = samples.find((sample) => typeof preferredSampleIndex === "number" &&
875
906
  sample.index === preferredSampleIndex &&
876
907
  sampleMatchesCase(sample)) ?? samples.find(sampleMatchesCase);
877
- const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId || entry.id.startsWith(`${args.caseId}__`));
878
- if (!sampleResult && (args.phases?.length ?? 0) > 0) {
908
+ const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId);
909
+ if (!sampleResult && (args.executions?.length ?? 0) > 0) {
879
910
  return {
880
911
  subjectId: args.subject.id,
881
912
  caseId: args.caseId,
@@ -884,7 +915,7 @@ export function createCaseReview(args) {
884
915
  ? { sampleIndex: preferredSampleIndex }
885
916
  : {}),
886
917
  metrics: {},
887
- phases: args.phases ?? [],
918
+ executions: args.executions ?? [],
888
919
  criteria_results: [],
889
920
  };
890
921
  }
@@ -893,28 +924,21 @@ export function createCaseReview(args) {
893
924
  }
894
925
  const durationMs = typeof caseResult?.durationMs === "number"
895
926
  ? caseResult.durationMs
896
- : sampleResult?.cases?.length === 1 &&
897
- typeof sampleResult.durationMs === "number"
898
- ? sampleResult.durationMs
899
- : !caseResult && typeof sampleResult.durationMs === "number"
900
- ? sampleResult.durationMs
901
- : undefined;
902
- const sampleStatus = sampleResult.status === "planned" ? undefined : sampleResult.status;
903
- const status = caseResult?.status ?? sampleStatus;
927
+ : undefined;
904
928
  return {
905
929
  subjectId: args.subject.id,
906
- caseId: caseResult?.id ?? sampleResult.id,
930
+ caseId: caseResult?.id ?? args.caseId,
907
931
  caseLabel: caseResult?.label ?? args.caseId,
908
932
  sampleId: sampleResult.id,
909
933
  sampleIndex: sampleResult.index,
910
- ...(status ? { status } : {}),
911
- metrics: caseResult?.metrics ?? sampleResult.metrics ?? {},
934
+ ...(caseResult?.status ? { status: caseResult.status } : {}),
935
+ metrics: caseResult?.metrics ?? {},
912
936
  ...(typeof durationMs === "number" ? { durationMs } : {}),
913
937
  ...(caseResult?.source ? { source: caseResult.source } : {}),
914
- ...((caseResult?.feedback ?? sampleResult.feedback) !== undefined
915
- ? { feedback: caseResult?.feedback ?? sampleResult.feedback }
938
+ ...(caseResult?.feedback !== undefined
939
+ ? { feedback: caseResult.feedback }
916
940
  : {}),
917
- phases: args.phases ?? [],
941
+ executions: args.executions ?? [],
918
942
  criteria_results: (caseResult?.criteria ?? []).map((criterion) => ({
919
943
  criterion_id: criterion.criterion_id,
920
944
  pass: criterion.pass,
@@ -924,9 +948,9 @@ export function createCaseReview(args) {
924
948
  })),
925
949
  };
926
950
  }
927
- function uniquePhaseSampleIndex(phases) {
928
- const sampleIndices = new Set(phases
929
- .map((phase) => phase.sampleIndex)
951
+ function uniqueExecutionSampleIndex(executions) {
952
+ const sampleIndices = new Set(executions
953
+ .map((execution) => execution.sampleIndex)
930
954
  .filter((index) => typeof index === "number"));
931
955
  if (sampleIndices.size !== 1) {
932
956
  return null;
@@ -951,6 +975,7 @@ function parseAuthoredWorkbenchSourceSpec(source) {
951
975
  name: resolved.subject.name,
952
976
  description: resolved.subject.description,
953
977
  files: { path: resolved.subject.files.path },
978
+ ...(resolved.subject.prepare ? { prepare: { ...resolved.subject.prepare } } : {}),
954
979
  run: runSpecFromInvocation(resolved.run),
955
980
  },
956
981
  ...(resolved.optimizer
@@ -1101,11 +1126,18 @@ export async function executeWorkbenchExecutionJob(args, options) {
1101
1126
  const runtimeArgs = adapterAuthProfiles.length > 0
1102
1127
  ? { ...args, adapterAuthProfiles }
1103
1128
  : args;
1104
- const executionForSandbox = readWorkbenchExecutionSpec(runtimeArgs.job);
1129
+ const executionForRuntime = readWorkbenchExecutionSpec(runtimeArgs.job);
1130
+ const executor = workbenchExecutionExecutorForRuntimeInput(runtimeArgs);
1131
+ if (executor === "host") {
1132
+ return await withWorkbenchRuntimeControlServer(runtimeArgs, options, startedAt, async (adapterRuntimeEnv) => executeAdapterInCurrentRuntime({
1133
+ ...runtimeArgs,
1134
+ adapterRuntimeEnv,
1135
+ }, executionForRuntime, startedAt, createWorkbenchExecutionCapability(executionForRuntime, { now: startedAt })));
1136
+ }
1105
1137
  const fileStore = createWorkbenchSandboxFileStore(runtimeArgs);
1106
1138
  const planeFactory = options.createSandboxPlaneForProvider ?? createSandboxBackendPlaneForProvider;
1107
1139
  const plane = planeFactory(options.sandboxProvider, runtimeArgs, startedAt, fileStore);
1108
- const validated = await executeValidatedSandboxExecution(plane, executionForSandbox, {
1140
+ const validated = await executeValidatedSandboxExecution(plane, executionForRuntime, {
1109
1141
  now: startedAt,
1110
1142
  runnerId: resolveWorkbenchWorkerId([
1111
1143
  process.env.WORKBENCH_WORKER_ID,
@@ -1121,6 +1153,215 @@ export async function executeWorkbenchExecutionJob(args, options) {
1121
1153
  return failWorkbenchRunJob(args.job, startedAt, error);
1122
1154
  }
1123
1155
  }
1156
+ export function workbenchExecutionExecutorForRuntimeInput(args) {
1157
+ if (args.runtimeControlOperation) {
1158
+ return "sandbox";
1159
+ }
1160
+ const execution = readWorkbenchExecutionSpec(args.job);
1161
+ const operation = adapterOperationForExecutionPurpose(execution.purpose);
1162
+ if (!operation) {
1163
+ return "sandbox";
1164
+ }
1165
+ const manifest = args.adapterManifests?.find((entry) => entry.id === execution.adapter.use);
1166
+ return manifest ? workbenchAdapterOperationExecutor(manifest, operation) : "sandbox";
1167
+ }
1168
+ function adapterOperationForExecutionPurpose(purpose) {
1169
+ if (purpose === "improve") {
1170
+ return "optimizer.improve";
1171
+ }
1172
+ if (purpose === "attempt") {
1173
+ return "engine.run";
1174
+ }
1175
+ return null;
1176
+ }
1177
+ const RUNTIME_CONTROL_MAX_BODY_BYTES = 512 * 1024 * 1024;
1178
+ async function withWorkbenchRuntimeControlServer(args, options, startedAt, run) {
1179
+ const [{ createServer }] = await Promise.all([
1180
+ importNodeModule(nodeBuiltin("http")),
1181
+ ]);
1182
+ const token = randomBytes(24).toString("base64url");
1183
+ const server = createServer((request, response) => {
1184
+ void handleWorkbenchRuntimeControlHttpRequest({
1185
+ request,
1186
+ response,
1187
+ token,
1188
+ args,
1189
+ options,
1190
+ startedAt,
1191
+ });
1192
+ });
1193
+ const url = await new Promise((resolve, reject) => {
1194
+ server.once("error", reject);
1195
+ server.listen(0, "127.0.0.1", () => {
1196
+ server.off("error", reject);
1197
+ const address = server.address();
1198
+ if (!address || typeof address === "string") {
1199
+ reject(new Error("Workbench runtime-control server did not expose a local TCP address."));
1200
+ return;
1201
+ }
1202
+ resolve(`http://127.0.0.1:${address.port}`);
1203
+ });
1204
+ });
1205
+ try {
1206
+ return await run({
1207
+ [WORKBENCH_RUNTIME_CONTROL_URL_ENV]: url,
1208
+ [WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV]: token,
1209
+ });
1210
+ }
1211
+ finally {
1212
+ await new Promise((resolve) => server.close(() => resolve()));
1213
+ }
1214
+ }
1215
+ async function handleWorkbenchRuntimeControlHttpRequest(args) {
1216
+ const { request, response } = args;
1217
+ try {
1218
+ if (request.method !== "POST" || request.url !== "/v1/operation-sequence") {
1219
+ writeRuntimeControlJson(response, 404, { error: "Unknown Workbench runtime-control endpoint." });
1220
+ return;
1221
+ }
1222
+ if (request.headers.authorization !== `Bearer ${args.token}`) {
1223
+ writeRuntimeControlJson(response, 401, { error: "Workbench runtime-control token is invalid." });
1224
+ return;
1225
+ }
1226
+ const parsed = JSON.parse(await readRuntimeControlBody(request));
1227
+ const controlRequest = normalizeRuntimeControlOperationSequenceRequest(parsed);
1228
+ const result = await executeRuntimeControlOperationSequenceInSandbox(args.args, args.options, args.startedAt, controlRequest);
1229
+ writeRuntimeControlJson(response, 200, result);
1230
+ }
1231
+ catch (error) {
1232
+ writeRuntimeControlJson(response, 500, {
1233
+ error: error instanceof Error ? error.stack ?? error.message : String(error),
1234
+ });
1235
+ }
1236
+ }
1237
+ function writeRuntimeControlJson(response, statusCode, payload) {
1238
+ response.statusCode = statusCode;
1239
+ response.setHeader("content-type", "application/json");
1240
+ response.end(`${JSON.stringify(payload, null, 2)}\n`);
1241
+ }
1242
+ function readRuntimeControlBody(request) {
1243
+ return new Promise((resolve, reject) => {
1244
+ const chunks = [];
1245
+ let size = 0;
1246
+ request.on("data", (chunk) => {
1247
+ size += chunk.length;
1248
+ if (size > RUNTIME_CONTROL_MAX_BODY_BYTES) {
1249
+ reject(new Error("Workbench runtime-control request body is too large."));
1250
+ request.destroy();
1251
+ return;
1252
+ }
1253
+ chunks.push(chunk);
1254
+ });
1255
+ request.on("error", reject);
1256
+ request.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
1257
+ });
1258
+ }
1259
+ function normalizeRuntimeControlOperationSequenceRequest(value) {
1260
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
1261
+ throw new Error("Workbench runtime-control operation sequence request must be an object.");
1262
+ }
1263
+ const record = value;
1264
+ if (!Array.isArray(record.operations) || record.operations.length === 0) {
1265
+ throw new Error("Workbench runtime-control operation sequence requires at least one operation.");
1266
+ }
1267
+ const inputs = normalizeRuntimeControlInputs(record.inputs);
1268
+ return {
1269
+ ...(inputs ? { inputs } : {}),
1270
+ operations: record.operations.map((entry, index) => normalizeRuntimeControlOperation(entry, `operations[${index}]`)),
1271
+ ...(typeof record.prepare === "boolean" ? { prepare: record.prepare } : {}),
1272
+ ...(typeof record.collectWorkspace === "boolean" ? { collectWorkspace: record.collectWorkspace } : {}),
1273
+ };
1274
+ }
1275
+ function normalizeRuntimeControlInputs(value) {
1276
+ if (value === undefined) {
1277
+ return undefined;
1278
+ }
1279
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
1280
+ throw new Error("Workbench runtime-control inputs must be an object.");
1281
+ }
1282
+ const record = value;
1283
+ const inputs = {};
1284
+ if (hasOwn(record, "subject")) {
1285
+ inputs.subject = normalizeRuntimeControlFiles(record.subject, "inputs.subject");
1286
+ }
1287
+ if (hasOwn(record, "case")) {
1288
+ inputs.case = normalizeRuntimeControlFiles(record.case, "inputs.case");
1289
+ }
1290
+ if (hasOwn(record, "enginePrivate")) {
1291
+ inputs.enginePrivate = normalizeRuntimeControlFiles(record.enginePrivate, "inputs.enginePrivate");
1292
+ }
1293
+ if (hasOwn(record, "traces")) {
1294
+ inputs.traces = normalizeRuntimeControlFiles(record.traces, "inputs.traces");
1295
+ }
1296
+ if (hasOwn(record, "workspace")) {
1297
+ inputs.workspace = normalizeRuntimeControlFiles(record.workspace, "inputs.workspace");
1298
+ }
1299
+ if (hasOwn(record, "output")) {
1300
+ inputs.output = normalizeRuntimeControlFiles(record.output, "inputs.output");
1301
+ }
1302
+ return inputs;
1303
+ }
1304
+ function normalizeRuntimeControlFiles(value, label) {
1305
+ if (value === undefined) {
1306
+ return [];
1307
+ }
1308
+ if (!Array.isArray(value)) {
1309
+ throw new Error(`Workbench runtime-control ${label} must be an array.`);
1310
+ }
1311
+ return value.map((entry, index) => {
1312
+ if (!isSurfaceSnapshotFile(entry)) {
1313
+ throw new Error(`Workbench runtime-control ${label}[${index}] must be a surface snapshot file.`);
1314
+ }
1315
+ return { ...entry, path: normalizeRelativePath(entry.path) };
1316
+ });
1317
+ }
1318
+ function hasOwn(value, key) {
1319
+ return Object.prototype.hasOwnProperty.call(value, key);
1320
+ }
1321
+ function normalizeRuntimeControlOperation(value, label) {
1322
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
1323
+ throw new Error(`Workbench runtime-control ${label} must be an object.`);
1324
+ }
1325
+ const record = value;
1326
+ const operation = record.operation;
1327
+ if (operation !== "engine.resolve" &&
1328
+ operation !== "engine.run" &&
1329
+ operation !== "subject.run" &&
1330
+ operation !== "optimizer.improve") {
1331
+ throw new Error(`Workbench runtime-control ${label}.operation is invalid.`);
1332
+ }
1333
+ const invocation = record.invocation;
1334
+ if (!invocation || typeof invocation !== "object" || Array.isArray(invocation)) {
1335
+ throw new Error(`Workbench runtime-control ${label}.invocation must be an object.`);
1336
+ }
1337
+ const invocationRecord = invocation;
1338
+ if (typeof invocationRecord.use !== "string" || invocationRecord.use.length === 0) {
1339
+ throw new Error(`Workbench runtime-control ${label}.invocation.use is required.`);
1340
+ }
1341
+ const withConfig = invocationRecord.with === undefined
1342
+ ? {}
1343
+ : isJsonPayload(invocationRecord.with)
1344
+ ? invocationRecord.with
1345
+ : null;
1346
+ if (withConfig === null) {
1347
+ throw new Error(`Workbench runtime-control ${label}.invocation.with must be JSON.`);
1348
+ }
1349
+ if (invocationRecord.auth !== undefined && !isJsonPayload(invocationRecord.auth)) {
1350
+ throw new Error(`Workbench runtime-control ${label}.invocation.auth must be JSON.`);
1351
+ }
1352
+ return {
1353
+ operation,
1354
+ invocation: {
1355
+ use: invocationRecord.use,
1356
+ with: withConfig,
1357
+ ...(invocationRecord.auth !== undefined ? { auth: invocationRecord.auth } : {}),
1358
+ ...(typeof invocationRecord.command === "string" && invocationRecord.command.trim()
1359
+ ? { command: invocationRecord.command }
1360
+ : {}),
1361
+ },
1362
+ ...(typeof record.label === "string" && record.label.trim() ? { label: record.label } : {}),
1363
+ };
1364
+ }
1124
1365
  async function explicitAdapterAuthProfilesForExecution(execution, args, loadLocalAdapterProfiles) {
1125
1366
  const required = requiredAdapterAuthTargetsForExecution(execution, args);
1126
1367
  if (required.length === 0) {
@@ -1155,7 +1396,7 @@ function adapterAuthTargetKey(target) {
1155
1396
  export function workbenchExecutionPurpose(job) {
1156
1397
  return readWorkbenchExecutionPurpose(job);
1157
1398
  }
1158
- export async function executeAdapterInCurrentSandboxRuntime(args, execution, startedAt, capability) {
1399
+ export async function executeAdapterInCurrentRuntime(args, execution, startedAt, capability) {
1159
1400
  const eventPublisher = createWorkbenchExecutionEventPublisher({
1160
1401
  projectId: args.job.projectId,
1161
1402
  runId: args.job.runId,
@@ -1174,10 +1415,10 @@ export async function executeAdapterInCurrentSandboxRuntime(args, execution, sta
1174
1415
  };
1175
1416
  try {
1176
1417
  if (execution.purpose === "improve") {
1177
- return await executeSubjectRevisionExecutionInSandbox(runtimeInput, execution, startedAt, capability, eventPublisher);
1418
+ return await executeSubjectRevisionExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
1178
1419
  }
1179
1420
  if (execution.purpose === "attempt") {
1180
- return await executeAttemptExecutionInSandbox(runtimeInput, execution, startedAt, capability, eventPublisher);
1421
+ return await executeAttemptExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
1181
1422
  }
1182
1423
  throw new Error(`Unsupported execution purpose ${execution.purpose}.`);
1183
1424
  }
@@ -1274,7 +1515,7 @@ function adapterAuthRequest(bundles, root, currentAdapterId) {
1274
1515
  }
1275
1516
  return entries;
1276
1517
  }
1277
- function adapterAuthRequestForPhase(args, adapterId) {
1518
+ function adapterAuthRequestForStep(args, adapterId) {
1278
1519
  const profiles = (args.adapterAuthProfiles ?? [])
1279
1520
  .map((bundle) => sanitizeWorkbenchAdapterAuthBundle(bundle));
1280
1521
  if (profiles.length === 0) {
@@ -1295,12 +1536,19 @@ function adapterAuthProfilesForExecution(execution, args) {
1295
1536
  }
1296
1537
  function requiredAdapterAuthTargetsForExecution(execution, args) {
1297
1538
  const manifests = args.adapterManifests ?? [];
1298
- return collectWorkbenchAdapterAuthRequirements(adapterInvocationsForExecution(execution, args.spec), manifests)
1539
+ return collectWorkbenchAdapterAuthRequirements(adapterInvocationsForExecution(execution, args), manifests)
1299
1540
  .map((target) => normalizeWorkbenchAdapterAuthTarget(target));
1300
1541
  }
1301
- function adapterInvocationsForExecution(execution, spec) {
1542
+ function adapterInvocationsForExecution(execution, args) {
1543
+ if (args.runtimeControlOperation) {
1544
+ return uniqueAdapterInvocations(args.runtimeControlOperation.operations.map((operation) => ({
1545
+ use: operation.invocation.use,
1546
+ with: operation.invocation.with ?? {},
1547
+ ...(operation.invocation.auth !== undefined ? { auth: operation.invocation.auth } : {}),
1548
+ })));
1549
+ }
1302
1550
  if (execution.purpose === "attempt") {
1303
- return uniqueAdapterInvocations([execution.adapter, spec.run]);
1551
+ return uniqueAdapterInvocations([execution.adapter, args.spec.run]);
1304
1552
  }
1305
1553
  return [execution.adapter];
1306
1554
  }
@@ -1341,7 +1589,7 @@ function completedJobFromSandboxResult(fallbackJob, startedAt, result) {
1341
1589
  }
1342
1590
  return attachSandboxMetadataToJob(failWorkbenchRunJob(fallbackJob, result.startedAt || startedAt, result.error ?? `Sandbox execution ${result.status}.`, result.finishedAt), asRuntimeRecord(result.metadata).sandbox);
1343
1591
  }
1344
- async function executeSubjectRevisionExecutionInSandbox(args, execution, startedAt, capability, eventPublisher) {
1592
+ async function executeSubjectRevisionExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
1345
1593
  const { workload, result } = await runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher);
1346
1594
  if (result.error || (result.exitCode ?? 0) !== 0) {
1347
1595
  return failWorkbenchRunJob(args.job, startedAt, result.error ?? `Adapter ${execution.adapter.use} exited with status ${result.exitCode}.`, result.finishedAt, result);
@@ -1382,7 +1630,7 @@ async function executeSubjectRevisionExecutionInSandbox(args, execution, started
1382
1630
  },
1383
1631
  };
1384
1632
  }
1385
- async function executeAttemptExecutionInSandbox(args, execution, startedAt, capability, eventPublisher) {
1633
+ async function executeAttemptExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
1386
1634
  const workload = createWorkbenchRunWorkload({
1387
1635
  job: args.job,
1388
1636
  spec: args.spec,
@@ -1391,7 +1639,7 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
1391
1639
  engineCases: args.engineCases,
1392
1640
  traceFiles: args.traceFiles,
1393
1641
  });
1394
- const workloadResult = await runHostedCommandExecutionPhases(args, workload, attemptPhasesForExecution(execution, args.spec, args.adapterManifests), startedAt, {
1642
+ const workloadResult = await runHostedCommandExecutionSteps(args, workload, attemptStepsForExecution(execution, args.spec, args.adapterManifests), startedAt, {
1395
1643
  capability,
1396
1644
  eventPublisher,
1397
1645
  });
@@ -1405,10 +1653,7 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
1405
1653
  return failWorkbenchRunJob(args.job, startedAt, "Attempt engine must return a workbench-result result with a finite numeric score.", workloadResult.finishedAt, workloadResult);
1406
1654
  }
1407
1655
  const finishedAt = workloadResult.finishedAt ?? new Date().toISOString();
1408
- const usage = mergeUsageSummaries([
1409
- workloadResult.usage,
1410
- engineResult.usage,
1411
- ]);
1656
+ const usage = attemptUsageSummary(workloadResult.usage, engineResult.usage);
1412
1657
  const sample = evaluateSample({
1413
1658
  subjectId: workload.subjectId,
1414
1659
  files: workloadResult.files,
@@ -1453,6 +1698,282 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
1453
1698
  },
1454
1699
  };
1455
1700
  }
1701
/**
 * Runs the runtime-control operation sequence carried on `args` in the current
 * runtime and returns the job annotated with the outcome.
 *
 * @param args Sandbox request; must carry `runtimeControlOperation`.
 * @param execution Not used by this function (explicitly voided).
 * @param startedAt Start timestamp copied onto the returned job.
 * @param capability Not used by this function (explicitly voided).
 * @returns A copy of `args.job` with `status`, `attempt`, timestamps, an
 *   optional `error`, and the normalized runtime-control `output`.
 * @throws Error when `args.runtimeControlOperation` is missing.
 */
export async function executeRuntimeControlOperationSequenceInCurrentRuntime(args, execution, startedAt, capability) {
    void execution;
    void capability;
    const sequence = args.runtimeControlOperation;
    if (!sequence) {
        throw new Error("Runtime-control operation sequence is missing from the sandbox request.");
    }
    const childExecution = readWorkbenchExecutionSpec(args.job);
    const workload = createWorkbenchRunWorkload({
        job: args.job,
        spec: args.spec,
        baseFiles: args.baseFiles,
        engineResolveFiles: args.engineResolveFiles,
        engineCases: args.engineCases,
        traceFiles: args.traceFiles,
    });
    // Work on a copy of the request that no longer carries adapterRuntimeEnv.
    const runtimeArgs = { ...args };
    delete runtimeArgs.adapterRuntimeEnv;
    const adapterAuth = await materializeSandboxAdapterAuth(runtimeArgs, childExecution);
    let result;
    try {
        // Everything below stays inside the try so the auth cleanup in
        // `finally` runs even when building the steps throws.
        const stepArgs = { ...runtimeArgs };
        if (adapterAuth.root) {
            stepArgs.adapterAuthRoot = adapterAuth.root;
        }
        if (Object.keys(adapterAuth.env).length > 0) {
            stepArgs.adapterAuthEnv = adapterAuth.env;
        }
        const steps = sequence.operations.map((operation, index) => runtimeControlStepForOperation(operation, index, args.adapterManifests));
        const stepOptions = {
            runSubjectPrepare: sequence.prepare ?? false,
            workspaceFiles: sequence.inputs?.workspace ?? [],
            outputFiles: sequence.inputs?.output ?? [],
            collectWorkspace: sequence.collectWorkspace ?? false,
        };
        result = await runHostedCommandExecutionSteps(stepArgs, workload, steps, startedAt, stepOptions);
    }
    finally {
        if (adapterAuth.cleanup) {
            // Best effort: a cleanup failure is deliberately ignored.
            await adapterAuth.cleanup().catch(() => undefined);
        }
    }
    const finishedAt = result.finishedAt ?? new Date().toISOString();
    // A reported error or a non-zero exit code both count as failure.
    const failed = Boolean(result.error) || (result.exitCode ?? 0) !== 0;
    const finishedJob = {
        ...args.job,
        status: failed ? "failed" : "succeeded",
        attempt: Math.max(1, args.job.attempt),
        startedAt,
        finishedAt,
        updatedAt: finishedAt,
    };
    if (failed) {
        finishedJob.error = result.error ?? `Runtime-control operation sequence exited with status ${result.exitCode}.`;
    }
    finishedJob.output = runtimeControlJobOutput(result, !failed);
    return finishedJob;
}
1752
/**
 * Runs a runtime-control operation sequence in a freshly created child sandbox
 * and converts the sandbox result into a runtime-control result.
 *
 * Flow: derive the child request, build the sandbox plane, materialize inputs,
 * prepare the environment (when the plane supports it), validate allocation and
 * capability scope, create the sandbox, validate its handle, execute, and
 * always destroy the sandbox afterwards.
 *
 * @param args Parent sandbox request.
 * @param options Carries `sandboxProvider` and an optional
 *   `createSandboxPlaneForProvider` factory override.
 * @param startedAt Timestamp used as "now" for allocation, capability and plane calls.
 * @param request Runtime-control request describing the operations to run.
 * @returns The normalized runtime-control result of the completed child job.
 * @throws Error when any scope validation reports issues, or when the plane fails.
 */
async function executeRuntimeControlOperationSequenceInSandbox(args, options, startedAt, request) {
    const childArgs = createRuntimeControlSandboxInput(args, request);
    const execution = readWorkbenchExecutionSpec(childArgs.job);
    const fileStore = createWorkbenchSandboxFileStore(childArgs);
    const planeFactory = options.createSandboxPlaneForProvider ?? createSandboxBackendPlaneForProvider;
    const plane = planeFactory(options.sandboxProvider, childArgs, startedAt, fileStore);
    assertSandboxBackendSupportsNetworkPolicy(plane.backend, execution);
    const sandboxOptions = {
        now: startedAt,
        runnerId: resolveWorkbenchWorkerId([
            process.env.WORKBENCH_WORKER_ID,
            process.env.EC2_INSTANCE_ID,
            os.hostname(),
            process.env.HOSTNAME,
        ], "local-runner"),
        fileStore,
    };
    const inputs = await fileStore.materializeInputs(execution);
    // Planes without prepareEnvironment get a descriptor built from the execution spec.
    const environment = plane.prepareEnvironment
        ? await plane.prepareEnvironment(execution, sandboxOptions)
        : {
            backend: plane.backend.name,
            kind: execution.sandbox.kind,
            ref: execution.sandbox.ref,
        };
    const allocation = createWorkbenchSandboxAllocation(execution, {
        backend: plane.backend.name,
        runnerId: sandboxOptions.runnerId,
        now: startedAt,
    });
    const capability = createWorkbenchExecutionCapability(execution, { now: startedAt });
    assertRuntimeControlScope("Runtime-control sandbox allocation", collectSandboxAllocationScopeIssues(allocation, execution, { now: startedAt }));
    assertRuntimeControlScope("Runtime-control execution capability", collectExecutionCapabilityScopeIssues(capability, execution, { now: startedAt }));
    const sandbox = await plane.createSandbox({
        execution,
        environment,
        allocation,
        capability,
        inputs,
    }, sandboxOptions);
    let result;
    try {
        // The handle check lives inside the try: once createSandbox has
        // returned, a rejected handle must still be torn down by the
        // `finally` below instead of leaking the sandbox.
        assertRuntimeControlScope("Runtime-control sandbox handle", collectSandboxHandleScopeIssues(sandbox, allocation, execution));
        result = await plane.exec({
            execution,
            environment,
            sandbox,
            allocation,
            capability,
            inputs,
        }, sandboxOptions);
    }
    finally {
        await plane.destroySandbox(sandbox, sandboxOptions);
    }
    const completedJob = completedJobFromSandboxResult(childArgs.job, startedAt, result);
    return runtimeControlResultFromCompletedJob(completedJob);
}
1810
/**
 * Derives the sandbox request for a child runtime-control execution from the
 * parent request.
 *
 * The child gets fresh job and execution ids (parent id + ":runtime:" + nonce),
 * no declared outputs, the last requested operation's invocation as its adapter
 * (falling back to the parent's adapter), and file sets taken from
 * `request.inputs` where present or from the parent workload otherwise.
 * `adapterRuntimeEnv` and `workspaceRoot` are removed from the child request.
 *
 * @param args Parent sandbox request.
 * @param request Runtime-control request (`operations`, optional `inputs`).
 * @returns The child sandbox request, with `runtimeControlOperation` set to `request`.
 */
function createRuntimeControlSandboxInput(args, request) {
    const parentExecution = readWorkbenchExecutionSpec(args.job);
    const parentWorkload = createWorkbenchRunWorkload({
        job: args.job,
        spec: args.spec,
        baseFiles: args.baseFiles,
        engineResolveFiles: args.engineResolveFiles,
        engineCases: args.engineCases,
        traceFiles: args.traceFiles,
    });
    const runtimeSuffix = `:runtime:${runtimeControlNonce()}`;
    const parentInput = asRuntimeRecord(args.job.input);
    const { caseId } = parentWorkload;

    // Explicit request inputs win; otherwise inherit from the parent workload.
    const parentCase = parentWorkload.engineCase;
    const publicFiles = runtimeControlInputFiles(request.inputs, "case", parentCase ? engineCasePublicFiles(parentCase) : []);
    const privateFiles = runtimeControlInputFiles(request.inputs, "enginePrivate", parentCase ? engineCasePrivateFiles(parentCase) : []);
    const subjectFiles = runtimeControlInputFiles(request.inputs, "subject", parentWorkload.subjectFiles);
    const traceFiles = runtimeControlInputFiles(request.inputs, "traces", parentWorkload.traceFiles);

    // The final operation's invocation stands in as the child execution's adapter.
    const lastInvocation = request.operations[request.operations.length - 1]?.invocation;
    let childAdapter = parentExecution.adapter;
    if (lastInvocation) {
        childAdapter = {
            use: lastInvocation.use,
            with: lastInvocation.with ?? {},
            ...(lastInvocation.auth !== undefined ? { auth: lastInvocation.auth } : {}),
        };
    }
    const childExecution = {
        ...parentExecution,
        id: `${parentExecution.id}${runtimeSuffix}`,
        outputs: [],
        adapter: childAdapter,
        metadata: {
            ...asRuntimeRecord(parentExecution.metadata),
            runtimeControl: true,
            caseId,
        },
    };
    const engineCase = {
        id: caseId,
        case: parentWorkload.engineCaseSpec ?? {
            version: 3,
            prompt: parentWorkload.prompt,
        },
        files: {
            public: publicFiles,
            private: privateFiles,
        },
    };
    const childArgs = {
        ...args,
        job: {
            ...args.job,
            id: `${args.job.id}${runtimeSuffix}`,
            input: {
                ...parentInput,
                execution: childExecution,
                caseId,
            },
        },
        baseFiles: subjectFiles,
        engineResolveFiles: [...publicFiles, ...privateFiles],
        engineCases: [engineCase],
        traceFiles,
        runtimeControlOperation: request,
    };
    for (const inheritedKey of ["adapterRuntimeEnv", "workspaceRoot"]) {
        delete childArgs[inheritedKey];
    }
    return childArgs;
}
1879
/**
 * Picks the file list for one runtime-control input slot.
 *
 * When `inputs` has its own `key` property, that value is used (a nullish value
 * counts as an empty list); otherwise `fallback` is used. The chosen list is
 * always passed through cloneSurfaceFiles.
 *
 * @param inputs Optional object of explicit input file lists.
 * @param key Slot name to look up on `inputs`.
 * @param fallback File list used when `inputs` does not define `key`.
 * @returns The result of cloneSurfaceFiles for the chosen list.
 */
function runtimeControlInputFiles(inputs, key, fallback) {
    const hasExplicitEntry = Boolean(inputs) && Object.prototype.hasOwnProperty.call(inputs, key);
    if (!hasExplicitEntry) {
        return cloneSurfaceFiles(fallback);
    }
    return cloneSurfaceFiles(inputs[key] ?? []);
}
1885
/**
 * Converts one runtime-control operation into an execution step descriptor.
 *
 * The step command is the invocation's own non-blank `command` (trimmed);
 * otherwise it is taken from adapterProtocolCommandSpec for the adapter and
 * operation. The step always uses the "sandbox" executor.
 *
 * @param operation Operation with `operation`, `invocation` and optional `label`.
 * @param index Zero-based position, used to build the default label.
 * @param manifests Adapter manifests forwarded to adapterProtocolCommandSpec.
 * @returns Step with `kind`, `label`, `operation`, `executor`, `adapter`, `command`.
 */
function runtimeControlStepForOperation(operation, index, manifests = []) {
    const { invocation, operation: operationName } = operation;
    // Factory so each consumer receives its own adapter object.
    const buildAdapter = () => ({
        use: invocation.use,
        with: invocation.with ?? {},
        ...(invocation.auth !== undefined ? { auth: invocation.auth } : {}),
    });
    const explicitCommand = invocation.command?.trim();
    const command = explicitCommand
        ? explicitCommand
        : adapterProtocolCommandSpec(buildAdapter(), operationName, manifests).command;
    let kind = "engine";
    if (operationName === "subject.run") {
        kind = "subject";
    }
    else if (operationName === "optimizer.improve") {
        kind = "optimizer";
    }
    return {
        kind,
        label: operation.label ?? `${operationName.replace(".", "_")}_${index + 1}`,
        operation: operationName,
        executor: "sandbox",
        adapter: buildAdapter(),
        command,
    };
}
1909
/**
 * Builds a runtime-control result from a completed job.
 *
 * The job's output is coerced with asRuntimeRecord and normalized; the result
 * counts as ok only when the job status is "succeeded", and the job's error
 * serves as the fallback error.
 *
 * @param job Completed job with `output`, `status` and optional `error`.
 * @returns The value produced by normalizeRuntimeControlResultOutput.
 */
function runtimeControlResultFromCompletedJob(job) {
    const output = asRuntimeRecord(job.output);
    const succeeded = job.status === "succeeded";
    return normalizeRuntimeControlResultOutput(output, succeeded, job.error);
}
1912
/**
 * Assembles the job output for a runtime-control run from a workload result.
 *
 * `files` and `fileChanges` are always copied. `operationResults`,
 * `workspaceFiles`, `result`, `usage` and `error` are copied only when truthy;
 * `summary` and `feedback` are copied whenever they are not undefined. The
 * assembled record is then passed through normalizeRuntimeControlResultOutput.
 *
 * @param result Workload result to read from.
 * @param ok Whether the run succeeded.
 * @returns The normalized runtime-control output.
 */
function runtimeControlJobOutput(result, ok) {
    const output = {
        ok,
        files: result.files,
        fileChanges: result.fileChanges,
    };
    for (const key of ["operationResults", "workspaceFiles", "result", "usage"]) {
        if (result[key]) {
            output[key] = result[key];
        }
    }
    for (const key of ["summary", "feedback"]) {
        if (result[key] !== undefined) {
            output[key] = result[key];
        }
    }
    if (result.error) {
        output.error = result.error;
    }
    return normalizeRuntimeControlResultOutput(output, ok, result.error);
}
1926
/**
 * Normalizes a loosely-typed output record into a runtime-control result.
 *
 * - `files`, `workspaceFiles` and `operationResults` keep only entries accepted
 *   by isSurfaceSnapshotFile / isWorkbenchAdapterOperationResult.
 * - `fileChanges` keeps string entries, or defaults to the paths of `files`.
 * - `result` and `usage` are kept only when they are non-array objects.
 * - `summary` must be a string; `feedback` must pass isJsonPayload.
 * - `error` is the output's string error, else `fallbackError` when truthy.
 *
 * @param output Record to normalize.
 * @param ok Caller's success flag; combined with `output.ok !== false`.
 * @param fallbackError Error used when the output carries no string error.
 * @returns The normalized result object.
 */
function normalizeRuntimeControlResultOutput(output, ok, fallbackError) {
    const isObjectRecord = (candidate) => Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
    let files = [];
    if (Array.isArray(output.files)) {
        files = output.files.filter(isSurfaceSnapshotFile);
    }
    let workspaceFiles;
    if (Array.isArray(output.workspaceFiles)) {
        workspaceFiles = output.workspaceFiles.filter(isSurfaceSnapshotFile);
    }
    let operationResults = [];
    if (Array.isArray(output.operationResults)) {
        operationResults = output.operationResults.filter(isWorkbenchAdapterOperationResult);
    }
    const normalized = {
        ok: ok && output.ok !== false,
        files,
        fileChanges: Array.isArray(output.fileChanges)
            ? output.fileChanges.filter((entry) => typeof entry === "string")
            : files.map((file) => file.path),
        operationResults,
    };
    if (workspaceFiles) {
        normalized.workspaceFiles = workspaceFiles;
    }
    if (isObjectRecord(output.result)) {
        normalized.result = output.result;
    }
    if (isObjectRecord(output.usage)) {
        normalized.usage = output.usage;
    }
    if (typeof output.summary === "string") {
        normalized.summary = output.summary;
    }
    if (output.feedback !== undefined && isJsonPayload(output.feedback)) {
        normalized.feedback = output.feedback;
    }
    if (typeof output.error === "string") {
        normalized.error = output.error;
    }
    else if (fallbackError) {
        normalized.error = fallbackError;
    }
    return normalized;
}
1955
/**
 * Structural check for an adapter operation result.
 *
 * Accepts only non-array objects whose `protocol` is
 * "workbench.adapter-result.v1" and whose `operation` is one of the four
 * known operation names.
 *
 * @param value Candidate value.
 * @returns `true` when the value matches, `false` otherwise.
 */
function isWorkbenchAdapterOperationResult(value) {
    const isObjectRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isObjectRecord) {
        return false;
    }
    if (value.protocol !== "workbench.adapter-result.v1") {
        return false;
    }
    const knownOperations = ["engine.resolve", "engine.run", "subject.run", "optimizer.improve"];
    return knownOperations.includes(value.operation);
}
1966
/**
 * Returns shallow copies of the given files with each `path` replaced by the
 * result of normalizeRelativePath.
 *
 * @param files Array of file records that carry a `path`.
 * @returns A new array of copied records; the input records are not modified.
 */
function cloneSurfaceFiles(files) {
    const cloneWithNormalizedPath = (file) => {
        const copy = { ...file, path: normalizeRelativePath(file.path) };
        return copy;
    };
    return files.map((file) => cloneWithNormalizedPath(file));
}
1969
/**
 * Produces a random identifier suffix: six random bytes rendered as a
 * twelve-character lowercase hex string.
 *
 * @returns The hex-encoded nonce.
 */
function runtimeControlNonce() {
    const nonceBytes = randomBytes(6);
    return nonceBytes.toString("hex");
}
1972
/**
 * Throws when a scope check reported any issues.
 *
 * @param label Prefix naming what was validated.
 * @param issues List of issue strings; an empty list means the check passed.
 * @throws Error listing every issue on its own line after the label.
 */
function assertRuntimeControlScope(label, issues) {
    const hasIssues = issues.length > 0;
    if (!hasIssues) {
        return;
    }
    const details = issues.join("\n");
    throw new Error(`${label} failed validation:\n${details}`);
}
1456
1977
  async function runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher) {
1457
1978
  const workload = createWorkbenchRunWorkload({
1458
1979
  job: args.job,
@@ -1462,13 +1983,13 @@ async function runHostedProtocolExecutionResult(args, execution, startedAt, capa
1462
1983
  engineCases: args.engineCases,
1463
1984
  traceFiles: args.traceFiles,
1464
1985
  });
1465
- const result = await runHostedCommandExecutionPhases(args, workload, [protocolPhaseForExecution(execution, args.adapterManifests)], startedAt, {
1986
+ const result = await runHostedCommandExecutionSteps(args, workload, [protocolStepForExecution(execution, args.adapterManifests)], startedAt, {
1466
1987
  capability,
1467
1988
  eventPublisher,
1468
1989
  });
1469
1990
  return { workload, result };
1470
1991
  }
1471
- async function runHostedCommandExecutionPhases(args, workload, phases, startedAt, options = {}) {
1992
+ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt, options = {}) {
1472
1993
  const [{ execFile }, fs, os, path, { promisify }] = await Promise.all([
1473
1994
  importNodeModule(nodeBuiltin("child_process")),
1474
1995
  importNodeModule(nodeBuiltin("fs/promises")),
@@ -1489,9 +2010,22 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
1489
2010
  const workspace = await createRuntimeWorkspaceRoot(args, fs, os, path, "workbench-execution-sandbox-");
1490
2011
  try {
1491
2012
  await stageWorkbenchRunWorkload(workspace.root, workload);
2013
+ if (options.workspaceFiles && options.workspaceFiles.length > 0) {
2014
+ await stageInitialWorkspaceFiles(workspace.root, options.workspaceFiles);
2015
+ }
2016
+ if (options.outputFiles && options.outputFiles.length > 0) {
2017
+ await writeSurfaceFiles(outputDir(workspace.root), options.outputFiles);
2018
+ }
2019
+ const execution = readWorkbenchExecutionSpec(workload.job);
2020
+ const hostAdapterIds = new Set(steps.flatMap((step) => step.executor === "host"
2021
+ ? [step.adapter?.use ?? execution.adapter.use]
2022
+ : []));
2023
+ const hostAdapterRoots = hostAdapterIds.size > 0
2024
+ ? await materializeHostAdapterRoots(workspace.root, args.adapterFiles ?? [], hostAdapterIds)
2025
+ : new Map();
1492
2026
  let exitCode = 0;
1493
2027
  let runtimeError;
1494
- const phaseResults = [];
2028
+ const operationResults = [];
1495
2029
  try {
1496
2030
  if (!environmentVersion) {
1497
2031
  throw new Error("environment is required for adapter command executions.");
@@ -1503,49 +2037,64 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
1503
2037
  network: environmentVersion.spec.network,
1504
2038
  }, null, 2)}\n`);
1505
2039
  }
1506
- const phaseTimeoutMs = environmentVersion
2040
+ const stepTimeoutMs = environmentVersion
1507
2041
  ? environmentVersionTimeoutMs(environmentVersion)
1508
2042
  : 5 * 60 * 1000;
1509
- const execution = readWorkbenchExecutionSpec(workload.job);
1510
- for (const phase of phases) {
1511
- await resetHostedWorkloadPhaseOutput(workspace.root, phase);
1512
- if (phase.kind === "engine" && execution.purpose === "attempt") {
1513
- await stageAttemptScoringInputs(workspace.root, workload);
2043
+ const shouldRunSubjectPrepare = options.runSubjectPrepare ?? steps.some((step) => step.executor === "sandbox");
2044
+ if (shouldRunSubjectPrepare) {
2045
+ await runSubjectPrepareCommand({
2046
+ root: workspace.root,
2047
+ workload,
2048
+ execution,
2049
+ execFileAsync,
2050
+ timeoutMs: stepTimeoutMs,
2051
+ eventPublisher: options.eventPublisher,
2052
+ });
2053
+ }
2054
+ let enginePrivateStaged = false;
2055
+ for (const step of steps) {
2056
+ if (step.kind === "engine" && !enginePrivateStaged) {
2057
+ await stageWorkbenchEnginePrivateFiles(workspace.root, workload);
2058
+ enginePrivateStaged = true;
1514
2059
  }
1515
- const adapterRequestPath = await writeWorkbenchAdapterRequest(workspace.root, workload, execution, phase, adapterAuthRequestForPhase(args, phase.adapter?.use ?? execution.adapter.use), args.adapterManifests);
1516
- const phaseRole = phaseEventRole(phase);
1517
- await publishCommandPhaseEvent(options.eventPublisher, {
1518
- phase: phase.label,
2060
+ await resetHostedWorkloadStepOutput(workspace.root);
2061
+ const adapterRequestPath = await writeWorkbenchAdapterRequest(workspace.root, workload, execution, step, adapterAuthRequestForStep(args, step.adapter?.use ?? execution.adapter.use), args.adapterManifests);
2062
+ const stepRole = stepEventRole(step);
2063
+ await publishCommandStepEvent(options.eventPublisher, {
2064
+ step: step.label,
1519
2065
  status: "started",
1520
- ...(phaseRole ? { role: phaseRole } : {}),
2066
+ ...(stepRole ? { role: stepRole } : {}),
1521
2067
  });
1522
2068
  try {
1523
- if (!phase.command) {
1524
- throw new Error(`Adapter phase ${phase.label} is missing a command.`);
2069
+ if (!step.command) {
2070
+ throw new Error(`Adapter step ${step.label} is missing a command.`);
1525
2071
  }
1526
- const command = createHostedWorkloadShellCommand(workspace.root, phase.command, phase.label, phase.okExitCodes);
2072
+ const adapterRoot = step.executor === "host"
2073
+ ? hostAdapterRoots.get(step.adapter?.use ?? execution.adapter.use)
2074
+ : undefined;
2075
+ const command = createHostedWorkloadShellCommand(workspace.root, step.command, step.label, step.okExitCodes);
1527
2076
  await execFileAsync("sh", ["-c", command], {
1528
- cwd: workspace.root,
1529
- env: createHostedWorkloadPhaseEnv(workspace.root, adapterRequestPath, args.adapterAuthEnv),
2077
+ cwd: adapterRoot ?? workspace.root,
2078
+ env: createHostedWorkloadAdapterEnv(workspace.root, adapterRequestPath, args.adapterAuthEnv, adapterRoot ? { adapterRoot } : undefined, args.adapterRuntimeEnv),
1530
2079
  maxBuffer: 10 * 1024 * 1024,
1531
- timeout: phaseTimeoutMs,
2080
+ timeout: stepTimeoutMs,
1532
2081
  });
1533
- const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), phase.operation);
1534
- assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${phase.adapter?.use ?? execution.adapter.use} ${phase.operation}`);
1535
- phaseResults.push(operationResult);
1536
- await publishCommandPhaseEvent(options.eventPublisher, {
1537
- phase: phase.label,
2082
+ const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), step.operation);
2083
+ assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${step.adapter?.use ?? execution.adapter.use} ${step.operation}`);
2084
+ operationResults.push(operationResult);
2085
+ await publishCommandStepEvent(options.eventPublisher, {
2086
+ step: step.label,
1538
2087
  status: "succeeded",
1539
- ...(phaseRole ? { role: phaseRole } : {}),
2088
+ ...(stepRole ? { role: stepRole } : {}),
1540
2089
  });
1541
2090
  }
1542
2091
  catch (error) {
1543
- await publishCommandPhaseEvent(options.eventPublisher, {
1544
- phase: phase.label,
2092
+ await publishCommandStepEvent(options.eventPublisher, {
2093
+ step: step.label,
1545
2094
  status: "failed",
1546
2095
  exitCode: readExitCode(error),
1547
2096
  error: error instanceof Error ? error.message : String(error),
1548
- ...(phaseRole ? { role: phaseRole } : {}),
2097
+ ...(stepRole ? { role: stepRole } : {}),
1549
2098
  });
1550
2099
  throw error;
1551
2100
  }
@@ -1569,16 +2118,56 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
1569
2118
  startedAt,
1570
2119
  });
1571
2120
  }
1572
- return await readWorkbenchRunWorkloadResult(workspace.root, workload, {
2121
+ const result = await readWorkbenchRunWorkloadResult(workspace.root, workload, {
1573
2122
  exitCode,
1574
2123
  startedAt,
1575
- phaseResults,
2124
+ operationResults,
1576
2125
  });
2126
+ if (options.collectWorkspace) {
2127
+ result.workspaceFiles = await readMutableWorkspaceSnapshotFiles(workspace.root);
2128
+ }
2129
+ return result;
1577
2130
  }
1578
2131
  finally {
1579
2132
  await workspace.cleanup();
1580
2133
  }
1581
2134
  }
2135
/**
 * Runs the subject's optional prepare command through `sh -c` in the workspace
 * root, publishing started / succeeded / failed step events around it.
 *
 * Does nothing when the spec defines no prepare command.
 *
 * @param args Object with `root`, `workload`, `execution`, `execFileAsync`,
 *   `timeoutMs` and an optional `eventPublisher`.
 * @returns Resolves with no value once the command has finished or been skipped.
 * @throws Error "Subject prepare command failed: …" when the command fails; the
 *   underlying error is attached as `cause` so its exit code and stack survive.
 */
async function runSubjectPrepareCommand(args) {
    const command = args.workload.spec.subject.prepare?.command;
    if (!command) {
        return;
    }
    // Events are attributed to the optimizer for "improve" runs, otherwise to the runner.
    const role = args.execution.purpose === "improve" ? "optimizer" : "runner";
    await publishCommandStepEvent(args.eventPublisher, {
        step: "subject_prepare",
        status: "started",
        role,
    });
    try {
        const shellCommand = createHostedWorkloadShellCommand(args.root, command, "subject_prepare");
        await args.execFileAsync("sh", ["-c", shellCommand], {
            cwd: args.root,
            env: createHostedWorkloadPrepareEnv(args.root),
            maxBuffer: 10 * 1024 * 1024,
            timeout: args.timeoutMs,
        });
        await publishCommandStepEvent(args.eventPublisher, {
            step: "subject_prepare",
            status: "succeeded",
            role,
        });
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        await publishCommandStepEvent(args.eventPublisher, {
            step: "subject_prepare",
            status: "failed",
            exitCode: readExitCode(error),
            error: message,
            role,
        });
        // Keep the original failure reachable; the message itself is unchanged.
        throw new Error(`Subject prepare command failed: ${message}`, { cause: error });
    }
}
1582
2171
  async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
1583
2172
  if (args.workspaceRoot) {
1584
2173
  await fs.mkdir(args.workspaceRoot, { recursive: true });
@@ -1614,19 +2203,22 @@ async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
1614
2203
  },
1615
2204
  };
1616
2205
  }
1617
- function phaseEventRole(phase) {
1618
- if (phase.kind === "optimizer") {
2206
+ function stepEventRole(step) {
2207
+ if (step.kind === "optimizer") {
1619
2208
  return "optimizer";
1620
2209
  }
1621
- if (phase.kind === "runner") {
2210
+ if (step.kind === "subject") {
1622
2211
  return "runner";
1623
2212
  }
1624
- if (phase.kind === "engine") {
2213
+ if (step.kind === "engine") {
1625
2214
  return "engine";
1626
2215
  }
1627
2216
  return undefined;
1628
2217
  }
1629
2218
  function adapterOperationUsageSummary(result) {
2219
+ if (hasExplicitUsageRole(result.usage)) {
2220
+ return completeUsageSummary(result.usage);
2221
+ }
1630
2222
  if (result.operation === "optimizer.improve") {
1631
2223
  return assignUsageRole("optimizer", result.usage);
1632
2224
  }
@@ -1638,11 +2230,16 @@ function adapterOperationUsageSummary(result) {
1638
2230
  }
1639
2231
  return result.usage;
1640
2232
  }
1641
- function executionPurposeRole(purpose) {
1642
- if (purpose === "improve") {
1643
- return "optimizer";
1644
- }
1645
- return "runner";
2233
/**
 * Combines usage for an attempt.
 *
 * The workload usage is passed through completeUsageSummary. Only when that
 * summary has no `engine` entry is the engine result's usage added, tagged with
 * the "engine" role; otherwise the engine result's usage is left out.
 *
 * @param workloadUsage Usage reported for the workload.
 * @param resultUsage Usage reported on the engine result.
 * @returns The value produced by mergeUsageSummaries.
 */
function attemptUsageSummary(workloadUsage, resultUsage) {
    const normalizedWorkloadUsage = completeUsageSummary(workloadUsage);
    let legacyEngineUsage;
    if (!normalizedWorkloadUsage?.engine) {
        legacyEngineUsage = assignUsageRole("engine", resultUsage);
    }
    return mergeUsageSummaries([normalizedWorkloadUsage, legacyEngineUsage]);
}
2240
/**
 * Reports whether a usage summary, after completeUsageSummary, carries a truthy
 * `optimizer`, `runner` or `engine` entry.
 *
 * @param usage Usage value to inspect.
 * @returns `true` when at least one of the three role entries is truthy.
 */
function hasExplicitUsageRole(usage) {
    const normalized = completeUsageSummary(usage);
    if (!normalized) {
        return false;
    }
    return ["optimizer", "runner", "engine"].some((role) => Boolean(normalized[role]));
}
1647
2244
  function createSubjectPatchFromResult(result, spec) {
1648
2245
  if (result.subjectPatch) {
@@ -1720,47 +2317,103 @@ export async function stageWorkbenchRunWorkload(root, workload) {
1720
2317
  fs
1721
2318
  .rm(runtimePrivateDir(root), { recursive: true, force: true })
1722
2319
  .catch(() => undefined),
1723
- fs
1724
- .rm(runtimeLogsDir(root), { recursive: true, force: true })
1725
- .catch(() => undefined),
1726
2320
  ]);
1727
2321
  await fs.mkdir(inputDir(root), { recursive: true });
1728
2322
  await fs.mkdir(outputDir(root), { recursive: true });
1729
2323
  if (purpose === "attempt") {
1730
- assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
1731
2324
  await fs.mkdir(subjectDir(root), { recursive: true });
1732
2325
  await fs.mkdir(caseDir(root), { recursive: true });
1733
- await fs.mkdir(runtimeLogsAgentDir(root), { recursive: true });
1734
- await fs.mkdir(runtimeLogsVerifierDir(root), { recursive: true });
1735
2326
  const engineCase = requireWorkloadEngineCase(workload, "Attempt staging");
1736
2327
  await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
1737
- await writeSurfaceFiles(caseDir(root), engineCaseSubjectVisibleFiles(engineCase));
1738
- await writeSurfaceFiles(root, workload.subjectFiles);
2328
+ await writeSurfaceFiles(caseDir(root), engineCasePublicFiles(engineCase));
1739
2329
  return;
1740
2330
  }
1741
2331
  if (purpose === "improve") {
1742
- assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
1743
2332
  await fs.mkdir(subjectDir(root), { recursive: true });
1744
2333
  await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
1745
- await writeSurfaceFiles(root, workload.subjectFiles);
1746
2334
  await fs.mkdir(tracesDir(root), { recursive: true });
1747
2335
  await writeSurfaceFiles(tracesDir(root), workload.traceFiles);
1748
2336
  }
1749
2337
  }
1750
- async function stageAttemptScoringInputs(root, workload) {
2338
+ async function stageWorkbenchEnginePrivateFiles(root, workload) {
2339
+ if (readWorkloadExecutionPurpose(workload) !== "attempt") {
2340
+ return;
2341
+ }
1751
2342
  const fs = await importNodeModule(nodeBuiltin("fs/promises"));
1752
- const engineCase = requireWorkloadEngineCase(workload, "Attempt scoring");
1753
- await Promise.all([
1754
- fs
1755
- .rm(runtimeEnginePrivateDir(root), { recursive: true, force: true })
1756
- .catch(() => undefined),
1757
- fs
1758
- .rm(runtimeLogsVerifierDir(root), { recursive: true, force: true })
1759
- .catch(() => undefined),
1760
- ]);
1761
2343
  await fs.mkdir(runtimeEnginePrivateDir(root), { recursive: true });
1762
- await fs.mkdir(runtimeLogsVerifierDir(root), { recursive: true });
1763
- await writeSurfaceFiles(runtimeEnginePrivateDir(root), engineCaseEnginePrivateFiles(engineCase));
2344
+ await writeSurfaceFiles(runtimeEnginePrivateDir(root), engineCasePrivateFiles(requireWorkloadEngineCase(workload, "Engine-private staging")));
2345
+ }
2346
+ async function stageInitialWorkspaceFiles(root, files) {
2347
+ await writeSurfaceFiles(root, files.filter((file) => isMutableWorkspaceSnapshotPath(file.path)));
2348
+ }
2349
+ async function readMutableWorkspaceSnapshotFiles(root) {
2350
+ return (await readSurfaceFiles(root))
2351
+ .filter((file) => isMutableWorkspaceSnapshotPath(file.path))
2352
+ .sort((left, right) => left.path.localeCompare(right.path));
2353
+ }
2354
+ function isMutableWorkspaceSnapshotPath(filePath) {
2355
+ const normalized = normalizeRelativePath(filePath);
2356
+ return Boolean(normalized &&
2357
+ !normalized.startsWith("../") &&
2358
+ normalized !== "input" &&
2359
+ !normalized.startsWith("input/") &&
2360
+ normalized !== "private" &&
2361
+ !normalized.startsWith("private/") &&
2362
+ normalized !== "output" &&
2363
+ !normalized.startsWith("output/") &&
2364
+ normalized !== ".workbench" &&
2365
+ !normalized.startsWith(".workbench/"));
2366
+ }
2367
+ async function materializeHostAdapterRoots(root, adapterFiles, adapterIds) {
2368
+ if (adapterFiles.length === 0 || adapterIds.size === 0) {
2369
+ return new Map();
2370
+ }
2371
+ const fs = await importNodeModule(nodeBuiltin("fs/promises"));
2372
+ const path = await importNodeModule(nodeBuiltin("path"));
2373
+ const sourceRoots = hostAdapterSourceRoots(adapterFiles, adapterIds);
2374
+ const roots = new Map();
2375
+ for (const [adapterId, sourceRoot] of sourceRoots) {
2376
+ const targetRoot = path.join(root, ".workbench", "adapters", adapterId);
2377
+ const files = adapterFiles.flatMap((file) => {
2378
+ const relativePath = adapterFilePathWithinRoot(file.path, sourceRoot);
2379
+ return relativePath === null
2380
+ ? []
2381
+ : [{ ...file, path: relativePath }];
2382
+ });
2383
+ await fs.rm(targetRoot, { recursive: true, force: true }).catch(() => undefined);
2384
+ await fs.mkdir(targetRoot, { recursive: true });
2385
+ await writeSurfaceFiles(targetRoot, files);
2386
+ roots.set(adapterId, await fs.realpath(targetRoot));
2387
+ }
2388
+ return roots;
2389
+ }
2390
+ function hostAdapterSourceRoots(adapterFiles, adapterIds) {
2391
+ const roots = new Map();
2392
+ for (const file of adapterFiles) {
2393
+ const normalized = normalizeRelativePath(file.path);
2394
+ if (!normalized.endsWith("workbench.adapter.yaml")) {
2395
+ continue;
2396
+ }
2397
+ const manifest = parseWorkbenchAdapterManifest(file.content);
2398
+ if (!adapterIds.has(manifest.id)) {
2399
+ continue;
2400
+ }
2401
+ const sourceRoot = normalized === "workbench.adapter.yaml"
2402
+ ? ""
2403
+ : normalized.slice(0, -"workbench.adapter.yaml".length).replace(/\/+$/u, "");
2404
+ roots.set(manifest.id, sourceRoot);
2405
+ }
2406
+ return roots;
2407
+ }
2408
+ function adapterFilePathWithinRoot(filePath, sourceRoot) {
2409
+ const normalized = normalizeRelativePath(filePath);
2410
+ if (!sourceRoot) {
2411
+ return normalized;
2412
+ }
2413
+ if (!normalized.startsWith(`${sourceRoot}/`)) {
2414
+ return null;
2415
+ }
2416
+ return normalized.slice(sourceRoot.length + 1);
1764
2417
  }
1765
2418
  async function readHostedRunFailureResult(root, workload, options) {
1766
2419
  const traceFiles = await readRuntimeTraceFiles(root, workload);
@@ -1788,16 +2441,16 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
1788
2441
  const primaryOperation = purpose === "improve"
1789
2442
  ? "optimizer.improve"
1790
2443
  : "engine.run";
1791
- const primaryResult = [...(options.phaseResults ?? [])]
2444
+ const primaryResult = [...(options.operationResults ?? [])]
1792
2445
  .reverse()
1793
2446
  .find((result) => result.operation === primaryOperation);
1794
2447
  const resultPayload = jsonRecord(primaryResult?.value);
1795
2448
  const usage = mergeUsageSummaries([
1796
2449
  options.usage,
1797
- ...(options.phaseResults ?? []).map(adapterOperationUsageSummary),
2450
+ ...(options.operationResults ?? []).map(adapterOperationUsageSummary),
1798
2451
  ]);
1799
- const metrics = normalizeRewardMetrics(resultPayload.metrics);
1800
- const cases = normalizeRewardCases(resultPayload.cases);
2452
+ const metrics = normalizeResultMetrics(resultPayload.metrics);
2453
+ const cases = normalizeResultCases(resultPayload.cases);
1801
2454
  const includeResultScoring = purpose === "attempt";
1802
2455
  const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
1803
2456
  const subjectPatch = purpose === "improve" ? primaryResult?.value : undefined;
@@ -1809,6 +2462,7 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
1809
2462
  return {
1810
2463
  files,
1811
2464
  fileChanges: declaredChanges,
2465
+ ...(options.operationResults ? { operationResults: [...options.operationResults] } : {}),
1812
2466
  ...(subjectPatch ? { subjectPatch } : {}),
1813
2467
  ...(engineResult ? { result: engineResult } : {}),
1814
2468
  ...(includeResultScoring && metrics ? { metrics } : {}),
@@ -1835,10 +2489,10 @@ async function readRuntimeTraceFiles(root, workload) {
1835
2489
  const path = await importNodeModule(nodeBuiltin("path"));
1836
2490
  const traceRoot = path.join(outputDir(root), ".workbench", "traces", workload.job.id);
1837
2491
  const purpose = readWorkloadExecutionPurpose(workload);
1838
- const outputTraceRoot = workbenchTracePhaseDirectory({
2492
+ const outputTraceRoot = workbenchTraceExecutionDirectory({
1839
2493
  sequence: 1,
1840
2494
  runId: workload.job.runId,
1841
- phase: purpose,
2495
+ purpose,
1842
2496
  });
1843
2497
  return (await readSurfaceFiles(traceRoot)).map((file) => ({
1844
2498
  ...file,
@@ -1868,13 +2522,13 @@ function createHostedWorkloadShellCommand(root, command, prefix = "", okExitCode
1868
2522
  'exit "$status"',
1869
2523
  ].join("; ");
1870
2524
  }
1871
- async function resetHostedWorkloadPhaseOutput(root, _phase) {
2525
+ async function resetHostedWorkloadStepOutput(root) {
1872
2526
  const fs = await importNodeModule(nodeBuiltin("fs/promises"));
1873
2527
  await fs
1874
2528
  .rm(workbenchAdapterOperationResultPath(outputDir(root)), { force: true })
1875
2529
  .catch(() => undefined);
1876
2530
  }
1877
- async function writeWorkbenchAdapterRequest(root, workload, execution, phase, auth, manifests) {
2531
+ async function writeWorkbenchAdapterRequest(root, workload, execution, step, auth, manifests) {
1878
2532
  const [fs, path] = await Promise.all([
1879
2533
  importNodeModule(nodeBuiltin("fs/promises")),
1880
2534
  importNodeModule(nodeBuiltin("path")),
@@ -1882,13 +2536,13 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
1882
2536
  const requestPath = path.join(root, ".workbench", "request.json");
1883
2537
  await fs.mkdir(path.dirname(requestPath), { recursive: true });
1884
2538
  const casePrompt = workload.engineCaseSpec?.prompt;
1885
- const adapter = phase.adapter ?? execution.adapter;
2539
+ const adapter = step.adapter ?? execution.adapter;
1886
2540
  const subjectCommand = adapterProtocolCommandSpec(workload.spec.run, "subject.run", manifests).command;
1887
2541
  await fs.writeFile(requestPath, `${JSON.stringify({
1888
2542
  protocol: "workbench.adapter.v3",
1889
2543
  id: execution.id,
1890
2544
  jobId: workload.job.id,
1891
- operation: phase.operation,
2545
+ operation: step.operation,
1892
2546
  invocation: {
1893
2547
  use: adapter.use,
1894
2548
  with: adapterConfigRecord(adapter, manifests),
@@ -1903,6 +2557,7 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
1903
2557
  subject: {
1904
2558
  id: workload.subjectId,
1905
2559
  path: workload.spec.subject.files.path,
2560
+ ...(workload.spec.subject.prepare ? { prepare: { ...workload.spec.subject.prepare } } : {}),
1906
2561
  run: {
1907
2562
  ...workload.spec.run,
1908
2563
  command: subjectCommand,
@@ -1923,14 +2578,12 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
1923
2578
  },
1924
2579
  paths: {
1925
2580
  workspace: root,
1926
- cwd: root,
1927
2581
  output: outputDir(root),
1928
2582
  result: workbenchAdapterOperationResultPath(outputDir(root)),
1929
2583
  subject: subjectDir(root),
1930
2584
  ...(workload.engineCaseSpec ? { case: caseDir(root) } : {}),
1931
2585
  traces: tracesDir(root),
1932
- ...(phase.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
1933
- logs: runtimeLogsDir(root),
2586
+ ...(step.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
1934
2587
  },
1935
2588
  }, null, 2)}\n`);
1936
2589
  return requestPath;
@@ -1945,7 +2598,29 @@ function requireOptimizerEdits(spec) {
1945
2598
  }
1946
2599
  return edits;
1947
2600
  }
1948
- function createHostedWorkloadPhaseEnv(root, adapterRequestPath, adapterEnv = {}) {
2601
+ function createHostedWorkloadAdapterEnv(root, adapterRequestPath, adapterEnv = {}, options = {}, runtimeEnv = {}) {
2602
+ const env = createHostedWorkloadBaseEnv();
2603
+ env.WORKBENCH_ADAPTER_REQUEST = adapterRequestPath;
2604
+ env.WORKBENCH_OUTPUT = outputDir(root);
2605
+ env.WORKBENCH_RESULT = workbenchAdapterOperationResultPath(outputDir(root));
2606
+ if (options.adapterRoot) {
2607
+ env.WORKBENCH_ADAPTER_ROOT = options.adapterRoot;
2608
+ env.WORKBENCH_WORKSPACE_ROOT = root;
2609
+ env.PATH = [
2610
+ `${options.adapterRoot}/node_modules/.bin`,
2611
+ env.PATH,
2612
+ ].filter(Boolean).join(":");
2613
+ }
2614
+ Object.assign(env, adapterEnv);
2615
+ Object.assign(env, runtimeEnv);
2616
+ return env;
2617
+ }
2618
+ function createHostedWorkloadPrepareEnv(root) {
2619
+ const env = createHostedWorkloadBaseEnv();
2620
+ env.WORKBENCH_OUTPUT = outputDir(root);
2621
+ return env;
2622
+ }
2623
+ function createHostedWorkloadBaseEnv() {
1949
2624
  const env = {};
1950
2625
  for (const [key, value] of Object.entries(process.env)) {
1951
2626
  if (typeof value === "string") {
@@ -1957,20 +2632,52 @@ function createHostedWorkloadPhaseEnv(root, adapterRequestPath, adapterEnv = {})
1957
2632
  delete env[key];
1958
2633
  }
1959
2634
  }
1960
- const runtimeBins = [
2635
+ const runtimeBins = uniquePathEntries([
2636
+ ...nodeModuleBinDirsForAncestors(process.cwd()),
2637
+ ...nodeModuleBinDirsForAncestors(path.dirname(fileURLToPath(import.meta.url))),
2638
+ "/app/node_modules/.bin",
1961
2639
  "/workbench-runtime/node_modules/.bin",
1962
2640
  "/workbench-runtime/products/workbench/node_modules/.bin",
1963
- ].join(":");
1964
- const systemBins = "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin";
1965
- env.PATH = process.env.PATH
1966
- ? `${systemBins}:${runtimeBins}:${process.env.PATH}`
1967
- : `${systemBins}:${runtimeBins}`;
1968
- env.WORKBENCH_ADAPTER_REQUEST = adapterRequestPath;
1969
- env.WORKBENCH_OUTPUT = outputDir(root);
1970
- env.WORKBENCH_RESULT = workbenchAdapterOperationResultPath(outputDir(root));
1971
- Object.assign(env, adapterEnv);
2641
+ ]);
2642
+ env.PATH = uniquePathEntries([
2643
+ path.dirname(process.execPath),
2644
+ "/usr/local/sbin",
2645
+ "/usr/local/bin",
2646
+ "/usr/sbin",
2647
+ "/usr/bin",
2648
+ "/sbin",
2649
+ "/bin",
2650
+ ...runtimeBins,
2651
+ ...(process.env.PATH ? process.env.PATH.split(path.delimiter) : []),
2652
+ ]).join(path.delimiter);
1972
2653
  return env;
1973
2654
  }
2655
+ function nodeModuleBinDirsForAncestors(start) {
2656
+ const dirs = [];
2657
+ let current = path.resolve(start);
2658
+ for (let depth = 0; depth < 12; depth += 1) {
2659
+ dirs.push(path.join(current, "node_modules", ".bin"));
2660
+ const parent = path.dirname(current);
2661
+ if (parent === current) {
2662
+ break;
2663
+ }
2664
+ current = parent;
2665
+ }
2666
+ return dirs;
2667
+ }
2668
+ function uniquePathEntries(entries) {
2669
+ const seen = new Set();
2670
+ const output = [];
2671
+ for (const entry of entries) {
2672
+ const trimmed = entry.trim();
2673
+ if (!trimmed || seen.has(trimmed)) {
2674
+ continue;
2675
+ }
2676
+ seen.add(trimmed);
2677
+ output.push(trimmed);
2678
+ }
2679
+ return output;
2680
+ }
1974
2681
  function readWorkloadExecutionPurpose(workload) {
1975
2682
  const purpose = workbenchExecutionPurpose(workload.job);
1976
2683
  if (purpose === "improve" || purpose === "attempt") {
@@ -2005,35 +2712,6 @@ function runtimePrivateDir(root) {
2005
2712
  function runtimeEnginePrivateDir(root) {
2006
2713
  return `${runtimePrivateDir(root)}/engine`;
2007
2714
  }
2008
- function runtimeLogsDir(root) {
2009
- return `${root}/logs`;
2010
- }
2011
- function runtimeLogsAgentDir(root) {
2012
- return `${runtimeLogsDir(root)}/agent`;
2013
- }
2014
- function runtimeLogsVerifierDir(root) {
2015
- return `${runtimeLogsDir(root)}/verifier`;
2016
- }
2017
- function assertMutableWorkspaceFiles(files, label) {
2018
- const reserved = files
2019
- .map((file) => normalizeRelativePath(file.path))
2020
- .filter(isRuntimeReservedWorkspacePath);
2021
- if (reserved.length > 0) {
2022
- throw new Error(`${label} cannot target runtime-reserved workspace paths: ${reserved.join(", ")}.`);
2023
- }
2024
- }
2025
- function isRuntimeReservedWorkspacePath(normalizedPath) {
2026
- return normalizedPath === ".workbench" ||
2027
- normalizedPath.startsWith(".workbench/") ||
2028
- normalizedPath === "input" ||
2029
- normalizedPath.startsWith("input/") ||
2030
- normalizedPath === "output" ||
2031
- normalizedPath.startsWith("output/") ||
2032
- normalizedPath === "logs" ||
2033
- normalizedPath.startsWith("logs/") ||
2034
- normalizedPath === "private" ||
2035
- normalizedPath.startsWith("private/");
2036
- }
2037
2715
  async function writeSurfaceFiles(root, files) {
2038
2716
  const fs = await importNodeModule(nodeBuiltin("fs/promises"));
2039
2717
  const path = await importNodeModule(nodeBuiltin("path"));
@@ -2097,7 +2775,7 @@ function encodeSurfaceSnapshotContent(body, utf8Decoder) {
2097
2775
  };
2098
2776
  }
2099
2777
  }
2100
- function normalizeRewardMetrics(value) {
2778
+ function normalizeResultMetrics(value) {
2101
2779
  if (!value || typeof value !== "object" || Array.isArray(value)) {
2102
2780
  return undefined;
2103
2781
  }
@@ -2109,7 +2787,7 @@ function normalizeRewardMetrics(value) {
2109
2787
  }
2110
2788
  return Object.keys(metrics).length > 0 ? metrics : undefined;
2111
2789
  }
2112
- function normalizeRewardCases(value) {
2790
+ function normalizeResultCases(value) {
2113
2791
  if (!Array.isArray(value)) {
2114
2792
  return undefined;
2115
2793
  }
@@ -2122,7 +2800,7 @@ function normalizeRewardCases(value) {
2122
2800
  if (!id) {
2123
2801
  return [];
2124
2802
  }
2125
- const metrics = normalizeRewardMetrics(record.metrics) ?? {};
2803
+ const metrics = normalizeResultMetrics(record.metrics) ?? {};
2126
2804
  const status = record.status === "completed" || record.status === "error"
2127
2805
  ? record.status
2128
2806
  : undefined;
@@ -2146,9 +2824,7 @@ function normalizeRewardCases(value) {
2146
2824
  : undefined;
2147
2825
  const pass = typeof criterionRecord.pass === "boolean"
2148
2826
  ? criterionRecord.pass
2149
- : score !== undefined
2150
- ? score >= 0.5
2151
- : undefined;
2827
+ : undefined;
2152
2828
  if (!criterionId || score === undefined || pass === undefined) {
2153
2829
  return [];
2154
2830
  }
@@ -2261,13 +2937,13 @@ function evaluateSample(args) {
2261
2937
  if (typeof sampleScore !== "number" || !Number.isFinite(sampleScore)) {
2262
2938
  throw new Error("Evaluation sample requires an engine result with a finite numeric score.");
2263
2939
  }
2264
- const cases = args.workload.cases?.length ? args.workload.cases : undefined;
2265
2940
  const metrics = args.workload.metrics ?? {
2266
2941
  score: sampleScore,
2267
2942
  };
2268
2943
  if (metrics.score === undefined) {
2269
2944
  metrics.score = sampleScore;
2270
2945
  }
2946
+ const cases = args.workload.cases?.length ? args.workload.cases : undefined;
2271
2947
  const feedback = {
2272
2948
  ...(args.workload.summary !== undefined
2273
2949
  ? { summary: args.workload.summary }
@@ -2295,7 +2971,7 @@ function evaluateSample(args) {
2295
2971
  feedback,
2296
2972
  };
2297
2973
  }
2298
- function normalizeSampleJobOutput(value, fallbackFiles = []) {
2974
+ function normalizeSampleJobOutput(value) {
2299
2975
  if (!value || typeof value !== "object" || Array.isArray(value)) {
2300
2976
  return null;
2301
2977
  }
@@ -2314,9 +2990,6 @@ function normalizeSampleJobOutput(value, fallbackFiles = []) {
2314
2990
  !Number.isFinite(record.attemptIndex)) {
2315
2991
  return null;
2316
2992
  }
2317
- const sampleFiles = files.length > 0
2318
- ? files
2319
- : fallbackFiles.map((file) => ({ ...file }));
2320
2993
  return {
2321
2994
  subjectId: record.subjectId,
2322
2995
  attemptIndex: record.attemptIndex,
@@ -2324,10 +2997,10 @@ function normalizeSampleJobOutput(value, fallbackFiles = []) {
2324
2997
  fileChanges: Array.isArray(record.fileChanges)
2325
2998
  ? record.fileChanges.filter((entry) => typeof entry === "string")
2326
2999
  : [],
2327
- files: sampleFiles,
3000
+ files,
2328
3001
  traces: Array.isArray(record.traces)
2329
3002
  ? record.traces.filter((entry) => typeof entry === "string")
2330
- : traceFilePaths(sampleFiles),
3003
+ : traceFilePaths(files),
2331
3004
  };
2332
3005
  }
2333
3006
  function normalizeEvaluationSampleOutputs(args) {
@@ -2498,8 +3171,16 @@ function compareSampleOutputs(left, right) {
2498
3171
  }
2499
3172
  return left.sample.id.localeCompare(right.sample.id);
2500
3173
  }
2501
- function createEvaluationRecord(subjectId, rawSamples) {
2502
- const samples = mergeEvaluationSampleRecords(rawSamples);
3174
+ function createEvaluationRecord(subjectId, subjectName, rawSamples) {
3175
+ const samples = mergeEvaluationSampleRecords(rawSamples).map((sample) => subjectName
3176
+ ? {
3177
+ ...sample,
3178
+ subject: {
3179
+ ...sample.subject,
3180
+ label: subjectName,
3181
+ },
3182
+ }
3183
+ : sample);
2503
3184
  const startedAt = minTimestamp(samples.flatMap((sample) => (sample.startedAt ? [sample.startedAt] : [])));
2504
3185
  const finishedAt = maxTimestamp(samples.flatMap((sample) => (sample.finishedAt ? [sample.finishedAt] : [])));
2505
3186
  const durationValues = samples.flatMap((sample) => typeof sample.durationMs === "number" ? [sample.durationMs] : []);
@@ -2513,6 +3194,7 @@ function createEvaluationRecord(subjectId, rawSamples) {
2513
3194
  subject: {
2514
3195
  id: subjectId,
2515
3196
  kind: "subject",
3197
+ ...(subjectName ? { label: subjectName } : {}),
2516
3198
  },
2517
3199
  status: samples.length > 0 && completedSampleCount === samples.length
2518
3200
  ? "completed"
@@ -2533,6 +3215,10 @@ function createEvaluationRecord(subjectId, rawSamples) {
2533
3215
  samples,
2534
3216
  };
2535
3217
  }
3218
+ function normalizedSubjectDisplayName(value) {
3219
+ const normalized = value?.trim();
3220
+ return normalized ? normalized : null;
3221
+ }
2536
3222
  function aggregateSampleMetrics(samples) {
2537
3223
  const metricNames = new Set(samples.flatMap((sample) => Object.keys(sample.metrics ?? {})));
2538
3224
  if (metricNames.size === 0) {
@@ -2563,14 +3249,14 @@ function mergeEvaluationSampleRecords(samples) {
2563
3249
  function mergeEvaluationSampleGroup(group) {
2564
3250
  const first = group[0];
2565
3251
  if (group.length === 1) {
2566
- return normalizeSingleCaseDurations(first);
3252
+ return first;
2567
3253
  }
2568
3254
  const startedAt = minTimestamp(group.flatMap((sample) => (sample.startedAt ? [sample.startedAt] : [])));
2569
3255
  const finishedAt = maxTimestamp(group.flatMap((sample) => (sample.finishedAt ? [sample.finishedAt] : [])));
2570
3256
  const durationMs = startedAt && finishedAt
2571
3257
  ? Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt))
2572
3258
  : undefined;
2573
- const cases = group.flatMap((sample) => normalizeCaseDurations(sample));
3259
+ const cases = group.flatMap((sample) => sample.cases ?? []);
2574
3260
  const metrics = aggregateSampleGroupMetrics(group);
2575
3261
  const usage = mergeUsageSummaries(group.map((sample) => sample.usage));
2576
3262
  const errors = group.flatMap((sample) => sample.error ? [sample.error] : []);
@@ -2588,22 +3274,6 @@ function mergeEvaluationSampleGroup(group) {
2588
3274
  ...(cases.length > 0 ? { cases } : {}),
2589
3275
  };
2590
3276
  }
2591
- function normalizeSingleCaseDurations(sample) {
2592
- if (!sample.cases) {
2593
- return sample;
2594
- }
2595
- const cases = normalizeCaseDurations(sample);
2596
- return cases.length === sample.cases.length
2597
- ? { ...sample, cases }
2598
- : sample;
2599
- }
2600
- function normalizeCaseDurations(sample) {
2601
- return (sample.cases ?? []).map((caseResult) => (typeof caseResult.durationMs === "number" ||
2602
- sample.cases?.length !== 1 ||
2603
- typeof sample.durationMs !== "number"
2604
- ? caseResult
2605
- : { ...caseResult, durationMs: sample.durationMs }));
2606
- }
2607
3277
  function aggregateSampleGroupMetrics(group) {
2608
3278
  const metricNames = new Set(group.flatMap((sample) => Object.keys(sample.metrics ?? {})));
2609
3279
  if (metricNames.size === 0) {