@workbench-ai/workbench-core 0.0.46 → 0.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/execution-events.d.ts +2 -2
- package/dist/execution-events.d.ts.map +1 -1
- package/dist/execution-events.js +3 -3
- package/dist/{execution-phases.d.ts → execution-evidence.d.ts} +8 -7
- package/dist/execution-evidence.d.ts.map +1 -0
- package/dist/{execution-phases.js → execution-evidence.js} +91 -51
- package/dist/execution-graph.js +1 -2
- package/dist/execution-jobs.js +1 -1
- package/dist/execution-outputs.d.ts.map +1 -1
- package/dist/execution-outputs.js +5 -10
- package/dist/execution-runtime-types.d.ts +7 -3
- package/dist/execution-runtime-types.d.ts.map +1 -1
- package/dist/execution-traces.d.ts +11 -1
- package/dist/execution-traces.d.ts.map +1 -1
- package/dist/execution-traces.js +305 -2
- package/dist/generic-spec.d.ts +8 -3
- package/dist/generic-spec.d.ts.map +1 -1
- package/dist/generic-spec.js +26 -37
- package/dist/index.d.ts +22 -11
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +868 -214
- package/dist/runtime-dockerfile.d.ts +14 -0
- package/dist/runtime-dockerfile.d.ts.map +1 -0
- package/dist/runtime-dockerfile.js +65 -0
- package/dist/sandbox-backends/docker.d.ts.map +1 -1
- package/dist/sandbox-backends/docker.js +9 -12
- package/dist/sandbox-backends/index.d.ts.map +1 -1
- package/dist/sandbox-backends/index.js +2 -1
- package/dist/sandbox-inputs.d.ts.map +1 -1
- package/dist/sandbox-inputs.js +1 -0
- package/dist/sandbox-plane.d.ts +1 -0
- package/dist/sandbox-plane.d.ts.map +1 -1
- package/dist/sandbox-plane.js +12 -22
- package/dist/trace-files.d.ts +2 -2
- package/dist/trace-files.d.ts.map +1 -1
- package/dist/trace-files.js +4 -4
- package/package.json +3 -3
- package/worker/sandbox-adapter-runner.cjs +22 -13
- package/dist/execution-phases.d.ts.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,28 +1,30 @@
|
|
|
1
|
-
import { createHash } from "node:crypto";
|
|
1
|
+
import { createHash, randomBytes } from "node:crypto";
|
|
2
2
|
import os from "node:os";
|
|
3
3
|
import path from "node:path";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
4
5
|
import YAML from "yaml";
|
|
5
|
-
import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
|
|
6
|
-
import { BENCHMARK_SPEC_FILE,
|
|
6
|
+
import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, parseWorkbenchAdapterManifest, readWorkbenchAdapterOperationResult, WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV, WORKBENCH_RUNTIME_CONTROL_URL_ENV, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
|
|
7
|
+
import { BENCHMARK_SPEC_FILE, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchSubjectManifestPath, } from "./generic-spec.js";
|
|
7
8
|
import { attachSandboxMetadataToJob, createWorkbenchSandboxFileStore, isSurfaceSnapshotFile, readWorkbenchExecutionSpec, } from "./sandbox-inputs.js";
|
|
8
|
-
import { asRuntimeRecord, importNodeModule, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
|
|
9
|
-
import { executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
9
|
+
import { asRuntimeRecord, importNodeModule, isJsonPayload, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
|
|
10
|
+
import { createWorkbenchExecutionCapability, createWorkbenchSandboxAllocation, collectExecutionCapabilityScopeIssues, collectSandboxAllocationScopeIssues, collectSandboxHandleScopeIssues, assertSandboxBackendSupportsNetworkPolicy, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
10
11
|
import { createSandboxBackendPlaneForProvider, } from "./sandbox-backends/index.js";
|
|
11
12
|
import { applyWorkbenchSubjectPatch } from "./subject-patch.js";
|
|
12
13
|
import { assignUsageRole, completeUsageSummary, mergeUsageSummaries, normalizeUsageSummary, usageStats, } from "./execution-usage.js";
|
|
13
|
-
import { traceFilePaths,
|
|
14
|
+
import { traceFilePaths, workbenchTraceExecutionDirectory, } from "./trace-files.js";
|
|
14
15
|
import { engineCaseForCase, } from "./execution-jobs.js";
|
|
15
|
-
import { createWorkbenchExecutionEventPublisher,
|
|
16
|
-
import { readWorkbenchExecutionPurpose } from "./execution-
|
|
16
|
+
import { createWorkbenchExecutionEventPublisher, publishCommandStepEvent, } from "./execution-events.js";
|
|
17
|
+
import { readWorkbenchExecutionPurpose } from "./execution-evidence.js";
|
|
17
18
|
import { adapterAuthEnv, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
|
|
18
|
-
export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES,
|
|
19
|
-
export {
|
|
19
|
+
export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchSubjectManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, runtimeNetwork, runtimeResources, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
|
|
20
|
+
export { composeRuntimeDockerfileWithAdapterInstallers, } from "./runtime-dockerfile.js";
|
|
21
|
+
export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
|
|
20
22
|
export { adapterAuthEnv, createWorkbenchAdapterAuthBundle, defaultWorkbenchAdapterAuthStoreRoot, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, parseWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
|
|
21
23
|
export { asRuntimeRecord, importNodeModule, nodeBuiltin, normalizeWorkbenchWorkerId, normalizeRuntimeRegistry, quoteShellArg, resolveDockerRuntimeImageRef, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
|
|
22
24
|
export { assignUsageRole, extractExecutionUsageFromTrace, mergeUsageSummaries, } from "./execution-usage.js";
|
|
23
25
|
export { createWorkbenchProgressStdoutParser, publishWorkbenchProgressStdoutEnvelope, } from "./execution-events.js";
|
|
24
26
|
export { resolveSandboxTemplateImage, } from "./sandbox-backends/template-images.js";
|
|
25
|
-
export { readOutputTraceFiles,
|
|
27
|
+
export { readOutputTraceFiles, workbenchTraceExecutionDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
|
|
26
28
|
export { assertWorkbenchAdapterOperationSupport, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterOperationIssues, collectWorkbenchAdapterOperationRequirements, ensureWorkbenchAdapterOutputDir, WORKBENCH_ADAPTER_RESULT_FILE, normalizeWorkbenchAdapterOperationRequest, normalizeWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, writeWorkbenchAdapterOperationResult, } from "@workbench-ai/workbench-protocol";
|
|
27
29
|
export { applyWorkbenchSubjectPatch, } from "./subject-patch.js";
|
|
28
30
|
export { createWorkbenchSandboxFileStore, createSandboxAdapterRequest, executionResultFromCompletedSandboxJob, materializeWorkbenchSandboxInput, readWorkbenchExecutionSpec, sanitizeWorkbenchExecutionJobForSandbox, } from "./sandbox-inputs.js";
|
|
@@ -31,8 +33,8 @@ export { createBaselineSubjectExecution, createBaselineSubjectJob, createWorkben
|
|
|
31
33
|
export { addCapacity, capacityFits, runWorkbenchExecutionDag, subtractCapacity, workbenchJobDependencies, workbenchJobHostCost, workbenchJobResources, } from "./execution-scheduler.js";
|
|
32
34
|
export { assertWorkbenchExecutionIsolation, collectWorkbenchExecutionIsolationIssues, validateWorkbenchExecutionOutputPayloads, } from "./execution-outputs.js";
|
|
33
35
|
export { collectSandboxAllocationScopeIssues, collectExecutionCapabilityScopeIssues, collectSandboxHandleScopeIssues, createWorkbenchSandboxAllocation, createWorkbenchSandboxExecutionMetadata, createWorkbenchExecutionCapability, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
34
|
-
export {
|
|
35
|
-
export { finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, } from "./execution-traces.js";
|
|
36
|
+
export { buildSubjectCaseExecutionRefs, buildWorkbenchExecutionEvidence, isWorkbenchExecutionActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-evidence.js";
|
|
37
|
+
export { buildWorkbenchTraceSessionsFromFiles, combineWorkbenchTraceSessions, finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, readWorkbenchExecutionTraceFiles, traceSessionLabel, } from "./execution-traces.js";
|
|
36
38
|
export { DOCKER_SANDBOX_BACKEND, assertSandboxHostHealthForProvider, createDockerSandboxBackendDescriptor, createDockerSandboxPlane, resolveWorkbenchSandboxProviderName, sandboxProviderAdmissionForResources, sandboxProviderDefaultMaxConcurrentJobs, sandboxProviderLeaseScope, sandboxHostHealthExpectationForProvider, } from "./sandbox-backends/index.js";
|
|
37
39
|
export const DEFAULT_ENVIRONMENT_VERSIONS = [
|
|
38
40
|
{
|
|
@@ -142,7 +144,7 @@ export const DEFAULT_ENVIRONMENTS = [
|
|
|
142
144
|
{
|
|
143
145
|
id: "env_libreoffice_agent",
|
|
144
146
|
name: "LibreOffice + Agent",
|
|
145
|
-
description: "Agent runtime with soffice and Python libraries for spreadsheet-heavy
|
|
147
|
+
description: "Agent runtime with soffice and Python libraries for spreadsheet-heavy evaluations.",
|
|
146
148
|
currentVersionId: "envv_libreoffice_agent",
|
|
147
149
|
builtIn: true,
|
|
148
150
|
createdAt: "2026-04-29T00:00:00.000Z",
|
|
@@ -278,30 +280,36 @@ function adapterProtocolCommandSpec(adapter, operation, manifests = []) {
|
|
|
278
280
|
return {
|
|
279
281
|
use: "command",
|
|
280
282
|
command: manifest ? workbenchAdapterOperationCommand(manifest, operation) : adapterCommandName(adapter.use),
|
|
283
|
+
executor: manifest ? workbenchAdapterOperationExecutor(manifest, operation) : "sandbox",
|
|
281
284
|
};
|
|
282
285
|
}
|
|
283
|
-
function
|
|
284
|
-
|
|
285
|
-
|
|
286
|
+
function protocolStepForExecution(execution, manifests) {
|
|
287
|
+
if (execution.purpose !== "improve") {
|
|
288
|
+
throw new Error(`Protocol execution step only supports improve executions, not ${execution.purpose}.`);
|
|
289
|
+
}
|
|
290
|
+
const operation = "optimizer.improve";
|
|
286
291
|
const command = adapterProtocolCommandSpec(execution.adapter, operation, manifests);
|
|
287
292
|
return {
|
|
288
|
-
kind:
|
|
293
|
+
kind: "optimizer",
|
|
289
294
|
label: execution.purpose,
|
|
290
295
|
operation,
|
|
296
|
+
executor: command.executor,
|
|
291
297
|
adapter: execution.adapter,
|
|
292
298
|
command: command.command,
|
|
293
299
|
};
|
|
294
300
|
}
|
|
295
|
-
function
|
|
301
|
+
function attemptStepsForExecution(execution, spec, manifests) {
|
|
296
302
|
void spec;
|
|
297
|
-
const
|
|
303
|
+
const command = adapterProtocolCommandSpec(execution.adapter, "engine.run", manifests);
|
|
304
|
+
const engineStep = {
|
|
298
305
|
kind: "engine",
|
|
299
306
|
label: "engine",
|
|
300
307
|
operation: "engine.run",
|
|
308
|
+
executor: command.executor,
|
|
301
309
|
adapter: execution.adapter,
|
|
302
|
-
command:
|
|
310
|
+
command: command.command,
|
|
303
311
|
};
|
|
304
|
-
return [
|
|
312
|
+
return [engineStep];
|
|
305
313
|
}
|
|
306
314
|
function adapterConfigRecord(adapter, manifests = []) {
|
|
307
315
|
const config = cloneJsonRecord(jsonRecord(adapter.with));
|
|
@@ -411,7 +419,10 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
411
419
|
.sort((left, right) => compareSampleOutputs(left.output, right.output));
|
|
412
420
|
const outputJobIds = new Set(outputs.flatMap(({ jobs }) => jobs.map((job) => job.id)));
|
|
413
421
|
const completedSampleKeys = new Set(outputs
|
|
414
|
-
.
|
|
422
|
+
.flatMap(({ jobs, output }) => [
|
|
423
|
+
evaluationSampleGroupKeyFromOutput(output),
|
|
424
|
+
...jobs.map(evaluationSampleGroupKeyFromJob),
|
|
425
|
+
])
|
|
415
426
|
.filter((key) => key !== null));
|
|
416
427
|
const errorSampleJobs = [
|
|
417
428
|
...subjectJobs.filter((job) => job.status === "failed"),
|
|
@@ -472,7 +483,7 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
472
483
|
meta,
|
|
473
484
|
};
|
|
474
485
|
subjects.push(record);
|
|
475
|
-
evaluations.push(
|
|
486
|
+
evaluations.push(createEvaluationScorecard({
|
|
476
487
|
runId: args.runId,
|
|
477
488
|
benchmarkFingerprint: args.benchmarkFingerprint,
|
|
478
489
|
createdAt: args.startedAt,
|
|
@@ -528,6 +539,8 @@ function materializedSubjectFingerprint(spec, files) {
|
|
|
528
539
|
hash.update("workbench-subject-v1\0");
|
|
529
540
|
hash.update("materialized\0runner\0");
|
|
530
541
|
hash.update(JSON.stringify(spec.run));
|
|
542
|
+
hash.update("prepare");
|
|
543
|
+
hash.update(JSON.stringify(spec.subject.prepare ?? null));
|
|
531
544
|
for (const file of filterSubjectSourceFiles(files).slice().sort((left, right) => left.path.localeCompare(right.path))) {
|
|
532
545
|
hash.update("\0file\0");
|
|
533
546
|
hash.update(file.path);
|
|
@@ -547,10 +560,10 @@ function materializedSubjectFiles(args) {
|
|
|
547
560
|
}
|
|
548
561
|
return [...byPath.values()].sort((left, right) => left.path.localeCompare(right.path));
|
|
549
562
|
}
|
|
550
|
-
function
|
|
563
|
+
function createEvaluationScorecard(args) {
|
|
551
564
|
const evaluation = args.evaluation;
|
|
552
565
|
return {
|
|
553
|
-
id:
|
|
566
|
+
id: evaluationScorecardId(args.runId, args.subject.id),
|
|
554
567
|
runId: args.runId,
|
|
555
568
|
benchmarkFingerprint: args.benchmarkFingerprint,
|
|
556
569
|
subjectFingerprint: args.subject.subjectFingerprint,
|
|
@@ -568,7 +581,7 @@ function createEvaluationResultRecord(args) {
|
|
|
568
581
|
evaluation,
|
|
569
582
|
};
|
|
570
583
|
}
|
|
571
|
-
function
|
|
584
|
+
export function evaluationScorecardId(runId, subjectId) {
|
|
572
585
|
const runPart = runId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
|
|
573
586
|
const subjectPart = subjectId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
|
|
574
587
|
return `eval_${runPart}_${subjectPart}`;
|
|
@@ -584,7 +597,7 @@ export function isWorkbenchInternalOutputPath(filePath) {
|
|
|
584
597
|
normalized === "sandbox-environment.json" ||
|
|
585
598
|
normalized === "sandbox_error.log" ||
|
|
586
599
|
normalized === "exit_code" ||
|
|
587
|
-
/^[a-
|
|
600
|
+
/^[a-z_-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
|
|
588
601
|
}
|
|
589
602
|
export function createSubjectRevisionTraceInputFiles(args) {
|
|
590
603
|
const files = [];
|
|
@@ -620,6 +633,23 @@ export function createSubjectRevisionTraceInputFiles(args) {
|
|
|
620
633
|
}, null, 2)}\n`));
|
|
621
634
|
return dedupeSurfaceFiles(files);
|
|
622
635
|
}
|
|
636
|
+
/**
 * Builds the trace input file that describes a subject's evaluation.
 *
 * Returns an empty list when the subject is missing or carries neither
 * `eval` nor `metrics`. Otherwise returns exactly one pretty-printed JSON
 * surface file, written to `args.path` or, by default, to
 * `base-subject/<subject id>/evaluation.json`.
 *
 * @param {{ subject?: object, path?: string }} args
 * @returns {Array} zero or one surface file
 */
export function createSubjectEvaluationTraceInputFiles(args) {
    const { subject } = args;
    const hasEvaluationData = Boolean(subject?.eval) || Boolean(subject?.metrics);
    if (!hasEvaluationData) {
        return [];
    }
    const targetPath = normalizeRelativePath(args.path ?? `base-subject/${subject.id}/evaluation.json`);
    const serialized = JSON.stringify({
        kind: "subject_evaluation",
        subjectId: subject.id,
        status: subject.status,
        metrics: subject.metrics ?? null,
        fileChanges: subject.fileChanges,
        eval: subject.eval ?? null,
        prompt: subject.prompt ?? null,
    }, null, 2);
    return [textSurfaceFile(targetPath, `${serialized}\n`)];
}
|
|
623
653
|
function isTerminalExecutionJob(job) {
|
|
624
654
|
return job.kind === "execute" && (job.status === "succeeded" ||
|
|
625
655
|
job.status === "failed" ||
|
|
@@ -866,16 +896,14 @@ export function createSubjectFilePreview(args) {
|
|
|
866
896
|
};
|
|
867
897
|
}
|
|
868
898
|
export function createCaseReview(args) {
|
|
869
|
-
const preferredSampleIndex =
|
|
870
|
-
const sampleMatchesCase = (sample) => sample.id === args.caseId
|
|
871
|
-
sample.id.startsWith(`${args.caseId}__`) ||
|
|
872
|
-
(sample.cases ?? []).some((entry) => entry.id === args.caseId || entry.id.startsWith(`${args.caseId}__`));
|
|
899
|
+
const preferredSampleIndex = uniqueExecutionSampleIndex(args.executions ?? []);
|
|
900
|
+
const sampleMatchesCase = (sample) => (sample.cases ?? []).some((entry) => entry.id === args.caseId);
|
|
873
901
|
const samples = args.subject.eval?.samples ?? [];
|
|
874
902
|
const sampleResult = samples.find((sample) => typeof preferredSampleIndex === "number" &&
|
|
875
903
|
sample.index === preferredSampleIndex &&
|
|
876
904
|
sampleMatchesCase(sample)) ?? samples.find(sampleMatchesCase);
|
|
877
|
-
const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId
|
|
878
|
-
if (!sampleResult && (args.
|
|
905
|
+
const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId);
|
|
906
|
+
if (!sampleResult && (args.executions?.length ?? 0) > 0) {
|
|
879
907
|
return {
|
|
880
908
|
subjectId: args.subject.id,
|
|
881
909
|
caseId: args.caseId,
|
|
@@ -884,7 +912,7 @@ export function createCaseReview(args) {
|
|
|
884
912
|
? { sampleIndex: preferredSampleIndex }
|
|
885
913
|
: {}),
|
|
886
914
|
metrics: {},
|
|
887
|
-
|
|
915
|
+
executions: args.executions ?? [],
|
|
888
916
|
criteria_results: [],
|
|
889
917
|
};
|
|
890
918
|
}
|
|
@@ -893,28 +921,21 @@ export function createCaseReview(args) {
|
|
|
893
921
|
}
|
|
894
922
|
const durationMs = typeof caseResult?.durationMs === "number"
|
|
895
923
|
? caseResult.durationMs
|
|
896
|
-
:
|
|
897
|
-
typeof sampleResult.durationMs === "number"
|
|
898
|
-
? sampleResult.durationMs
|
|
899
|
-
: !caseResult && typeof sampleResult.durationMs === "number"
|
|
900
|
-
? sampleResult.durationMs
|
|
901
|
-
: undefined;
|
|
902
|
-
const sampleStatus = sampleResult.status === "planned" ? undefined : sampleResult.status;
|
|
903
|
-
const status = caseResult?.status ?? sampleStatus;
|
|
924
|
+
: undefined;
|
|
904
925
|
return {
|
|
905
926
|
subjectId: args.subject.id,
|
|
906
|
-
caseId: caseResult?.id ??
|
|
927
|
+
caseId: caseResult?.id ?? args.caseId,
|
|
907
928
|
caseLabel: caseResult?.label ?? args.caseId,
|
|
908
929
|
sampleId: sampleResult.id,
|
|
909
930
|
sampleIndex: sampleResult.index,
|
|
910
|
-
...(status ? { status } : {}),
|
|
911
|
-
metrics: caseResult?.metrics ??
|
|
931
|
+
...(caseResult?.status ? { status: caseResult.status } : {}),
|
|
932
|
+
metrics: caseResult?.metrics ?? {},
|
|
912
933
|
...(typeof durationMs === "number" ? { durationMs } : {}),
|
|
913
934
|
...(caseResult?.source ? { source: caseResult.source } : {}),
|
|
914
|
-
...(
|
|
915
|
-
? { feedback: caseResult
|
|
935
|
+
...(caseResult?.feedback !== undefined
|
|
936
|
+
? { feedback: caseResult.feedback }
|
|
916
937
|
: {}),
|
|
917
|
-
|
|
938
|
+
executions: args.executions ?? [],
|
|
918
939
|
criteria_results: (caseResult?.criteria ?? []).map((criterion) => ({
|
|
919
940
|
criterion_id: criterion.criterion_id,
|
|
920
941
|
pass: criterion.pass,
|
|
@@ -924,9 +945,9 @@ export function createCaseReview(args) {
|
|
|
924
945
|
})),
|
|
925
946
|
};
|
|
926
947
|
}
|
|
927
|
-
function
|
|
928
|
-
const sampleIndices = new Set(
|
|
929
|
-
.map((
|
|
948
|
+
function uniqueExecutionSampleIndex(executions) {
|
|
949
|
+
const sampleIndices = new Set(executions
|
|
950
|
+
.map((execution) => execution.sampleIndex)
|
|
930
951
|
.filter((index) => typeof index === "number"));
|
|
931
952
|
if (sampleIndices.size !== 1) {
|
|
932
953
|
return null;
|
|
@@ -951,6 +972,7 @@ function parseAuthoredWorkbenchSourceSpec(source) {
|
|
|
951
972
|
name: resolved.subject.name,
|
|
952
973
|
description: resolved.subject.description,
|
|
953
974
|
files: { path: resolved.subject.files.path },
|
|
975
|
+
...(resolved.subject.prepare ? { prepare: { ...resolved.subject.prepare } } : {}),
|
|
954
976
|
run: runSpecFromInvocation(resolved.run),
|
|
955
977
|
},
|
|
956
978
|
...(resolved.optimizer
|
|
@@ -1101,11 +1123,18 @@ export async function executeWorkbenchExecutionJob(args, options) {
|
|
|
1101
1123
|
const runtimeArgs = adapterAuthProfiles.length > 0
|
|
1102
1124
|
? { ...args, adapterAuthProfiles }
|
|
1103
1125
|
: args;
|
|
1104
|
-
const
|
|
1126
|
+
const executionForRuntime = readWorkbenchExecutionSpec(runtimeArgs.job);
|
|
1127
|
+
const executor = workbenchExecutionExecutorForRuntimeInput(runtimeArgs);
|
|
1128
|
+
if (executor === "host") {
|
|
1129
|
+
return await withWorkbenchRuntimeControlServer(runtimeArgs, options, startedAt, async (adapterRuntimeEnv) => executeAdapterInCurrentRuntime({
|
|
1130
|
+
...runtimeArgs,
|
|
1131
|
+
adapterRuntimeEnv,
|
|
1132
|
+
}, executionForRuntime, startedAt, createWorkbenchExecutionCapability(executionForRuntime, { now: startedAt })));
|
|
1133
|
+
}
|
|
1105
1134
|
const fileStore = createWorkbenchSandboxFileStore(runtimeArgs);
|
|
1106
1135
|
const planeFactory = options.createSandboxPlaneForProvider ?? createSandboxBackendPlaneForProvider;
|
|
1107
1136
|
const plane = planeFactory(options.sandboxProvider, runtimeArgs, startedAt, fileStore);
|
|
1108
|
-
const validated = await executeValidatedSandboxExecution(plane,
|
|
1137
|
+
const validated = await executeValidatedSandboxExecution(plane, executionForRuntime, {
|
|
1109
1138
|
now: startedAt,
|
|
1110
1139
|
runnerId: resolveWorkbenchWorkerId([
|
|
1111
1140
|
process.env.WORKBENCH_WORKER_ID,
|
|
@@ -1121,6 +1150,215 @@ export async function executeWorkbenchExecutionJob(args, options) {
|
|
|
1121
1150
|
return failWorkbenchRunJob(args.job, startedAt, error);
|
|
1122
1151
|
}
|
|
1123
1152
|
}
|
|
1153
|
+
/**
 * Decides where the adapter operation for a runtime input should execute.
 *
 * Inputs that carry a `runtimeControlOperation` always run in the sandbox.
 * Otherwise the executor comes from the adapter manifest whose `id` matches
 * the execution's adapter; when the purpose maps to no adapter operation or
 * no manifest matches, the result is "sandbox".
 *
 * @param {object} args runtime input (reads `runtimeControlOperation`, `job`, `adapterManifests`)
 * @returns {string} "sandbox" or the executor reported by the manifest
 */
export function workbenchExecutionExecutorForRuntimeInput(args) {
    const fallbackExecutor = "sandbox";
    if (args.runtimeControlOperation) {
        return fallbackExecutor;
    }
    const spec = readWorkbenchExecutionSpec(args.job);
    const adapterOperation = adapterOperationForExecutionPurpose(spec.purpose);
    if (!adapterOperation) {
        return fallbackExecutor;
    }
    const matchingManifest = args.adapterManifests?.find((candidate) => candidate.id === spec.adapter.use);
    if (!matchingManifest) {
        return fallbackExecutor;
    }
    return workbenchAdapterOperationExecutor(matchingManifest, adapterOperation);
}
|
|
1165
|
+
/**
 * Maps an execution purpose to the adapter operation that implements it.
 *
 * @param {string} purpose execution purpose
 * @returns {string|null} "optimizer.improve" for "improve", "engine.run" for
 *   "attempt", and null for every other purpose
 */
function adapterOperationForExecutionPurpose(purpose) {
    switch (purpose) {
        case "improve":
            return "optimizer.improve";
        case "attempt":
            return "engine.run";
        default:
            return null;
    }
}
|
|
1174
|
+
// Upper bound, in bytes (512 MiB), on a runtime-control HTTP request body;
// readRuntimeControlBody rejects any body that grows past this size.
const RUNTIME_CONTROL_MAX_BODY_BYTES = 512 * 1024 * 1024;
|
|
1175
|
+
/**
 * Runs `run` while a loopback-only runtime-control HTTP server is listening.
 *
 * The server binds to 127.0.0.1 on an ephemeral port and is protected by a
 * freshly generated bearer token. `run` receives an env record holding the
 * server URL and token, and its result is returned. The server is always shut
 * down afterwards, including when start-up fails after the socket is bound.
 *
 * @param {object} args runtime input forwarded to the request handler
 * @param {object} options execution options forwarded to the request handler
 * @param {*} startedAt execution start marker forwarded to the request handler
 * @param {(env: Record<string, string>) => Promise<*>} run callback executed while the server is up
 * @returns {Promise<*>} whatever `run` resolves to
 * @throws {Error} when the server cannot listen or exposes no TCP address
 */
async function withWorkbenchRuntimeControlServer(args, options, startedAt, run) {
    const { createServer } = await importNodeModule(nodeBuiltin("http"));
    const token = randomBytes(24).toString("base64url");
    const server = createServer((request, response) => {
        // The handler reports its own failures over HTTP, so the promise is intentionally not awaited.
        void handleWorkbenchRuntimeControlHttpRequest({
            request,
            response,
            token,
            args,
            options,
            startedAt,
        });
    });
    const closeServer = async () => {
        if (!server.listening) {
            return;
        }
        await new Promise((resolve) => {
            server.close(() => resolve());
            // close() only stops accepting connections; on Node versions where it does not
            // also drop idle keep-alive sockets, they would delay the close callback.
            server.closeIdleConnections?.();
        });
    };
    try {
        const url = await new Promise((resolve, reject) => {
            server.once("error", reject);
            server.listen(0, "127.0.0.1", () => {
                server.off("error", reject);
                const address = server.address();
                if (!address || typeof address === "string") {
                    reject(new Error("Workbench runtime-control server did not expose a local TCP address."));
                    return;
                }
                resolve(`http://127.0.0.1:${address.port}`);
            });
        });
        return await run({
            [WORKBENCH_RUNTIME_CONTROL_URL_ENV]: url,
            [WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV]: token,
        });
    }
    finally {
        // Covers both normal completion and a start-up failure after the port was bound.
        await closeServer();
    }
}
|
|
1212
|
+
/**
 * Handles one HTTP request sent to the runtime-control server.
 *
 * Only `POST /v1/operation-sequence` with the expected bearer token is
 * accepted. The JSON body is normalized and the operation sequence executed
 * in a sandbox. Status codes: 404 unknown endpoint, 401 bad token, 400 body
 * that is not valid JSON or not a valid operation sequence, 500 any other
 * failure, 200 success.
 *
 * This function is invoked without being awaited, so it must never reject:
 * every failure is turned into an HTTP response or a dropped connection.
 *
 * @param {{ request: object, response: object, token: string, args: object, options: object, startedAt: * }} args
 * @returns {Promise<void>}
 */
async function handleWorkbenchRuntimeControlHttpRequest(args) {
    const { request, response } = args;
    try {
        if (request.method !== "POST" || request.url !== "/v1/operation-sequence") {
            writeRuntimeControlJson(response, 404, { error: "Unknown Workbench runtime-control endpoint." });
            return;
        }
        if (request.headers.authorization !== `Bearer ${args.token}`) {
            writeRuntimeControlJson(response, 401, { error: "Workbench runtime-control token is invalid." });
            return;
        }
        const rawBody = await readRuntimeControlBody(request);
        let controlRequest;
        try {
            controlRequest = normalizeRuntimeControlOperationSequenceRequest(JSON.parse(rawBody));
        }
        catch (error) {
            // A malformed payload is the caller's mistake: answer 400 with the message only,
            // not 500 with a server-side stack trace.
            writeRuntimeControlJson(response, 400, {
                error: error instanceof Error ? error.message : String(error),
            });
            return;
        }
        const result = await executeRuntimeControlOperationSequenceInSandbox(args.args, args.options, args.startedAt, controlRequest);
        writeRuntimeControlJson(response, 200, result);
    }
    catch (error) {
        if (response.headersSent) {
            // Part of a reply is already on the wire; writing headers again would throw and
            // surface as an unhandled rejection, so drop the connection instead.
            response.destroy();
            return;
        }
        writeRuntimeControlJson(response, 500, {
            error: error instanceof Error ? error.stack ?? error.message : String(error),
        });
    }
}
|
|
1234
|
+
/**
 * Sends `payload` as a pretty-printed JSON response and ends it.
 *
 * @param {object} response HTTP response (uses `statusCode`, `setHeader`, `end`)
 * @param {number} statusCode HTTP status to report
 * @param {*} payload value serialized with two-space indentation plus a trailing newline
 * @returns {void}
 */
function writeRuntimeControlJson(response, statusCode, payload) {
    response.statusCode = statusCode;
    response.setHeader("content-type", "application/json");
    // Serialize only after the status and header are set, so a serialization failure
    // leaves the response in the same state as before.
    const serialized = JSON.stringify(payload, null, 2);
    response.end(`${serialized}\n`);
}
|
|
1239
|
+
/**
 * Reads a request body into a UTF-8 string, enforcing a size limit.
 *
 * Chunks are concatenated before decoding, so multi-byte characters split
 * across chunks decode correctly. Once the accumulated size exceeds
 * `maxBytes` the promise rejects, the request is destroyed, and the chunks
 * buffered so far are released.
 *
 * @param {object} request readable request emitting "data", "end" and "error"
 * @param {number} [maxBytes=RUNTIME_CONTROL_MAX_BODY_BYTES] largest accepted body size in bytes
 * @returns {Promise<string>} the decoded body
 * @throws {Error} (as a rejection) when the body is too large or the stream errors
 */
function readRuntimeControlBody(request, maxBytes = RUNTIME_CONTROL_MAX_BODY_BYTES) {
    return new Promise((resolve, reject) => {
        let chunks = [];
        let size = 0;
        let overflowed = false;
        request.on("data", (chunk) => {
            if (overflowed) {
                return;
            }
            size += chunk.length;
            if (size > maxBytes) {
                overflowed = true;
                // Drop what was buffered so an oversized upload does not stay in memory.
                chunks = [];
                reject(new Error("Workbench runtime-control request body is too large."));
                request.destroy();
                return;
            }
            chunks.push(chunk);
        });
        request.on("error", reject);
        request.on("end", () => resolve(Buffer.concat(chunks).toString("utf8")));
    });
}
|
|
1256
|
+
/**
 * Validates and normalizes a parsed runtime-control operation-sequence request.
 *
 * The result holds, in this order: `inputs` (only when provided),
 * `operations` (each normalized), and the optional boolean flags `prepare`
 * and `collectWorkspace` (kept only when they are actual booleans).
 *
 * @param {*} value parsed JSON request body
 * @returns {object} normalized request
 * @throws {Error} when `value` is not a plain object or has no operations
 */
function normalizeRuntimeControlOperationSequenceRequest(value) {
    const isRecord = value !== null && typeof value === "object" && !Array.isArray(value);
    if (!isRecord) {
        throw new Error("Workbench runtime-control operation sequence request must be an object.");
    }
    const { operations, prepare, collectWorkspace } = value;
    if (!Array.isArray(operations) || operations.length === 0) {
        throw new Error("Workbench runtime-control operation sequence requires at least one operation.");
    }
    // Inputs are normalized before operations so input errors are reported first.
    const inputs = normalizeRuntimeControlInputs(value.inputs);
    const normalized = {};
    if (inputs) {
        normalized.inputs = inputs;
    }
    normalized.operations = operations.map((entry, position) => normalizeRuntimeControlOperation(entry, `operations[${position}]`));
    if (typeof prepare === "boolean") {
        normalized.prepare = prepare;
    }
    if (typeof collectWorkspace === "boolean") {
        normalized.collectWorkspace = collectWorkspace;
    }
    return normalized;
}
|
|
1272
|
+
/**
 * Validates and normalizes the optional `inputs` section of a runtime-control request.
 *
 * Only the known slots (`subject`, `case`, `enginePrivate`, `traces`,
 * `workspace`, `output`) are copied, and only when present as own
 * properties; each is normalized as a list of surface snapshot files.
 *
 * @param {*} value the raw `inputs` value
 * @returns {object|undefined} normalized inputs, or undefined when `value` is undefined
 * @throws {Error} when `value` is defined but not a plain object
 */
function normalizeRuntimeControlInputs(value) {
    if (value === undefined) {
        return undefined;
    }
    const isRecord = value !== null && typeof value === "object" && !Array.isArray(value);
    if (!isRecord) {
        throw new Error("Workbench runtime-control inputs must be an object.");
    }
    // Order matters: it fixes both the key order of the result and which slot's error surfaces first.
    const slots = ["subject", "case", "enginePrivate", "traces", "workspace", "output"];
    const normalized = {};
    for (const slot of slots) {
        if (hasOwn(value, slot)) {
            normalized[slot] = normalizeRuntimeControlFiles(value[slot], `inputs.${slot}`);
        }
    }
    return normalized;
}
|
|
1301
|
+
/**
 * Validates a list of surface snapshot files and normalizes each file's path.
 *
 * @param {*} value raw file list; undefined is treated as an empty list
 * @param {string} label request location used in error messages (e.g. "inputs.subject")
 * @returns {Array<object>} copies of the entries with `path` normalized to a relative path
 * @throws {Error} when `value` is not an array or an entry is not a surface snapshot file
 */
function normalizeRuntimeControlFiles(value, label) {
    if (value === undefined) {
        return [];
    }
    if (!Array.isArray(value)) {
        throw new Error(`Workbench runtime-control ${label} must be an array.`);
    }
    const toNormalizedFile = (entry, index) => {
        if (isSurfaceSnapshotFile(entry)) {
            return { ...entry, path: normalizeRelativePath(entry.path) };
        }
        throw new Error(`Workbench runtime-control ${label}[${index}] must be a surface snapshot file.`);
    };
    return value.map(toNormalizedFile);
}
|
|
1315
|
+
/**
 * Reports whether `key` is an own (non-inherited) property of `value`.
 *
 * Uses the built-in `hasOwnProperty`, so objects that shadow or lack that
 * method are handled correctly.
 *
 * @param {object} value object to inspect
 * @param {string} key property name
 * @returns {boolean}
 */
function hasOwn(value, key) {
    const ownPropertyCheck = Object.prototype.hasOwnProperty;
    return Reflect.apply(ownPropertyCheck, value, [key]);
}
|
|
1318
|
+
/**
 * Validates and normalizes one entry of a runtime-control operation sequence.
 *
 * The result contains the `operation` name and an `invocation` made of
 * `use`, `with` (defaults to `{}`), and optionally `auth` and `command`.
 * A `label` is kept only when it is a non-blank string.
 *
 * @param {*} value raw operation entry
 * @param {string} label request location used in error messages (e.g. "operations[0]")
 * @returns {object} normalized operation
 * @throws {Error} when the entry, its operation name, or its invocation is invalid
 */
function normalizeRuntimeControlOperation(value, label) {
    const isRecord = (candidate) => Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
    if (!isRecord(value)) {
        throw new Error(`Workbench runtime-control ${label} must be an object.`);
    }
    const { operation, invocation } = value;
    const allowedOperations = ["engine.resolve", "engine.run", "subject.run", "optimizer.improve"];
    if (!allowedOperations.includes(operation)) {
        throw new Error(`Workbench runtime-control ${label}.operation is invalid.`);
    }
    if (!isRecord(invocation)) {
        throw new Error(`Workbench runtime-control ${label}.invocation must be an object.`);
    }
    if (typeof invocation.use !== "string" || invocation.use.length === 0) {
        throw new Error(`Workbench runtime-control ${label}.invocation.use is required.`);
    }
    // null doubles as the "invalid" marker, so an explicit `with: null` is rejected as well.
    let withConfig = null;
    if (invocation.with === undefined) {
        withConfig = {};
    }
    else if (isJsonPayload(invocation.with)) {
        withConfig = invocation.with;
    }
    if (withConfig === null) {
        throw new Error(`Workbench runtime-control ${label}.invocation.with must be JSON.`);
    }
    const hasAuth = invocation.auth !== undefined;
    if (hasAuth && !isJsonPayload(invocation.auth)) {
        throw new Error(`Workbench runtime-control ${label}.invocation.auth must be JSON.`);
    }
    const normalizedInvocation = { use: invocation.use, with: withConfig };
    if (hasAuth) {
        normalizedInvocation.auth = invocation.auth;
    }
    // Blank strings are dropped; non-blank values are kept untrimmed.
    if (typeof invocation.command === "string" && invocation.command.trim()) {
        normalizedInvocation.command = invocation.command;
    }
    const normalized = { operation, invocation: normalizedInvocation };
    if (typeof value.label === "string" && value.label.trim()) {
        normalized.label = value.label;
    }
    return normalized;
}
|
|
1124
1362
|
async function explicitAdapterAuthProfilesForExecution(execution, args, loadLocalAdapterProfiles) {
|
|
1125
1363
|
const required = requiredAdapterAuthTargetsForExecution(execution, args);
|
|
1126
1364
|
if (required.length === 0) {
|
|
@@ -1155,7 +1393,7 @@ function adapterAuthTargetKey(target) {
|
|
|
1155
1393
|
/**
 * Public accessor for a job's execution purpose.
 *
 * @param {object} job - Job handed unchanged to `readWorkbenchExecutionPurpose`.
 * @returns {*} Whatever `readWorkbenchExecutionPurpose` returns for the job.
 */
export function workbenchExecutionPurpose(job) {
    const purpose = readWorkbenchExecutionPurpose(job);
    return purpose;
}
|
|
1158
|
-
export async function
|
|
1396
|
+
export async function executeAdapterInCurrentRuntime(args, execution, startedAt, capability) {
|
|
1159
1397
|
const eventPublisher = createWorkbenchExecutionEventPublisher({
|
|
1160
1398
|
projectId: args.job.projectId,
|
|
1161
1399
|
runId: args.job.runId,
|
|
@@ -1174,10 +1412,10 @@ export async function executeAdapterInCurrentSandboxRuntime(args, execution, sta
|
|
|
1174
1412
|
};
|
|
1175
1413
|
try {
|
|
1176
1414
|
if (execution.purpose === "improve") {
|
|
1177
|
-
return await
|
|
1415
|
+
return await executeSubjectRevisionExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
|
|
1178
1416
|
}
|
|
1179
1417
|
if (execution.purpose === "attempt") {
|
|
1180
|
-
return await
|
|
1418
|
+
return await executeAttemptExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
|
|
1181
1419
|
}
|
|
1182
1420
|
throw new Error(`Unsupported execution purpose ${execution.purpose}.`);
|
|
1183
1421
|
}
|
|
@@ -1274,7 +1512,7 @@ function adapterAuthRequest(bundles, root, currentAdapterId) {
|
|
|
1274
1512
|
}
|
|
1275
1513
|
return entries;
|
|
1276
1514
|
}
|
|
1277
|
-
function
|
|
1515
|
+
function adapterAuthRequestForStep(args, adapterId) {
|
|
1278
1516
|
const profiles = (args.adapterAuthProfiles ?? [])
|
|
1279
1517
|
.map((bundle) => sanitizeWorkbenchAdapterAuthBundle(bundle));
|
|
1280
1518
|
if (profiles.length === 0) {
|
|
@@ -1295,12 +1533,19 @@ function adapterAuthProfilesForExecution(execution, args) {
|
|
|
1295
1533
|
}
|
|
1296
1534
|
/**
 * Lists the normalized adapter auth targets an execution requires.
 *
 * @param {object} execution - Execution spec forwarded to `adapterInvocationsForExecution`.
 * @param {object} args - Runtime arguments; `adapterManifests` defaults to `[]`.
 * @returns {Array} Each requirement reported by
 *   `collectWorkbenchAdapterAuthRequirements`, passed through
 *   `normalizeWorkbenchAdapterAuthTarget`.
 */
function requiredAdapterAuthTargetsForExecution(execution, args) {
    const manifests = args.adapterManifests ?? [];
    const invocations = adapterInvocationsForExecution(execution, args);
    const requirements = collectWorkbenchAdapterAuthRequirements(invocations, manifests);
    return requirements.map((target) => normalizeWorkbenchAdapterAuthTarget(target));
}
|
|
1301
|
-
function adapterInvocationsForExecution(execution,
|
|
1539
|
+
/**
 * Collects the adapter invocations an execution will perform.
 *
 * @param {object} execution - Execution spec (`purpose`, `adapter`).
 * @param {object} args - Runtime arguments; reads `runtimeControlOperation` and `spec.run`.
 * @returns {object[]} With a runtime-control request: the de-duplicated
 *   invocations of its operations. For an "attempt": the de-duplicated pair of
 *   the execution adapter and `args.spec.run`. Otherwise just the execution adapter.
 */
function adapterInvocationsForExecution(execution, args) {
    const sequence = args.runtimeControlOperation;
    if (sequence) {
        const invocations = sequence.operations.map(({ invocation }) => {
            const entry = { use: invocation.use, with: invocation.with ?? {} };
            if (invocation.auth !== undefined) {
                entry.auth = invocation.auth;
            }
            return entry;
        });
        return uniqueAdapterInvocations(invocations);
    }
    return execution.purpose === "attempt"
        ? uniqueAdapterInvocations([execution.adapter, args.spec.run])
        : [execution.adapter];
}
|
|
@@ -1341,7 +1586,7 @@ function completedJobFromSandboxResult(fallbackJob, startedAt, result) {
|
|
|
1341
1586
|
}
|
|
1342
1587
|
return attachSandboxMetadataToJob(failWorkbenchRunJob(fallbackJob, result.startedAt || startedAt, result.error ?? `Sandbox execution ${result.status}.`, result.finishedAt), asRuntimeRecord(result.metadata).sandbox);
|
|
1343
1588
|
}
|
|
1344
|
-
async function
|
|
1589
|
+
async function executeSubjectRevisionExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
|
|
1345
1590
|
const { workload, result } = await runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher);
|
|
1346
1591
|
if (result.error || (result.exitCode ?? 0) !== 0) {
|
|
1347
1592
|
return failWorkbenchRunJob(args.job, startedAt, result.error ?? `Adapter ${execution.adapter.use} exited with status ${result.exitCode}.`, result.finishedAt, result);
|
|
@@ -1382,7 +1627,7 @@ async function executeSubjectRevisionExecutionInSandbox(args, execution, started
|
|
|
1382
1627
|
},
|
|
1383
1628
|
};
|
|
1384
1629
|
}
|
|
1385
|
-
async function
|
|
1630
|
+
async function executeAttemptExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
|
|
1386
1631
|
const workload = createWorkbenchRunWorkload({
|
|
1387
1632
|
job: args.job,
|
|
1388
1633
|
spec: args.spec,
|
|
@@ -1391,7 +1636,7 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
|
|
|
1391
1636
|
engineCases: args.engineCases,
|
|
1392
1637
|
traceFiles: args.traceFiles,
|
|
1393
1638
|
});
|
|
1394
|
-
const workloadResult = await
|
|
1639
|
+
const workloadResult = await runHostedCommandExecutionSteps(args, workload, attemptStepsForExecution(execution, args.spec, args.adapterManifests), startedAt, {
|
|
1395
1640
|
capability,
|
|
1396
1641
|
eventPublisher,
|
|
1397
1642
|
});
|
|
@@ -1405,10 +1650,7 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
|
|
|
1405
1650
|
return failWorkbenchRunJob(args.job, startedAt, "Attempt engine must return a workbench-result result with a finite numeric score.", workloadResult.finishedAt, workloadResult);
|
|
1406
1651
|
}
|
|
1407
1652
|
const finishedAt = workloadResult.finishedAt ?? new Date().toISOString();
|
|
1408
|
-
const usage =
|
|
1409
|
-
workloadResult.usage,
|
|
1410
|
-
engineResult.usage,
|
|
1411
|
-
]);
|
|
1653
|
+
const usage = attemptUsageSummary(workloadResult.usage, engineResult.usage);
|
|
1412
1654
|
const sample = evaluateSample({
|
|
1413
1655
|
subjectId: workload.subjectId,
|
|
1414
1656
|
files: workloadResult.files,
|
|
@@ -1453,6 +1695,282 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
|
|
|
1453
1695
|
},
|
|
1454
1696
|
};
|
|
1455
1697
|
}
|
|
1698
|
+
/**
 * Runs the runtime-control operation sequence carried on `args` in the current
 * runtime and returns the finished job record.
 *
 * @param {object} args - Runtime input; must carry `runtimeControlOperation`.
 * @param {object} execution - Not read by this function.
 * @param {string} startedAt - Start timestamp copied onto the returned job.
 * @param {object} capability - Not read by this function.
 * @returns {Promise<object>} Copy of `args.job` with `status`, `attempt`
 *   (at least 1), timestamps, `output`, and an `error` when the run failed.
 * @throws {Error} When `args.runtimeControlOperation` is missing.
 */
export async function executeRuntimeControlOperationSequenceInCurrentRuntime(args, execution, startedAt, capability) {
    void execution;
    void capability;
    const sequence = args.runtimeControlOperation;
    if (!sequence) {
        throw new Error("Runtime-control operation sequence is missing from the sandbox request.");
    }
    const childExecution = readWorkbenchExecutionSpec(args.job);
    const workload = createWorkbenchRunWorkload({
        job: args.job,
        spec: args.spec,
        baseFiles: args.baseFiles,
        engineResolveFiles: args.engineResolveFiles,
        engineCases: args.engineCases,
        traceFiles: args.traceFiles,
    });
    // Work on a copy of the arguments that no longer carries adapterRuntimeEnv.
    const runtimeArgs = { ...args };
    delete runtimeArgs.adapterRuntimeEnv;
    const adapterAuth = await materializeSandboxAdapterAuth(runtimeArgs, childExecution);
    let result;
    try {
        const hostArgs = { ...runtimeArgs };
        if (adapterAuth.root) {
            hostArgs.adapterAuthRoot = adapterAuth.root;
        }
        if (Object.keys(adapterAuth.env).length > 0) {
            hostArgs.adapterAuthEnv = adapterAuth.env;
        }
        const steps = sequence.operations.map((operation, index) => runtimeControlStepForOperation(operation, index, args.adapterManifests));
        const stepOptions = {
            runSubjectPrepare: sequence.prepare ?? false,
            workspaceFiles: sequence.inputs?.workspace ?? [],
            outputFiles: sequence.inputs?.output ?? [],
            collectWorkspace: sequence.collectWorkspace ?? false,
        };
        result = await runHostedCommandExecutionSteps(hostArgs, workload, steps, startedAt, stepOptions);
    }
    finally {
        // Cleanup is best-effort: a rejected cleanup is ignored.
        if (adapterAuth.cleanup) {
            await adapterAuth.cleanup().catch(() => undefined);
        }
    }
    const finishedAt = result.finishedAt ?? new Date().toISOString();
    const failed = Boolean(result.error) || (result.exitCode ?? 0) !== 0;
    const completedJob = {
        ...args.job,
        status: failed ? "failed" : "succeeded",
        attempt: Math.max(1, args.job.attempt),
        startedAt,
        finishedAt,
        updatedAt: finishedAt,
    };
    if (failed) {
        completedJob.error = result.error ?? `Runtime-control operation sequence exited with status ${result.exitCode}.`;
    }
    completedJob.output = runtimeControlJobOutput(result, !failed);
    return completedJob;
}
|
|
1749
|
+
/**
 * Runs a runtime-control request inside a freshly created sandbox and returns
 * the normalized runtime-control result.
 *
 * Flow: derive child arguments from the parent job, build the sandbox plane,
 * materialize inputs, prepare the environment, validate the allocation and
 * capability scopes, create the sandbox, validate its handle, execute, and
 * always destroy the sandbox afterwards.
 *
 * @param {object} args - Parent runtime arguments.
 * @param {object} options - Reads `sandboxProvider` and the optional
 *   `createSandboxPlaneForProvider` factory override.
 * @param {string} startedAt - Timestamp used as `now` for scope checks and options.
 * @param {object} request - Runtime-control request (operations, inputs, ...).
 * @returns {Promise<object>} Result of `runtimeControlResultFromCompletedJob`.
 * @throws {Error} When a scope validation fails or the plane rejects.
 */
async function executeRuntimeControlOperationSequenceInSandbox(args, options, startedAt, request) {
    const childArgs = createRuntimeControlSandboxInput(args, request);
    const execution = readWorkbenchExecutionSpec(childArgs.job);
    const fileStore = createWorkbenchSandboxFileStore(childArgs);
    const planeFactory = options.createSandboxPlaneForProvider ?? createSandboxBackendPlaneForProvider;
    const plane = planeFactory(options.sandboxProvider, childArgs, startedAt, fileStore);
    assertSandboxBackendSupportsNetworkPolicy(plane.backend, execution);
    const sandboxOptions = {
        now: startedAt,
        runnerId: resolveWorkbenchWorkerId([
            process.env.WORKBENCH_WORKER_ID,
            process.env.EC2_INSTANCE_ID,
            os.hostname(),
            process.env.HOSTNAME,
        ], "local-runner"),
        fileStore,
    };
    const inputs = await fileStore.materializeInputs(execution);
    // Planes without a prepareEnvironment hook get a descriptor built from the spec.
    const environment = plane.prepareEnvironment
        ? await plane.prepareEnvironment(execution, sandboxOptions)
        : {
            backend: plane.backend.name,
            kind: execution.sandbox.kind,
            ref: execution.sandbox.ref,
        };
    const allocation = createWorkbenchSandboxAllocation(execution, {
        backend: plane.backend.name,
        runnerId: sandboxOptions.runnerId,
        now: startedAt,
    });
    const capability = createWorkbenchExecutionCapability(execution, { now: startedAt });
    assertRuntimeControlScope("Runtime-control sandbox allocation", collectSandboxAllocationScopeIssues(allocation, execution, { now: startedAt }));
    assertRuntimeControlScope("Runtime-control execution capability", collectExecutionCapabilityScopeIssues(capability, execution, { now: startedAt }));
    const sandbox = await plane.createSandbox({
        execution,
        environment,
        allocation,
        capability,
        inputs,
    }, sandboxOptions);
    let result;
    try {
        // The handle check runs inside the try so that a sandbox which was
        // created but fails validation is still destroyed instead of leaking.
        assertRuntimeControlScope("Runtime-control sandbox handle", collectSandboxHandleScopeIssues(sandbox, allocation, execution));
        result = await plane.exec({
            execution,
            environment,
            sandbox,
            allocation,
            capability,
            inputs,
        }, sandboxOptions);
    }
    finally {
        await plane.destroySandbox(sandbox, sandboxOptions);
    }
    const completedJob = completedJobFromSandboxResult(childArgs.job, startedAt, result);
    return runtimeControlResultFromCompletedJob(completedJob);
}
|
|
1807
|
+
/**
 * Derives the arguments for a child sandbox run that executes a runtime-control
 * request on behalf of a parent job.
 *
 * The child job and execution ids are the parent ids suffixed with
 * `:runtime:<nonce>`. File sets come from `request.inputs` when the matching
 * key is present and otherwise from the parent workload. The child execution
 * adopts the invocation of the request's last operation as its adapter, falling
 * back to the parent adapter when there is none.
 *
 * @param {object} args - Parent runtime arguments (not mutated).
 * @param {object} request - Runtime-control request with `operations` and optional `inputs`.
 * @returns {object} Copy of `args` with the child `job`, file sets, a single
 *   engine case and `runtimeControlOperation`, minus `adapterRuntimeEnv` and
 *   `workspaceRoot`.
 */
function createRuntimeControlSandboxInput(args, request) {
    const parentExecution = readWorkbenchExecutionSpec(args.job);
    const parentWorkload = createWorkbenchRunWorkload({
        job: args.job,
        spec: args.spec,
        baseFiles: args.baseFiles,
        engineResolveFiles: args.engineResolveFiles,
        engineCases: args.engineCases,
        traceFiles: args.traceFiles,
    });
    const idSuffix = `:runtime:${runtimeControlNonce()}`;
    const parentInput = asRuntimeRecord(args.job.input);

    // Request-supplied inputs win; otherwise reuse what the parent workload staged.
    const parentPublic = parentWorkload.engineCase ? engineCasePublicFiles(parentWorkload.engineCase) : [];
    const publicFiles = runtimeControlInputFiles(request.inputs, "case", parentPublic);
    const parentPrivate = parentWorkload.engineCase ? engineCasePrivateFiles(parentWorkload.engineCase) : [];
    const privateFiles = runtimeControlInputFiles(request.inputs, "enginePrivate", parentPrivate);
    const subjectFiles = runtimeControlInputFiles(request.inputs, "subject", parentWorkload.subjectFiles);
    const traceFiles = runtimeControlInputFiles(request.inputs, "traces", parentWorkload.traceFiles);

    const lastInvocation = request.operations[request.operations.length - 1]?.invocation;
    let childAdapter = parentExecution.adapter;
    if (lastInvocation) {
        childAdapter = { use: lastInvocation.use, with: lastInvocation.with ?? {} };
        if (lastInvocation.auth !== undefined) {
            childAdapter.auth = lastInvocation.auth;
        }
    }
    const childExecution = {
        ...parentExecution,
        id: `${parentExecution.id}${idSuffix}`,
        outputs: [],
        adapter: childAdapter,
        metadata: {
            ...asRuntimeRecord(parentExecution.metadata),
            runtimeControl: true,
            caseId: parentWorkload.caseId,
        },
    };
    const engineCase = {
        id: parentWorkload.caseId,
        case: parentWorkload.engineCaseSpec ?? { version: 3, prompt: parentWorkload.prompt },
        files: { public: publicFiles, private: privateFiles },
    };
    const childArgs = {
        ...args,
        job: {
            ...args.job,
            id: `${args.job.id}${idSuffix}`,
            input: {
                ...parentInput,
                execution: childExecution,
                caseId: parentWorkload.caseId,
            },
        },
        baseFiles: subjectFiles,
        engineResolveFiles: [...publicFiles, ...privateFiles],
        engineCases: [engineCase],
        traceFiles,
        runtimeControlOperation: request,
    };
    for (const dropped of ["adapterRuntimeEnv", "workspaceRoot"]) {
        delete childArgs[dropped];
    }
    return childArgs;
}
|
|
1876
|
+
/**
 * Picks the file list for one runtime-control input slot.
 *
 * @param {object|undefined} inputs - Request inputs keyed by slot name.
 * @param {string} key - Slot to read (for example "case" or "traces").
 * @param {object[]|undefined} fallback - Files used when `inputs` does not
 *   define `key` as an own property.
 * @returns {object[]} A `cloneSurfaceFiles` copy of the selected list. A
 *   present-but-nullish slot and a nullish fallback both yield an empty list.
 */
function runtimeControlInputFiles(inputs, key, fallback) {
    // Own-property check via the file's shared helper: an explicitly supplied
    // slot overrides the fallback even when its value is null.
    const selected = inputs && hasOwn(inputs, key) ? inputs[key] : fallback;
    // A missing fallback list is treated as empty instead of crashing in `.map`.
    return cloneSurfaceFiles(selected ?? []);
}
|
|
1882
|
+
/**
 * Converts one runtime-control operation into an executable step descriptor.
 *
 * @param {object} operation - Normalized operation (`operation`, `invocation`, optional `label`).
 * @param {number} index - Zero-based position, used for the generated label.
 * @param {Array} [manifests=[]] - Adapter manifests forwarded to `adapterProtocolCommandSpec`.
 * @returns {object} `{ kind, label, operation, executor: "sandbox", adapter, command }`.
 *   `command` is the trimmed invocation command when it has content, otherwise
 *   the `command` of `adapterProtocolCommandSpec(...)`.
 */
function runtimeControlStepForOperation(operation, index, manifests = []) {
    const { invocation } = operation;
    // Builds a fresh adapter object on every call so the one handed to
    // adapterProtocolCommandSpec is never shared with the returned step.
    const adapterInvocation = () => {
        const adapter = { use: invocation.use, with: invocation.with ?? {} };
        if (invocation.auth !== undefined) {
            adapter.auth = invocation.auth;
        }
        return adapter;
    };
    let command = invocation.command?.trim();
    if (!command) {
        command = adapterProtocolCommandSpec(adapterInvocation(), operation.operation, manifests).command;
    }
    let kind = "engine";
    if (operation.operation === "subject.run") {
        kind = "subject";
    }
    else if (operation.operation === "optimizer.improve") {
        kind = "optimizer";
    }
    return {
        kind,
        label: operation.label ?? `${operation.operation.replace(".", "_")}_${index + 1}`,
        operation: operation.operation,
        executor: "sandbox",
        adapter: adapterInvocation(),
        command,
    };
}
|
|
1906
|
+
/**
 * Converts a completed job into a normalized runtime-control result.
 *
 * @param {object} job - Completed job; reads `output`, `status` and `error`.
 * @returns {object} `normalizeRuntimeControlResultOutput` applied to the job
 *   output, with success taken from `status === "succeeded"` and `job.error`
 *   as the fallback error.
 */
function runtimeControlResultFromCompletedJob(job) {
    const output = asRuntimeRecord(job.output);
    const succeeded = job.status === "succeeded";
    return normalizeRuntimeControlResultOutput(output, succeeded, job.error);
}
|
|
1909
|
+
/**
 * Builds the normalized job output for a finished runtime-control run.
 *
 * @param {object} result - Raw run result.
 * @param {boolean} ok - Whether the run is considered successful.
 * @returns {object} `normalizeRuntimeControlResultOutput` applied to a payload
 *   that always carries `ok`, `files` and `fileChanges`, carries
 *   `operationResults`, `workspaceFiles`, `result`, `usage` and `error` only
 *   when truthy, and carries `summary` / `feedback` whenever they are defined.
 */
function runtimeControlJobOutput(result, ok) {
    const payload = {
        ok,
        files: result.files,
        fileChanges: result.fileChanges,
    };
    for (const key of ["operationResults", "workspaceFiles", "result", "usage"]) {
        if (result[key]) {
            payload[key] = result[key];
        }
    }
    for (const key of ["summary", "feedback"]) {
        if (result[key] !== undefined) {
            payload[key] = result[key];
        }
    }
    if (result.error) {
        payload.error = result.error;
    }
    return normalizeRuntimeControlResultOutput(payload, ok, result.error);
}
|
|
1923
|
+
/**
 * Sanitizes a raw runtime-control output record into the public result shape.
 *
 * @param {object} output - Raw output record.
 * @param {boolean} ok - Outcome reported by the caller; the result is only ok
 *   when this is truthy and `output.ok` is not `false`.
 * @param {string} [fallbackError] - Error used when `output.error` is not a string.
 * @returns {object} `{ ok, files, fileChanges, operationResults }` plus
 *   `workspaceFiles`, `result`, `usage`, `summary`, `feedback` and `error`
 *   when the corresponding value passes its check. `fileChanges` falls back to
 *   the paths of `files` when `output.fileChanges` is not an array.
 */
function normalizeRuntimeControlResultOutput(output, ok, fallbackError) {
    const isPlainObject = (candidate) => Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);

    const files = Array.isArray(output.files) ? output.files.filter(isSurfaceSnapshotFile) : [];
    let workspaceFiles;
    if (Array.isArray(output.workspaceFiles)) {
        workspaceFiles = output.workspaceFiles.filter(isSurfaceSnapshotFile);
    }
    const operationResults = Array.isArray(output.operationResults)
        ? output.operationResults.filter(isWorkbenchAdapterOperationResult)
        : [];

    const normalized = { ok: ok && output.ok !== false, files };
    normalized.fileChanges = Array.isArray(output.fileChanges)
        ? output.fileChanges.filter((entry) => typeof entry === "string")
        : files.map((file) => file.path);
    normalized.operationResults = operationResults;
    if (workspaceFiles) {
        normalized.workspaceFiles = workspaceFiles;
    }
    if (isPlainObject(output.result)) {
        normalized.result = output.result;
    }
    if (isPlainObject(output.usage)) {
        normalized.usage = output.usage;
    }
    if (typeof output.summary === "string") {
        normalized.summary = output.summary;
    }
    if (output.feedback !== undefined && isJsonPayload(output.feedback)) {
        normalized.feedback = output.feedback;
    }
    // The output's own error string wins over the caller-provided fallback.
    if (typeof output.error === "string") {
        normalized.error = output.error;
    }
    else if (fallbackError) {
        normalized.error = fallbackError;
    }
    return normalized;
}
|
|
1952
|
+
/**
 * Type guard for adapter operation results.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `value` is a non-array object whose `protocol`
 *   is "workbench.adapter-result.v1" and whose `operation` is one of
 *   "engine.resolve", "engine.run", "subject.run" or "optimizer.improve".
 */
function isWorkbenchAdapterOperationResult(value) {
    const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isRecord) {
        return false;
    }
    if (value.protocol !== "workbench.adapter-result.v1") {
        return false;
    }
    switch (value.operation) {
        case "engine.resolve":
        case "engine.run":
        case "subject.run":
        case "optimizer.improve":
            return true;
        default:
            return false;
    }
}
|
|
1963
|
+
/**
 * Shallow-copies a list of surface files, normalizing each `path`.
 *
 * @param {object[]} files - Files to copy; the input objects are not modified.
 * @returns {object[]} New objects with the same properties and `path` replaced
 *   by `normalizeRelativePath(file.path)`.
 */
function cloneSurfaceFiles(files) {
    const withNormalizedPath = (file) => {
        const copy = { ...file };
        copy.path = normalizeRelativePath(file.path);
        return copy;
    };
    return files.map((file) => withNormalizedPath(file));
}
|
|
1966
|
+
/**
 * Generates a random suffix for runtime-control child ids.
 *
 * @returns {string} 12 lowercase hexadecimal characters (6 random bytes).
 */
function runtimeControlNonce() {
    const entropy = randomBytes(6);
    return entropy.toString("hex");
}
|
|
1969
|
+
/**
 * Throws when a scope check reported any issues.
 *
 * @param {string} label - Name of the thing being validated, used as the message prefix.
 * @param {string[]} issues - Issues found; an empty list means the check passed.
 * @throws {Error} With `label` followed by one issue per line when `issues` is non-empty.
 */
function assertRuntimeControlScope(label, issues) {
    if (issues.length <= 0) {
        return;
    }
    const details = issues.join("\n");
    throw new Error(`${label} failed validation:\n${details}`);
}
|
|
1456
1974
|
async function runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher) {
|
|
1457
1975
|
const workload = createWorkbenchRunWorkload({
|
|
1458
1976
|
job: args.job,
|
|
@@ -1462,13 +1980,13 @@ async function runHostedProtocolExecutionResult(args, execution, startedAt, capa
|
|
|
1462
1980
|
engineCases: args.engineCases,
|
|
1463
1981
|
traceFiles: args.traceFiles,
|
|
1464
1982
|
});
|
|
1465
|
-
const result = await
|
|
1983
|
+
const result = await runHostedCommandExecutionSteps(args, workload, [protocolStepForExecution(execution, args.adapterManifests)], startedAt, {
|
|
1466
1984
|
capability,
|
|
1467
1985
|
eventPublisher,
|
|
1468
1986
|
});
|
|
1469
1987
|
return { workload, result };
|
|
1470
1988
|
}
|
|
1471
|
-
async function
|
|
1989
|
+
async function runHostedCommandExecutionSteps(args, workload, steps, startedAt, options = {}) {
|
|
1472
1990
|
const [{ execFile }, fs, os, path, { promisify }] = await Promise.all([
|
|
1473
1991
|
importNodeModule(nodeBuiltin("child_process")),
|
|
1474
1992
|
importNodeModule(nodeBuiltin("fs/promises")),
|
|
@@ -1489,9 +2007,22 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
|
|
|
1489
2007
|
const workspace = await createRuntimeWorkspaceRoot(args, fs, os, path, "workbench-execution-sandbox-");
|
|
1490
2008
|
try {
|
|
1491
2009
|
await stageWorkbenchRunWorkload(workspace.root, workload);
|
|
2010
|
+
if (options.workspaceFiles && options.workspaceFiles.length > 0) {
|
|
2011
|
+
await stageInitialWorkspaceFiles(workspace.root, options.workspaceFiles);
|
|
2012
|
+
}
|
|
2013
|
+
if (options.outputFiles && options.outputFiles.length > 0) {
|
|
2014
|
+
await writeSurfaceFiles(outputDir(workspace.root), options.outputFiles);
|
|
2015
|
+
}
|
|
2016
|
+
const execution = readWorkbenchExecutionSpec(workload.job);
|
|
2017
|
+
const hostAdapterIds = new Set(steps.flatMap((step) => step.executor === "host"
|
|
2018
|
+
? [step.adapter?.use ?? execution.adapter.use]
|
|
2019
|
+
: []));
|
|
2020
|
+
const hostAdapterRoots = hostAdapterIds.size > 0
|
|
2021
|
+
? await materializeHostAdapterRoots(workspace.root, args.adapterFiles ?? [], hostAdapterIds)
|
|
2022
|
+
: new Map();
|
|
1492
2023
|
let exitCode = 0;
|
|
1493
2024
|
let runtimeError;
|
|
1494
|
-
const
|
|
2025
|
+
const operationResults = [];
|
|
1495
2026
|
try {
|
|
1496
2027
|
if (!environmentVersion) {
|
|
1497
2028
|
throw new Error("environment is required for adapter command executions.");
|
|
@@ -1503,49 +2034,64 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
|
|
|
1503
2034
|
network: environmentVersion.spec.network,
|
|
1504
2035
|
}, null, 2)}\n`);
|
|
1505
2036
|
}
|
|
1506
|
-
const
|
|
2037
|
+
const stepTimeoutMs = environmentVersion
|
|
1507
2038
|
? environmentVersionTimeoutMs(environmentVersion)
|
|
1508
2039
|
: 5 * 60 * 1000;
|
|
1509
|
-
const
|
|
1510
|
-
|
|
1511
|
-
await
|
|
1512
|
-
|
|
1513
|
-
|
|
2040
|
+
const shouldRunSubjectPrepare = options.runSubjectPrepare ?? steps.some((step) => step.executor === "sandbox");
|
|
2041
|
+
if (shouldRunSubjectPrepare) {
|
|
2042
|
+
await runSubjectPrepareCommand({
|
|
2043
|
+
root: workspace.root,
|
|
2044
|
+
workload,
|
|
2045
|
+
execution,
|
|
2046
|
+
execFileAsync,
|
|
2047
|
+
timeoutMs: stepTimeoutMs,
|
|
2048
|
+
eventPublisher: options.eventPublisher,
|
|
2049
|
+
});
|
|
2050
|
+
}
|
|
2051
|
+
let enginePrivateStaged = false;
|
|
2052
|
+
for (const step of steps) {
|
|
2053
|
+
if (step.kind === "engine" && !enginePrivateStaged) {
|
|
2054
|
+
await stageWorkbenchEnginePrivateFiles(workspace.root, workload);
|
|
2055
|
+
enginePrivateStaged = true;
|
|
1514
2056
|
}
|
|
1515
|
-
|
|
1516
|
-
const
|
|
1517
|
-
|
|
1518
|
-
|
|
2057
|
+
await resetHostedWorkloadStepOutput(workspace.root);
|
|
2058
|
+
const adapterRequestPath = await writeWorkbenchAdapterRequest(workspace.root, workload, execution, step, adapterAuthRequestForStep(args, step.adapter?.use ?? execution.adapter.use), args.adapterManifests);
|
|
2059
|
+
const stepRole = stepEventRole(step);
|
|
2060
|
+
await publishCommandStepEvent(options.eventPublisher, {
|
|
2061
|
+
step: step.label,
|
|
1519
2062
|
status: "started",
|
|
1520
|
-
...(
|
|
2063
|
+
...(stepRole ? { role: stepRole } : {}),
|
|
1521
2064
|
});
|
|
1522
2065
|
try {
|
|
1523
|
-
if (!
|
|
1524
|
-
throw new Error(`Adapter
|
|
2066
|
+
if (!step.command) {
|
|
2067
|
+
throw new Error(`Adapter step ${step.label} is missing a command.`);
|
|
1525
2068
|
}
|
|
1526
|
-
const
|
|
2069
|
+
const adapterRoot = step.executor === "host"
|
|
2070
|
+
? hostAdapterRoots.get(step.adapter?.use ?? execution.adapter.use)
|
|
2071
|
+
: undefined;
|
|
2072
|
+
const command = createHostedWorkloadShellCommand(workspace.root, step.command, step.label, step.okExitCodes);
|
|
1527
2073
|
await execFileAsync("sh", ["-c", command], {
|
|
1528
|
-
cwd: workspace.root,
|
|
1529
|
-
env:
|
|
2074
|
+
cwd: adapterRoot ?? workspace.root,
|
|
2075
|
+
env: createHostedWorkloadAdapterEnv(workspace.root, adapterRequestPath, args.adapterAuthEnv, adapterRoot ? { adapterRoot } : undefined, args.adapterRuntimeEnv),
|
|
1530
2076
|
maxBuffer: 10 * 1024 * 1024,
|
|
1531
|
-
timeout:
|
|
2077
|
+
timeout: stepTimeoutMs,
|
|
1532
2078
|
});
|
|
1533
|
-
const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root),
|
|
1534
|
-
assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${
|
|
1535
|
-
|
|
1536
|
-
await
|
|
1537
|
-
|
|
2079
|
+
const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), step.operation);
|
|
2080
|
+
assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${step.adapter?.use ?? execution.adapter.use} ${step.operation}`);
|
|
2081
|
+
operationResults.push(operationResult);
|
|
2082
|
+
await publishCommandStepEvent(options.eventPublisher, {
|
|
2083
|
+
step: step.label,
|
|
1538
2084
|
status: "succeeded",
|
|
1539
|
-
...(
|
|
2085
|
+
...(stepRole ? { role: stepRole } : {}),
|
|
1540
2086
|
});
|
|
1541
2087
|
}
|
|
1542
2088
|
catch (error) {
|
|
1543
|
-
await
|
|
1544
|
-
|
|
2089
|
+
await publishCommandStepEvent(options.eventPublisher, {
|
|
2090
|
+
step: step.label,
|
|
1545
2091
|
status: "failed",
|
|
1546
2092
|
exitCode: readExitCode(error),
|
|
1547
2093
|
error: error instanceof Error ? error.message : String(error),
|
|
1548
|
-
...(
|
|
2094
|
+
...(stepRole ? { role: stepRole } : {}),
|
|
1549
2095
|
});
|
|
1550
2096
|
throw error;
|
|
1551
2097
|
}
|
|
@@ -1569,16 +2115,56 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
|
|
|
1569
2115
|
startedAt,
|
|
1570
2116
|
});
|
|
1571
2117
|
}
|
|
1572
|
-
|
|
2118
|
+
const result = await readWorkbenchRunWorkloadResult(workspace.root, workload, {
|
|
1573
2119
|
exitCode,
|
|
1574
2120
|
startedAt,
|
|
1575
|
-
|
|
2121
|
+
operationResults,
|
|
1576
2122
|
});
|
|
2123
|
+
if (options.collectWorkspace) {
|
|
2124
|
+
result.workspaceFiles = await readMutableWorkspaceSnapshotFiles(workspace.root);
|
|
2125
|
+
}
|
|
2126
|
+
return result;
|
|
1577
2127
|
}
|
|
1578
2128
|
finally {
|
|
1579
2129
|
await workspace.cleanup();
|
|
1580
2130
|
}
|
|
1581
2131
|
}
|
|
2132
|
+
/**
 * Runs the subject's optional prepare command in the workspace root and
 * publishes "subject_prepare" step events around it.
 *
 * @param {object} args
 * @param {string} args.root - Workspace root used as the command's cwd.
 * @param {object} args.workload - Workload; the command is read from
 *   `workload.spec.subject.prepare.command`.
 * @param {object} args.execution - Execution spec; `purpose === "improve"`
 *   selects the "optimizer" event role, anything else "runner".
 * @param {Function} args.execFileAsync - Promise-returning `execFile`.
 * @param {number} args.timeoutMs - Timeout passed to `execFileAsync`.
 * @param {object} [args.eventPublisher] - Forwarded to `publishCommandStepEvent`.
 * @returns {Promise<void>} Resolves immediately when no prepare command is configured.
 * @throws {Error} "Subject prepare command failed: ..." with the original
 *   failure attached as `cause`.
 */
async function runSubjectPrepareCommand(args) {
    const command = args.workload.spec.subject.prepare?.command;
    if (!command) {
        return;
    }
    const role = args.execution.purpose === "improve" ? "optimizer" : "runner";
    // Every event shares the step name and role; only the status fields vary.
    const publish = (event) => publishCommandStepEvent(args.eventPublisher, {
        step: "subject_prepare",
        ...event,
        role,
    });
    await publish({ status: "started" });
    try {
        const shellCommand = createHostedWorkloadShellCommand(args.root, command, "subject_prepare");
        await args.execFileAsync("sh", ["-c", shellCommand], {
            cwd: args.root,
            env: createHostedWorkloadPrepareEnv(args.root),
            maxBuffer: 10 * 1024 * 1024,
            timeout: args.timeoutMs,
        });
        await publish({ status: "succeeded" });
    }
    catch (error) {
        const message = error instanceof Error ? error.message : String(error);
        await publish({
            status: "failed",
            exitCode: readExitCode(error),
            error: message,
        });
        // Keep the original error (stack, exit code, stderr) reachable via `cause`.
        throw new Error(`Subject prepare command failed: ${message}`, { cause: error });
    }
}
|
|
1582
2168
|
async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
|
|
1583
2169
|
if (args.workspaceRoot) {
|
|
1584
2170
|
await fs.mkdir(args.workspaceRoot, { recursive: true });
|
|
@@ -1614,19 +2200,22 @@ async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
|
|
|
1614
2200
|
},
|
|
1615
2201
|
};
|
|
1616
2202
|
}
|
|
1617
|
-
function
|
|
1618
|
-
if (
|
|
2203
|
+
/**
 * Maps a step's `kind` to the role reported on its step events.
 *
 * @param {object} step - Step descriptor; only `kind` is read.
 * @returns {"optimizer"|"runner"|"engine"|undefined} "optimizer" for optimizer
 *   steps, "runner" for subject steps, "engine" for engine steps, and
 *   `undefined` for any other kind.
 */
function stepEventRole(step) {
    switch (step.kind) {
        case "optimizer":
            return "optimizer";
        case "subject":
            return "runner";
        case "engine":
            return "engine";
        default:
            return undefined;
    }
}
|
|
1629
2215
|
function adapterOperationUsageSummary(result) {
|
|
2216
|
+
if (hasExplicitUsageRole(result.usage)) {
|
|
2217
|
+
return completeUsageSummary(result.usage);
|
|
2218
|
+
}
|
|
1630
2219
|
if (result.operation === "optimizer.improve") {
|
|
1631
2220
|
return assignUsageRole("optimizer", result.usage);
|
|
1632
2221
|
}
|
|
@@ -1638,11 +2227,16 @@ function adapterOperationUsageSummary(result) {
|
|
|
1638
2227
|
}
|
|
1639
2228
|
return result.usage;
|
|
1640
2229
|
}
|
|
1641
|
-
function
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
2230
|
+
/**
 * Combines workload usage with the engine result's usage for an attempt.
 *
 * @param {object|undefined} workloadUsage - Usage reported by the workload run.
 * @param {object|undefined} resultUsage - Usage reported on the engine result.
 * @returns {*} `mergeUsageSummaries` of the completed workload usage and, only
 *   when that summary has no `engine` entry, `resultUsage` assigned to the
 *   "engine" role.
 */
function attemptUsageSummary(workloadUsage, resultUsage) {
    const summaries = [completeUsageSummary(workloadUsage)];
    if (summaries[0]?.engine) {
        // The workload already carries engine usage; skip the result-level usage.
        summaries.push(undefined);
    }
    else {
        summaries.push(assignUsageRole("engine", resultUsage));
    }
    return mergeUsageSummaries(summaries);
}
|
|
2237
|
+
/**
 * Tells whether a usage summary already attributes usage to a role.
 *
 * @param {object|undefined} usage - Raw usage, passed through `completeUsageSummary`.
 * @returns {boolean} `true` when the completed summary has a truthy
 *   `optimizer`, `runner` or `engine` entry.
 */
function hasExplicitUsageRole(usage) {
    const normalized = completeUsageSummary(usage);
    return ["optimizer", "runner", "engine"].some((role) => Boolean(normalized?.[role]));
}
|
|
1647
2241
|
function createSubjectPatchFromResult(result, spec) {
|
|
1648
2242
|
if (result.subjectPatch) {
|
|
@@ -1720,47 +2314,103 @@ export async function stageWorkbenchRunWorkload(root, workload) {
|
|
|
1720
2314
|
fs
|
|
1721
2315
|
.rm(runtimePrivateDir(root), { recursive: true, force: true })
|
|
1722
2316
|
.catch(() => undefined),
|
|
1723
|
-
fs
|
|
1724
|
-
.rm(runtimeLogsDir(root), { recursive: true, force: true })
|
|
1725
|
-
.catch(() => undefined),
|
|
1726
2317
|
]);
|
|
1727
2318
|
await fs.mkdir(inputDir(root), { recursive: true });
|
|
1728
2319
|
await fs.mkdir(outputDir(root), { recursive: true });
|
|
1729
2320
|
if (purpose === "attempt") {
|
|
1730
|
-
assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
|
|
1731
2321
|
await fs.mkdir(subjectDir(root), { recursive: true });
|
|
1732
2322
|
await fs.mkdir(caseDir(root), { recursive: true });
|
|
1733
|
-
await fs.mkdir(runtimeLogsAgentDir(root), { recursive: true });
|
|
1734
|
-
await fs.mkdir(runtimeLogsVerifierDir(root), { recursive: true });
|
|
1735
2323
|
const engineCase = requireWorkloadEngineCase(workload, "Attempt staging");
|
|
1736
2324
|
await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
|
|
1737
|
-
await writeSurfaceFiles(caseDir(root),
|
|
1738
|
-
await writeSurfaceFiles(root, workload.subjectFiles);
|
|
2325
|
+
await writeSurfaceFiles(caseDir(root), engineCasePublicFiles(engineCase));
|
|
1739
2326
|
return;
|
|
1740
2327
|
}
|
|
1741
2328
|
if (purpose === "improve") {
|
|
1742
|
-
assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
|
|
1743
2329
|
await fs.mkdir(subjectDir(root), { recursive: true });
|
|
1744
2330
|
await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
|
|
1745
|
-
await writeSurfaceFiles(root, workload.subjectFiles);
|
|
1746
2331
|
await fs.mkdir(tracesDir(root), { recursive: true });
|
|
1747
2332
|
await writeSurfaceFiles(tracesDir(root), workload.traceFiles);
|
|
1748
2333
|
}
|
|
1749
2334
|
}
|
|
1750
|
-
async function
|
|
2335
|
+
async function stageWorkbenchEnginePrivateFiles(root, workload) {
|
|
2336
|
+
if (readWorkloadExecutionPurpose(workload) !== "attempt") {
|
|
2337
|
+
return;
|
|
2338
|
+
}
|
|
1751
2339
|
const fs = await importNodeModule(nodeBuiltin("fs/promises"));
|
|
1752
|
-
const engineCase = requireWorkloadEngineCase(workload, "Attempt scoring");
|
|
1753
|
-
await Promise.all([
|
|
1754
|
-
fs
|
|
1755
|
-
.rm(runtimeEnginePrivateDir(root), { recursive: true, force: true })
|
|
1756
|
-
.catch(() => undefined),
|
|
1757
|
-
fs
|
|
1758
|
-
.rm(runtimeLogsVerifierDir(root), { recursive: true, force: true })
|
|
1759
|
-
.catch(() => undefined),
|
|
1760
|
-
]);
|
|
1761
2340
|
await fs.mkdir(runtimeEnginePrivateDir(root), { recursive: true });
|
|
1762
|
-
await
|
|
1763
|
-
|
|
2341
|
+
await writeSurfaceFiles(runtimeEnginePrivateDir(root), engineCasePrivateFiles(requireWorkloadEngineCase(workload, "Engine-private staging")));
|
|
2342
|
+
}
|
|
2343
|
+
async function stageInitialWorkspaceFiles(root, files) {
|
|
2344
|
+
await writeSurfaceFiles(root, files.filter((file) => isMutableWorkspaceSnapshotPath(file.path)));
|
|
2345
|
+
}
|
|
2346
|
+
async function readMutableWorkspaceSnapshotFiles(root) {
|
|
2347
|
+
return (await readSurfaceFiles(root))
|
|
2348
|
+
.filter((file) => isMutableWorkspaceSnapshotPath(file.path))
|
|
2349
|
+
.sort((left, right) => left.path.localeCompare(right.path));
|
|
2350
|
+
}
|
|
2351
|
+
function isMutableWorkspaceSnapshotPath(filePath) {
|
|
2352
|
+
const normalized = normalizeRelativePath(filePath);
|
|
2353
|
+
return Boolean(normalized &&
|
|
2354
|
+
!normalized.startsWith("../") &&
|
|
2355
|
+
normalized !== "input" &&
|
|
2356
|
+
!normalized.startsWith("input/") &&
|
|
2357
|
+
normalized !== "private" &&
|
|
2358
|
+
!normalized.startsWith("private/") &&
|
|
2359
|
+
normalized !== "output" &&
|
|
2360
|
+
!normalized.startsWith("output/") &&
|
|
2361
|
+
normalized !== ".workbench" &&
|
|
2362
|
+
!normalized.startsWith(".workbench/"));
|
|
2363
|
+
}
|
|
2364
|
+
async function materializeHostAdapterRoots(root, adapterFiles, adapterIds) {
|
|
2365
|
+
if (adapterFiles.length === 0 || adapterIds.size === 0) {
|
|
2366
|
+
return new Map();
|
|
2367
|
+
}
|
|
2368
|
+
const fs = await importNodeModule(nodeBuiltin("fs/promises"));
|
|
2369
|
+
const path = await importNodeModule(nodeBuiltin("path"));
|
|
2370
|
+
const sourceRoots = hostAdapterSourceRoots(adapterFiles, adapterIds);
|
|
2371
|
+
const roots = new Map();
|
|
2372
|
+
for (const [adapterId, sourceRoot] of sourceRoots) {
|
|
2373
|
+
const targetRoot = path.join(root, ".workbench", "adapters", adapterId);
|
|
2374
|
+
const files = adapterFiles.flatMap((file) => {
|
|
2375
|
+
const relativePath = adapterFilePathWithinRoot(file.path, sourceRoot);
|
|
2376
|
+
return relativePath === null
|
|
2377
|
+
? []
|
|
2378
|
+
: [{ ...file, path: relativePath }];
|
|
2379
|
+
});
|
|
2380
|
+
await fs.rm(targetRoot, { recursive: true, force: true }).catch(() => undefined);
|
|
2381
|
+
await fs.mkdir(targetRoot, { recursive: true });
|
|
2382
|
+
await writeSurfaceFiles(targetRoot, files);
|
|
2383
|
+
roots.set(adapterId, await fs.realpath(targetRoot));
|
|
2384
|
+
}
|
|
2385
|
+
return roots;
|
|
2386
|
+
}
|
|
2387
|
+
function hostAdapterSourceRoots(adapterFiles, adapterIds) {
|
|
2388
|
+
const roots = new Map();
|
|
2389
|
+
for (const file of adapterFiles) {
|
|
2390
|
+
const normalized = normalizeRelativePath(file.path);
|
|
2391
|
+
if (!normalized.endsWith("workbench.adapter.yaml")) {
|
|
2392
|
+
continue;
|
|
2393
|
+
}
|
|
2394
|
+
const manifest = parseWorkbenchAdapterManifest(file.content);
|
|
2395
|
+
if (!adapterIds.has(manifest.id)) {
|
|
2396
|
+
continue;
|
|
2397
|
+
}
|
|
2398
|
+
const sourceRoot = normalized === "workbench.adapter.yaml"
|
|
2399
|
+
? ""
|
|
2400
|
+
: normalized.slice(0, -"workbench.adapter.yaml".length).replace(/\/+$/u, "");
|
|
2401
|
+
roots.set(manifest.id, sourceRoot);
|
|
2402
|
+
}
|
|
2403
|
+
return roots;
|
|
2404
|
+
}
|
|
2405
|
+
function adapterFilePathWithinRoot(filePath, sourceRoot) {
|
|
2406
|
+
const normalized = normalizeRelativePath(filePath);
|
|
2407
|
+
if (!sourceRoot) {
|
|
2408
|
+
return normalized;
|
|
2409
|
+
}
|
|
2410
|
+
if (!normalized.startsWith(`${sourceRoot}/`)) {
|
|
2411
|
+
return null;
|
|
2412
|
+
}
|
|
2413
|
+
return normalized.slice(sourceRoot.length + 1);
|
|
1764
2414
|
}
|
|
1765
2415
|
async function readHostedRunFailureResult(root, workload, options) {
|
|
1766
2416
|
const traceFiles = await readRuntimeTraceFiles(root, workload);
|
|
@@ -1788,16 +2438,16 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
|
|
|
1788
2438
|
const primaryOperation = purpose === "improve"
|
|
1789
2439
|
? "optimizer.improve"
|
|
1790
2440
|
: "engine.run";
|
|
1791
|
-
const primaryResult = [...(options.
|
|
2441
|
+
const primaryResult = [...(options.operationResults ?? [])]
|
|
1792
2442
|
.reverse()
|
|
1793
2443
|
.find((result) => result.operation === primaryOperation);
|
|
1794
2444
|
const resultPayload = jsonRecord(primaryResult?.value);
|
|
1795
2445
|
const usage = mergeUsageSummaries([
|
|
1796
2446
|
options.usage,
|
|
1797
|
-
...(options.
|
|
2447
|
+
...(options.operationResults ?? []).map(adapterOperationUsageSummary),
|
|
1798
2448
|
]);
|
|
1799
|
-
const metrics =
|
|
1800
|
-
const cases =
|
|
2449
|
+
const metrics = normalizeResultMetrics(resultPayload.metrics);
|
|
2450
|
+
const cases = normalizeResultCases(resultPayload.cases);
|
|
1801
2451
|
const includeResultScoring = purpose === "attempt";
|
|
1802
2452
|
const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
|
|
1803
2453
|
const subjectPatch = purpose === "improve" ? primaryResult?.value : undefined;
|
|
@@ -1809,6 +2459,7 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
|
|
|
1809
2459
|
return {
|
|
1810
2460
|
files,
|
|
1811
2461
|
fileChanges: declaredChanges,
|
|
2462
|
+
...(options.operationResults ? { operationResults: [...options.operationResults] } : {}),
|
|
1812
2463
|
...(subjectPatch ? { subjectPatch } : {}),
|
|
1813
2464
|
...(engineResult ? { result: engineResult } : {}),
|
|
1814
2465
|
...(includeResultScoring && metrics ? { metrics } : {}),
|
|
@@ -1835,10 +2486,10 @@ async function readRuntimeTraceFiles(root, workload) {
|
|
|
1835
2486
|
const path = await importNodeModule(nodeBuiltin("path"));
|
|
1836
2487
|
const traceRoot = path.join(outputDir(root), ".workbench", "traces", workload.job.id);
|
|
1837
2488
|
const purpose = readWorkloadExecutionPurpose(workload);
|
|
1838
|
-
const outputTraceRoot =
|
|
2489
|
+
const outputTraceRoot = workbenchTraceExecutionDirectory({
|
|
1839
2490
|
sequence: 1,
|
|
1840
2491
|
runId: workload.job.runId,
|
|
1841
|
-
|
|
2492
|
+
purpose,
|
|
1842
2493
|
});
|
|
1843
2494
|
return (await readSurfaceFiles(traceRoot)).map((file) => ({
|
|
1844
2495
|
...file,
|
|
@@ -1868,13 +2519,13 @@ function createHostedWorkloadShellCommand(root, command, prefix = "", okExitCode
|
|
|
1868
2519
|
'exit "$status"',
|
|
1869
2520
|
].join("; ");
|
|
1870
2521
|
}
|
|
1871
|
-
async function
|
|
2522
|
+
async function resetHostedWorkloadStepOutput(root) {
|
|
1872
2523
|
const fs = await importNodeModule(nodeBuiltin("fs/promises"));
|
|
1873
2524
|
await fs
|
|
1874
2525
|
.rm(workbenchAdapterOperationResultPath(outputDir(root)), { force: true })
|
|
1875
2526
|
.catch(() => undefined);
|
|
1876
2527
|
}
|
|
1877
|
-
async function writeWorkbenchAdapterRequest(root, workload, execution,
|
|
2528
|
+
async function writeWorkbenchAdapterRequest(root, workload, execution, step, auth, manifests) {
|
|
1878
2529
|
const [fs, path] = await Promise.all([
|
|
1879
2530
|
importNodeModule(nodeBuiltin("fs/promises")),
|
|
1880
2531
|
importNodeModule(nodeBuiltin("path")),
|
|
@@ -1882,13 +2533,13 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
|
|
|
1882
2533
|
const requestPath = path.join(root, ".workbench", "request.json");
|
|
1883
2534
|
await fs.mkdir(path.dirname(requestPath), { recursive: true });
|
|
1884
2535
|
const casePrompt = workload.engineCaseSpec?.prompt;
|
|
1885
|
-
const adapter =
|
|
2536
|
+
const adapter = step.adapter ?? execution.adapter;
|
|
1886
2537
|
const subjectCommand = adapterProtocolCommandSpec(workload.spec.run, "subject.run", manifests).command;
|
|
1887
2538
|
await fs.writeFile(requestPath, `${JSON.stringify({
|
|
1888
2539
|
protocol: "workbench.adapter.v3",
|
|
1889
2540
|
id: execution.id,
|
|
1890
2541
|
jobId: workload.job.id,
|
|
1891
|
-
operation:
|
|
2542
|
+
operation: step.operation,
|
|
1892
2543
|
invocation: {
|
|
1893
2544
|
use: adapter.use,
|
|
1894
2545
|
with: adapterConfigRecord(adapter, manifests),
|
|
@@ -1903,6 +2554,7 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
|
|
|
1903
2554
|
subject: {
|
|
1904
2555
|
id: workload.subjectId,
|
|
1905
2556
|
path: workload.spec.subject.files.path,
|
|
2557
|
+
...(workload.spec.subject.prepare ? { prepare: { ...workload.spec.subject.prepare } } : {}),
|
|
1906
2558
|
run: {
|
|
1907
2559
|
...workload.spec.run,
|
|
1908
2560
|
command: subjectCommand,
|
|
@@ -1923,14 +2575,12 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
|
|
|
1923
2575
|
},
|
|
1924
2576
|
paths: {
|
|
1925
2577
|
workspace: root,
|
|
1926
|
-
cwd: root,
|
|
1927
2578
|
output: outputDir(root),
|
|
1928
2579
|
result: workbenchAdapterOperationResultPath(outputDir(root)),
|
|
1929
2580
|
subject: subjectDir(root),
|
|
1930
2581
|
...(workload.engineCaseSpec ? { case: caseDir(root) } : {}),
|
|
1931
2582
|
traces: tracesDir(root),
|
|
1932
|
-
...(
|
|
1933
|
-
logs: runtimeLogsDir(root),
|
|
2583
|
+
...(step.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
|
|
1934
2584
|
},
|
|
1935
2585
|
}, null, 2)}\n`);
|
|
1936
2586
|
return requestPath;
|
|
@@ -1945,7 +2595,29 @@ function requireOptimizerEdits(spec) {
|
|
|
1945
2595
|
}
|
|
1946
2596
|
return edits;
|
|
1947
2597
|
}
|
|
1948
|
-
function
|
|
2598
|
+
function createHostedWorkloadAdapterEnv(root, adapterRequestPath, adapterEnv = {}, options = {}, runtimeEnv = {}) {
|
|
2599
|
+
const env = createHostedWorkloadBaseEnv();
|
|
2600
|
+
env.WORKBENCH_ADAPTER_REQUEST = adapterRequestPath;
|
|
2601
|
+
env.WORKBENCH_OUTPUT = outputDir(root);
|
|
2602
|
+
env.WORKBENCH_RESULT = workbenchAdapterOperationResultPath(outputDir(root));
|
|
2603
|
+
if (options.adapterRoot) {
|
|
2604
|
+
env.WORKBENCH_ADAPTER_ROOT = options.adapterRoot;
|
|
2605
|
+
env.WORKBENCH_WORKSPACE_ROOT = root;
|
|
2606
|
+
env.PATH = [
|
|
2607
|
+
`${options.adapterRoot}/node_modules/.bin`,
|
|
2608
|
+
env.PATH,
|
|
2609
|
+
].filter(Boolean).join(":");
|
|
2610
|
+
}
|
|
2611
|
+
Object.assign(env, adapterEnv);
|
|
2612
|
+
Object.assign(env, runtimeEnv);
|
|
2613
|
+
return env;
|
|
2614
|
+
}
|
|
2615
|
+
function createHostedWorkloadPrepareEnv(root) {
|
|
2616
|
+
const env = createHostedWorkloadBaseEnv();
|
|
2617
|
+
env.WORKBENCH_OUTPUT = outputDir(root);
|
|
2618
|
+
return env;
|
|
2619
|
+
}
|
|
2620
|
+
function createHostedWorkloadBaseEnv() {
|
|
1949
2621
|
const env = {};
|
|
1950
2622
|
for (const [key, value] of Object.entries(process.env)) {
|
|
1951
2623
|
if (typeof value === "string") {
|
|
@@ -1957,20 +2629,52 @@ function createHostedWorkloadPhaseEnv(root, adapterRequestPath, adapterEnv = {})
|
|
|
1957
2629
|
delete env[key];
|
|
1958
2630
|
}
|
|
1959
2631
|
}
|
|
1960
|
-
const runtimeBins = [
|
|
2632
|
+
const runtimeBins = uniquePathEntries([
|
|
2633
|
+
...nodeModuleBinDirsForAncestors(process.cwd()),
|
|
2634
|
+
...nodeModuleBinDirsForAncestors(path.dirname(fileURLToPath(import.meta.url))),
|
|
2635
|
+
"/app/node_modules/.bin",
|
|
1961
2636
|
"/workbench-runtime/node_modules/.bin",
|
|
1962
2637
|
"/workbench-runtime/products/workbench/node_modules/.bin",
|
|
1963
|
-
]
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
|
|
1967
|
-
|
|
1968
|
-
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
|
|
2638
|
+
]);
|
|
2639
|
+
env.PATH = uniquePathEntries([
|
|
2640
|
+
path.dirname(process.execPath),
|
|
2641
|
+
"/usr/local/sbin",
|
|
2642
|
+
"/usr/local/bin",
|
|
2643
|
+
"/usr/sbin",
|
|
2644
|
+
"/usr/bin",
|
|
2645
|
+
"/sbin",
|
|
2646
|
+
"/bin",
|
|
2647
|
+
...runtimeBins,
|
|
2648
|
+
...(process.env.PATH ? process.env.PATH.split(path.delimiter) : []),
|
|
2649
|
+
]).join(path.delimiter);
|
|
1972
2650
|
return env;
|
|
1973
2651
|
}
|
|
2652
|
+
function nodeModuleBinDirsForAncestors(start) {
|
|
2653
|
+
const dirs = [];
|
|
2654
|
+
let current = path.resolve(start);
|
|
2655
|
+
for (let depth = 0; depth < 12; depth += 1) {
|
|
2656
|
+
dirs.push(path.join(current, "node_modules", ".bin"));
|
|
2657
|
+
const parent = path.dirname(current);
|
|
2658
|
+
if (parent === current) {
|
|
2659
|
+
break;
|
|
2660
|
+
}
|
|
2661
|
+
current = parent;
|
|
2662
|
+
}
|
|
2663
|
+
return dirs;
|
|
2664
|
+
}
|
|
2665
|
+
function uniquePathEntries(entries) {
|
|
2666
|
+
const seen = new Set();
|
|
2667
|
+
const output = [];
|
|
2668
|
+
for (const entry of entries) {
|
|
2669
|
+
const trimmed = entry.trim();
|
|
2670
|
+
if (!trimmed || seen.has(trimmed)) {
|
|
2671
|
+
continue;
|
|
2672
|
+
}
|
|
2673
|
+
seen.add(trimmed);
|
|
2674
|
+
output.push(trimmed);
|
|
2675
|
+
}
|
|
2676
|
+
return output;
|
|
2677
|
+
}
|
|
1974
2678
|
function readWorkloadExecutionPurpose(workload) {
|
|
1975
2679
|
const purpose = workbenchExecutionPurpose(workload.job);
|
|
1976
2680
|
if (purpose === "improve" || purpose === "attempt") {
|
|
@@ -2005,35 +2709,6 @@ function runtimePrivateDir(root) {
|
|
|
2005
2709
|
function runtimeEnginePrivateDir(root) {
|
|
2006
2710
|
return `${runtimePrivateDir(root)}/engine`;
|
|
2007
2711
|
}
|
|
2008
|
-
function runtimeLogsDir(root) {
|
|
2009
|
-
return `${root}/logs`;
|
|
2010
|
-
}
|
|
2011
|
-
function runtimeLogsAgentDir(root) {
|
|
2012
|
-
return `${runtimeLogsDir(root)}/agent`;
|
|
2013
|
-
}
|
|
2014
|
-
function runtimeLogsVerifierDir(root) {
|
|
2015
|
-
return `${runtimeLogsDir(root)}/verifier`;
|
|
2016
|
-
}
|
|
2017
|
-
function assertMutableWorkspaceFiles(files, label) {
|
|
2018
|
-
const reserved = files
|
|
2019
|
-
.map((file) => normalizeRelativePath(file.path))
|
|
2020
|
-
.filter(isRuntimeReservedWorkspacePath);
|
|
2021
|
-
if (reserved.length > 0) {
|
|
2022
|
-
throw new Error(`${label} cannot target runtime-reserved workspace paths: ${reserved.join(", ")}.`);
|
|
2023
|
-
}
|
|
2024
|
-
}
|
|
2025
|
-
function isRuntimeReservedWorkspacePath(normalizedPath) {
|
|
2026
|
-
return normalizedPath === ".workbench" ||
|
|
2027
|
-
normalizedPath.startsWith(".workbench/") ||
|
|
2028
|
-
normalizedPath === "input" ||
|
|
2029
|
-
normalizedPath.startsWith("input/") ||
|
|
2030
|
-
normalizedPath === "output" ||
|
|
2031
|
-
normalizedPath.startsWith("output/") ||
|
|
2032
|
-
normalizedPath === "logs" ||
|
|
2033
|
-
normalizedPath.startsWith("logs/") ||
|
|
2034
|
-
normalizedPath === "private" ||
|
|
2035
|
-
normalizedPath.startsWith("private/");
|
|
2036
|
-
}
|
|
2037
2712
|
async function writeSurfaceFiles(root, files) {
|
|
2038
2713
|
const fs = await importNodeModule(nodeBuiltin("fs/promises"));
|
|
2039
2714
|
const path = await importNodeModule(nodeBuiltin("path"));
|
|
@@ -2097,7 +2772,7 @@ function encodeSurfaceSnapshotContent(body, utf8Decoder) {
|
|
|
2097
2772
|
};
|
|
2098
2773
|
}
|
|
2099
2774
|
}
|
|
2100
|
-
function
|
|
2775
|
+
function normalizeResultMetrics(value) {
|
|
2101
2776
|
if (!value || typeof value !== "object" || Array.isArray(value)) {
|
|
2102
2777
|
return undefined;
|
|
2103
2778
|
}
|
|
@@ -2109,7 +2784,7 @@ function normalizeRewardMetrics(value) {
|
|
|
2109
2784
|
}
|
|
2110
2785
|
return Object.keys(metrics).length > 0 ? metrics : undefined;
|
|
2111
2786
|
}
|
|
2112
|
-
function
|
|
2787
|
+
function normalizeResultCases(value) {
|
|
2113
2788
|
if (!Array.isArray(value)) {
|
|
2114
2789
|
return undefined;
|
|
2115
2790
|
}
|
|
@@ -2122,7 +2797,7 @@ function normalizeRewardCases(value) {
|
|
|
2122
2797
|
if (!id) {
|
|
2123
2798
|
return [];
|
|
2124
2799
|
}
|
|
2125
|
-
const metrics =
|
|
2800
|
+
const metrics = normalizeResultMetrics(record.metrics) ?? {};
|
|
2126
2801
|
const status = record.status === "completed" || record.status === "error"
|
|
2127
2802
|
? record.status
|
|
2128
2803
|
: undefined;
|
|
@@ -2146,9 +2821,7 @@ function normalizeRewardCases(value) {
|
|
|
2146
2821
|
: undefined;
|
|
2147
2822
|
const pass = typeof criterionRecord.pass === "boolean"
|
|
2148
2823
|
? criterionRecord.pass
|
|
2149
|
-
:
|
|
2150
|
-
? score >= 0.5
|
|
2151
|
-
: undefined;
|
|
2824
|
+
: undefined;
|
|
2152
2825
|
if (!criterionId || score === undefined || pass === undefined) {
|
|
2153
2826
|
return [];
|
|
2154
2827
|
}
|
|
@@ -2261,13 +2934,13 @@ function evaluateSample(args) {
|
|
|
2261
2934
|
if (typeof sampleScore !== "number" || !Number.isFinite(sampleScore)) {
|
|
2262
2935
|
throw new Error("Evaluation sample requires an engine result with a finite numeric score.");
|
|
2263
2936
|
}
|
|
2264
|
-
const cases = args.workload.cases?.length ? args.workload.cases : undefined;
|
|
2265
2937
|
const metrics = args.workload.metrics ?? {
|
|
2266
2938
|
score: sampleScore,
|
|
2267
2939
|
};
|
|
2268
2940
|
if (metrics.score === undefined) {
|
|
2269
2941
|
metrics.score = sampleScore;
|
|
2270
2942
|
}
|
|
2943
|
+
const cases = args.workload.cases?.length ? args.workload.cases : undefined;
|
|
2271
2944
|
const feedback = {
|
|
2272
2945
|
...(args.workload.summary !== undefined
|
|
2273
2946
|
? { summary: args.workload.summary }
|
|
@@ -2295,7 +2968,7 @@ function evaluateSample(args) {
|
|
|
2295
2968
|
feedback,
|
|
2296
2969
|
};
|
|
2297
2970
|
}
|
|
2298
|
-
function normalizeSampleJobOutput(value
|
|
2971
|
+
function normalizeSampleJobOutput(value) {
|
|
2299
2972
|
if (!value || typeof value !== "object" || Array.isArray(value)) {
|
|
2300
2973
|
return null;
|
|
2301
2974
|
}
|
|
@@ -2314,9 +2987,6 @@ function normalizeSampleJobOutput(value, fallbackFiles = []) {
|
|
|
2314
2987
|
!Number.isFinite(record.attemptIndex)) {
|
|
2315
2988
|
return null;
|
|
2316
2989
|
}
|
|
2317
|
-
const sampleFiles = files.length > 0
|
|
2318
|
-
? files
|
|
2319
|
-
: fallbackFiles.map((file) => ({ ...file }));
|
|
2320
2990
|
return {
|
|
2321
2991
|
subjectId: record.subjectId,
|
|
2322
2992
|
attemptIndex: record.attemptIndex,
|
|
@@ -2324,10 +2994,10 @@ function normalizeSampleJobOutput(value, fallbackFiles = []) {
|
|
|
2324
2994
|
fileChanges: Array.isArray(record.fileChanges)
|
|
2325
2995
|
? record.fileChanges.filter((entry) => typeof entry === "string")
|
|
2326
2996
|
: [],
|
|
2327
|
-
files
|
|
2997
|
+
files,
|
|
2328
2998
|
traces: Array.isArray(record.traces)
|
|
2329
2999
|
? record.traces.filter((entry) => typeof entry === "string")
|
|
2330
|
-
: traceFilePaths(
|
|
3000
|
+
: traceFilePaths(files),
|
|
2331
3001
|
};
|
|
2332
3002
|
}
|
|
2333
3003
|
function normalizeEvaluationSampleOutputs(args) {
|
|
@@ -2563,14 +3233,14 @@ function mergeEvaluationSampleRecords(samples) {
|
|
|
2563
3233
|
function mergeEvaluationSampleGroup(group) {
|
|
2564
3234
|
const first = group[0];
|
|
2565
3235
|
if (group.length === 1) {
|
|
2566
|
-
return
|
|
3236
|
+
return first;
|
|
2567
3237
|
}
|
|
2568
3238
|
const startedAt = minTimestamp(group.flatMap((sample) => (sample.startedAt ? [sample.startedAt] : [])));
|
|
2569
3239
|
const finishedAt = maxTimestamp(group.flatMap((sample) => (sample.finishedAt ? [sample.finishedAt] : [])));
|
|
2570
3240
|
const durationMs = startedAt && finishedAt
|
|
2571
3241
|
? Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt))
|
|
2572
3242
|
: undefined;
|
|
2573
|
-
const cases = group.flatMap((sample) =>
|
|
3243
|
+
const cases = group.flatMap((sample) => sample.cases ?? []);
|
|
2574
3244
|
const metrics = aggregateSampleGroupMetrics(group);
|
|
2575
3245
|
const usage = mergeUsageSummaries(group.map((sample) => sample.usage));
|
|
2576
3246
|
const errors = group.flatMap((sample) => sample.error ? [sample.error] : []);
|
|
@@ -2588,22 +3258,6 @@ function mergeEvaluationSampleGroup(group) {
|
|
|
2588
3258
|
...(cases.length > 0 ? { cases } : {}),
|
|
2589
3259
|
};
|
|
2590
3260
|
}
|
|
2591
|
-
function normalizeSingleCaseDurations(sample) {
|
|
2592
|
-
if (!sample.cases) {
|
|
2593
|
-
return sample;
|
|
2594
|
-
}
|
|
2595
|
-
const cases = normalizeCaseDurations(sample);
|
|
2596
|
-
return cases.length === sample.cases.length
|
|
2597
|
-
? { ...sample, cases }
|
|
2598
|
-
: sample;
|
|
2599
|
-
}
|
|
2600
|
-
function normalizeCaseDurations(sample) {
|
|
2601
|
-
return (sample.cases ?? []).map((caseResult) => (typeof caseResult.durationMs === "number" ||
|
|
2602
|
-
sample.cases?.length !== 1 ||
|
|
2603
|
-
typeof sample.durationMs !== "number"
|
|
2604
|
-
? caseResult
|
|
2605
|
-
: { ...caseResult, durationMs: sample.durationMs }));
|
|
2606
|
-
}
|
|
2607
3261
|
function aggregateSampleGroupMetrics(group) {
|
|
2608
3262
|
const metricNames = new Set(group.flatMap((sample) => Object.keys(sample.metrics ?? {})));
|
|
2609
3263
|
if (metricNames.size === 0) {
|