@workbench-ai/workbench-core 0.0.46 → 0.0.48
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/execution-events.d.ts +2 -2
- package/dist/execution-events.d.ts.map +1 -1
- package/dist/execution-events.js +3 -3
- package/dist/{execution-phases.d.ts → execution-evidence.d.ts} +8 -7
- package/dist/execution-evidence.d.ts.map +1 -0
- package/dist/{execution-phases.js → execution-evidence.js} +91 -51
- package/dist/execution-graph.js +1 -2
- package/dist/execution-jobs.js +1 -1
- package/dist/execution-outputs.d.ts.map +1 -1
- package/dist/execution-outputs.js +5 -10
- package/dist/execution-runtime-types.d.ts +7 -3
- package/dist/execution-runtime-types.d.ts.map +1 -1
- package/dist/execution-traces.d.ts +11 -1
- package/dist/execution-traces.d.ts.map +1 -1
- package/dist/execution-traces.js +305 -2
- package/dist/generic-spec.d.ts +8 -3
- package/dist/generic-spec.d.ts.map +1 -1
- package/dist/generic-spec.js +26 -37
- package/dist/index.d.ts +22 -11
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +888 -218
- package/dist/runtime-dockerfile.d.ts +14 -0
- package/dist/runtime-dockerfile.d.ts.map +1 -0
- package/dist/runtime-dockerfile.js +65 -0
- package/dist/sandbox-backends/docker.d.ts.map +1 -1
- package/dist/sandbox-backends/docker.js +9 -12
- package/dist/sandbox-backends/index.d.ts.map +1 -1
- package/dist/sandbox-backends/index.js +2 -1
- package/dist/sandbox-inputs.d.ts.map +1 -1
- package/dist/sandbox-inputs.js +1 -0
- package/dist/sandbox-plane.d.ts +1 -0
- package/dist/sandbox-plane.d.ts.map +1 -1
- package/dist/sandbox-plane.js +12 -22
- package/dist/trace-files.d.ts +2 -2
- package/dist/trace-files.d.ts.map +1 -1
- package/dist/trace-files.js +4 -4
- package/package.json +3 -3
- package/worker/sandbox-adapter-runner.cjs +22 -13
- package/dist/execution-phases.d.ts.map +0 -1
package/dist/index.js
CHANGED
|
@@ -1,28 +1,30 @@
|
|
|
1
|
-
import { createHash } from "node:crypto";
|
|
1
|
+
import { createHash, randomBytes, timingSafeEqual } from "node:crypto";
|
|
2
2
|
import os from "node:os";
|
|
3
3
|
import path from "node:path";
|
|
4
|
+
import { fileURLToPath } from "node:url";
|
|
4
5
|
import YAML from "yaml";
|
|
5
|
-
import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
|
|
6
|
-
import { BENCHMARK_SPEC_FILE,
|
|
6
|
+
import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, parseWorkbenchAdapterManifest, readWorkbenchAdapterOperationResult, WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV, WORKBENCH_RUNTIME_CONTROL_URL_ENV, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
|
|
7
|
+
import { BENCHMARK_SPEC_FILE, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchSubjectManifestPath, } from "./generic-spec.js";
|
|
7
8
|
import { attachSandboxMetadataToJob, createWorkbenchSandboxFileStore, isSurfaceSnapshotFile, readWorkbenchExecutionSpec, } from "./sandbox-inputs.js";
|
|
8
|
-
import { asRuntimeRecord, importNodeModule, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
|
|
9
|
-
import { executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
9
|
+
import { asRuntimeRecord, importNodeModule, isJsonPayload, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
|
|
10
|
+
import { createWorkbenchExecutionCapability, createWorkbenchSandboxAllocation, collectExecutionCapabilityScopeIssues, collectSandboxAllocationScopeIssues, collectSandboxHandleScopeIssues, assertSandboxBackendSupportsNetworkPolicy, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
10
11
|
import { createSandboxBackendPlaneForProvider, } from "./sandbox-backends/index.js";
|
|
11
12
|
import { applyWorkbenchSubjectPatch } from "./subject-patch.js";
|
|
12
13
|
import { assignUsageRole, completeUsageSummary, mergeUsageSummaries, normalizeUsageSummary, usageStats, } from "./execution-usage.js";
|
|
13
|
-
import { traceFilePaths,
|
|
14
|
+
import { traceFilePaths, workbenchTraceExecutionDirectory, } from "./trace-files.js";
|
|
14
15
|
import { engineCaseForCase, } from "./execution-jobs.js";
|
|
15
|
-
import { createWorkbenchExecutionEventPublisher,
|
|
16
|
-
import { readWorkbenchExecutionPurpose } from "./execution-
|
|
16
|
+
import { createWorkbenchExecutionEventPublisher, publishCommandStepEvent, } from "./execution-events.js";
|
|
17
|
+
import { readWorkbenchExecutionPurpose } from "./execution-evidence.js";
|
|
17
18
|
import { adapterAuthEnv, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
|
|
18
|
-
export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES,
|
|
19
|
-
export {
|
|
19
|
+
export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchSubjectManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, runtimeNetwork, runtimeResources, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
|
|
20
|
+
export { composeRuntimeDockerfileWithAdapterInstallers, } from "./runtime-dockerfile.js";
|
|
21
|
+
export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
|
|
20
22
|
export { adapterAuthEnv, createWorkbenchAdapterAuthBundle, defaultWorkbenchAdapterAuthStoreRoot, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, parseWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
|
|
21
23
|
export { asRuntimeRecord, importNodeModule, nodeBuiltin, normalizeWorkbenchWorkerId, normalizeRuntimeRegistry, quoteShellArg, resolveDockerRuntimeImageRef, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
|
|
22
24
|
export { assignUsageRole, extractExecutionUsageFromTrace, mergeUsageSummaries, } from "./execution-usage.js";
|
|
23
25
|
export { createWorkbenchProgressStdoutParser, publishWorkbenchProgressStdoutEnvelope, } from "./execution-events.js";
|
|
24
26
|
export { resolveSandboxTemplateImage, } from "./sandbox-backends/template-images.js";
|
|
25
|
-
export { readOutputTraceFiles,
|
|
27
|
+
export { readOutputTraceFiles, workbenchTraceExecutionDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
|
|
26
28
|
export { assertWorkbenchAdapterOperationSupport, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterOperationIssues, collectWorkbenchAdapterOperationRequirements, ensureWorkbenchAdapterOutputDir, WORKBENCH_ADAPTER_RESULT_FILE, normalizeWorkbenchAdapterOperationRequest, normalizeWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, writeWorkbenchAdapterOperationResult, } from "@workbench-ai/workbench-protocol";
|
|
27
29
|
export { applyWorkbenchSubjectPatch, } from "./subject-patch.js";
|
|
28
30
|
export { createWorkbenchSandboxFileStore, createSandboxAdapterRequest, executionResultFromCompletedSandboxJob, materializeWorkbenchSandboxInput, readWorkbenchExecutionSpec, sanitizeWorkbenchExecutionJobForSandbox, } from "./sandbox-inputs.js";
|
|
@@ -31,8 +33,8 @@ export { createBaselineSubjectExecution, createBaselineSubjectJob, createWorkben
|
|
|
31
33
|
export { addCapacity, capacityFits, runWorkbenchExecutionDag, subtractCapacity, workbenchJobDependencies, workbenchJobHostCost, workbenchJobResources, } from "./execution-scheduler.js";
|
|
32
34
|
export { assertWorkbenchExecutionIsolation, collectWorkbenchExecutionIsolationIssues, validateWorkbenchExecutionOutputPayloads, } from "./execution-outputs.js";
|
|
33
35
|
export { collectSandboxAllocationScopeIssues, collectExecutionCapabilityScopeIssues, collectSandboxHandleScopeIssues, createWorkbenchSandboxAllocation, createWorkbenchSandboxExecutionMetadata, createWorkbenchExecutionCapability, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
34
|
-
export {
|
|
35
|
-
export { finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, } from "./execution-traces.js";
|
|
36
|
+
export { buildSubjectCaseExecutionRefs, buildWorkbenchExecutionEvidence, isWorkbenchExecutionActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-evidence.js";
|
|
37
|
+
export { buildWorkbenchTraceSessionsFromFiles, combineWorkbenchTraceSessions, finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, readWorkbenchExecutionTraceFiles, traceSessionLabel, } from "./execution-traces.js";
|
|
36
38
|
export { DOCKER_SANDBOX_BACKEND, assertSandboxHostHealthForProvider, createDockerSandboxBackendDescriptor, createDockerSandboxPlane, resolveWorkbenchSandboxProviderName, sandboxProviderAdmissionForResources, sandboxProviderDefaultMaxConcurrentJobs, sandboxProviderLeaseScope, sandboxHostHealthExpectationForProvider, } from "./sandbox-backends/index.js";
|
|
37
39
|
export const DEFAULT_ENVIRONMENT_VERSIONS = [
|
|
38
40
|
{
|
|
@@ -142,7 +144,7 @@ export const DEFAULT_ENVIRONMENTS = [
|
|
|
142
144
|
{
|
|
143
145
|
id: "env_libreoffice_agent",
|
|
144
146
|
name: "LibreOffice + Agent",
|
|
145
|
-
description: "Agent runtime with soffice and Python libraries for spreadsheet-heavy
|
|
147
|
+
description: "Agent runtime with soffice and Python libraries for spreadsheet-heavy evaluations.",
|
|
146
148
|
currentVersionId: "envv_libreoffice_agent",
|
|
147
149
|
builtIn: true,
|
|
148
150
|
createdAt: "2026-04-29T00:00:00.000Z",
|
|
@@ -278,30 +280,36 @@ function adapterProtocolCommandSpec(adapter, operation, manifests = []) {
|
|
|
278
280
|
return {
|
|
279
281
|
use: "command",
|
|
280
282
|
command: manifest ? workbenchAdapterOperationCommand(manifest, operation) : adapterCommandName(adapter.use),
|
|
283
|
+
executor: manifest ? workbenchAdapterOperationExecutor(manifest, operation) : "sandbox",
|
|
281
284
|
};
|
|
282
285
|
}
|
|
283
|
-
function
|
|
284
|
-
|
|
285
|
-
|
|
286
|
+
function protocolStepForExecution(execution, manifests) {
|
|
287
|
+
if (execution.purpose !== "improve") {
|
|
288
|
+
throw new Error(`Protocol execution step only supports improve executions, not ${execution.purpose}.`);
|
|
289
|
+
}
|
|
290
|
+
const operation = "optimizer.improve";
|
|
286
291
|
const command = adapterProtocolCommandSpec(execution.adapter, operation, manifests);
|
|
287
292
|
return {
|
|
288
|
-
kind:
|
|
293
|
+
kind: "optimizer",
|
|
289
294
|
label: execution.purpose,
|
|
290
295
|
operation,
|
|
296
|
+
executor: command.executor,
|
|
291
297
|
adapter: execution.adapter,
|
|
292
298
|
command: command.command,
|
|
293
299
|
};
|
|
294
300
|
}
|
|
295
|
-
function
|
|
301
|
+
function attemptStepsForExecution(execution, spec, manifests) {
|
|
296
302
|
void spec;
|
|
297
|
-
const
|
|
303
|
+
const command = adapterProtocolCommandSpec(execution.adapter, "engine.run", manifests);
|
|
304
|
+
const engineStep = {
|
|
298
305
|
kind: "engine",
|
|
299
306
|
label: "engine",
|
|
300
307
|
operation: "engine.run",
|
|
308
|
+
executor: command.executor,
|
|
301
309
|
adapter: execution.adapter,
|
|
302
|
-
command:
|
|
310
|
+
command: command.command,
|
|
303
311
|
};
|
|
304
|
-
return [
|
|
312
|
+
return [engineStep];
|
|
305
313
|
}
|
|
306
314
|
function adapterConfigRecord(adapter, manifests = []) {
|
|
307
315
|
const config = cloneJsonRecord(jsonRecord(adapter.with));
|
|
@@ -411,7 +419,10 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
411
419
|
.sort((left, right) => compareSampleOutputs(left.output, right.output));
|
|
412
420
|
const outputJobIds = new Set(outputs.flatMap(({ jobs }) => jobs.map((job) => job.id)));
|
|
413
421
|
const completedSampleKeys = new Set(outputs
|
|
414
|
-
.
|
|
422
|
+
.flatMap(({ jobs, output }) => [
|
|
423
|
+
evaluationSampleGroupKeyFromOutput(output),
|
|
424
|
+
...jobs.map(evaluationSampleGroupKeyFromJob),
|
|
425
|
+
])
|
|
415
426
|
.filter((key) => key !== null));
|
|
416
427
|
const errorSampleJobs = [
|
|
417
428
|
...subjectJobs.filter((job) => job.status === "failed"),
|
|
@@ -422,12 +433,13 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
422
433
|
...outputs.map(({ jobs, output }) => withJobUsage(output.sample, completed, jobs[0])),
|
|
423
434
|
...errorSamples,
|
|
424
435
|
].sort((left, right) => left.index - right.index || left.id.localeCompare(right.id));
|
|
425
|
-
const
|
|
436
|
+
const subjectName = normalizedSubjectDisplayName(args.spec.subject.name);
|
|
437
|
+
const evalRecord = createEvaluationRecord(subjectId, subjectName, samples);
|
|
426
438
|
const usage = mergeUsageSummaries([
|
|
427
439
|
subjectRevision.usage,
|
|
428
440
|
...samples.map((sample) => sample.usage),
|
|
429
441
|
]);
|
|
430
|
-
const metrics = evaluationMeanMetrics(
|
|
442
|
+
const metrics = evaluationMeanMetrics(evalRecord);
|
|
431
443
|
const attemptIndex = subjectRevision.attemptIndex;
|
|
432
444
|
const evaluationTraces = [
|
|
433
445
|
...outputs.flatMap(({ output }) => output.traces),
|
|
@@ -457,6 +469,7 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
457
469
|
}
|
|
458
470
|
const record = {
|
|
459
471
|
id: subjectId,
|
|
472
|
+
...(subjectName ? { name: subjectName } : {}),
|
|
460
473
|
ordinal: args.existingSubjectCount + subjects.length,
|
|
461
474
|
benchmarkFingerprint: args.benchmarkFingerprint,
|
|
462
475
|
subjectFingerprint: args.subjectFingerprint ?? materializedSubjectFingerprint(args.spec, subjectRevision.files),
|
|
@@ -472,7 +485,7 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
472
485
|
meta,
|
|
473
486
|
};
|
|
474
487
|
subjects.push(record);
|
|
475
|
-
evaluations.push(
|
|
488
|
+
evaluations.push(createEvaluationScorecard({
|
|
476
489
|
runId: args.runId,
|
|
477
490
|
benchmarkFingerprint: args.benchmarkFingerprint,
|
|
478
491
|
createdAt: args.startedAt,
|
|
@@ -528,6 +541,8 @@ function materializedSubjectFingerprint(spec, files) {
|
|
|
528
541
|
hash.update("workbench-subject-v1\0");
|
|
529
542
|
hash.update("materialized\0runner\0");
|
|
530
543
|
hash.update(JSON.stringify(spec.run));
|
|
544
|
+
hash.update("prepare");
|
|
545
|
+
hash.update(JSON.stringify(spec.subject.prepare ?? null));
|
|
531
546
|
for (const file of filterSubjectSourceFiles(files).slice().sort((left, right) => left.path.localeCompare(right.path))) {
|
|
532
547
|
hash.update("\0file\0");
|
|
533
548
|
hash.update(file.path);
|
|
@@ -547,14 +562,15 @@ function materializedSubjectFiles(args) {
|
|
|
547
562
|
}
|
|
548
563
|
return [...byPath.values()].sort((left, right) => left.path.localeCompare(right.path));
|
|
549
564
|
}
|
|
550
|
-
function
|
|
565
|
+
function createEvaluationScorecard(args) {
|
|
551
566
|
const evaluation = args.evaluation;
|
|
552
567
|
return {
|
|
553
|
-
id:
|
|
568
|
+
id: evaluationScorecardId(args.runId, args.subject.id),
|
|
554
569
|
runId: args.runId,
|
|
555
570
|
benchmarkFingerprint: args.benchmarkFingerprint,
|
|
556
571
|
subjectFingerprint: args.subject.subjectFingerprint,
|
|
557
572
|
subjectId: args.subject.id,
|
|
573
|
+
...(args.subject.name ? { subjectName: args.subject.name } : {}),
|
|
558
574
|
createdAt: args.createdAt,
|
|
559
575
|
updatedAt: evaluation.finishedAt ?? args.createdAt,
|
|
560
576
|
status: evaluation.status,
|
|
@@ -568,7 +584,7 @@ function createEvaluationResultRecord(args) {
|
|
|
568
584
|
evaluation,
|
|
569
585
|
};
|
|
570
586
|
}
|
|
571
|
-
function
|
|
587
|
+
export function evaluationScorecardId(runId, subjectId) {
|
|
572
588
|
const runPart = runId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
|
|
573
589
|
const subjectPart = subjectId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
|
|
574
590
|
return `eval_${runPart}_${subjectPart}`;
|
|
@@ -584,7 +600,7 @@ export function isWorkbenchInternalOutputPath(filePath) {
|
|
|
584
600
|
normalized === "sandbox-environment.json" ||
|
|
585
601
|
normalized === "sandbox_error.log" ||
|
|
586
602
|
normalized === "exit_code" ||
|
|
587
|
-
/^[a-
|
|
603
|
+
/^[a-z_-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
|
|
588
604
|
}
|
|
589
605
|
export function createSubjectRevisionTraceInputFiles(args) {
|
|
590
606
|
const files = [];
|
|
@@ -620,6 +636,23 @@ export function createSubjectRevisionTraceInputFiles(args) {
|
|
|
620
636
|
}, null, 2)}\n`));
|
|
621
637
|
return dedupeSurfaceFiles(files);
|
|
622
638
|
}
|
|
639
|
+
/**
 * Builds the trace input file that carries a subject's evaluation snapshot.
 *
 * @param {object} args
 * @param {object} [args.subject] Subject record; read for `id`, `status`,
 *   `metrics`, `fileChanges`, `eval` and `prompt`.
 * @param {string} [args.path] Optional target path; defaults to
 *   `base-subject/<subject.id>/evaluation.json`.
 * @returns {Array} An empty array when the subject has neither `eval` nor
 *   `metrics`; otherwise a one-element array holding the JSON surface file.
 */
export function createSubjectEvaluationTraceInputFiles(args) {
    const { subject } = args;
    const hasEvaluationData = Boolean(subject?.eval) || Boolean(subject?.metrics);
    if (!hasEvaluationData) {
        return [];
    }
    const { id, status, metrics, fileChanges, prompt } = subject;
    const targetPath = normalizeRelativePath(args.path ?? `base-subject/${id}/evaluation.json`);
    // Key order is kept stable because it determines the serialized file content.
    const serialized = JSON.stringify({
        kind: "subject_evaluation",
        subjectId: id,
        status,
        metrics: metrics ?? null,
        fileChanges,
        eval: subject.eval ?? null,
        prompt: prompt ?? null,
    }, null, 2);
    return [textSurfaceFile(targetPath, `${serialized}\n`)];
}
|
|
623
656
|
function isTerminalExecutionJob(job) {
|
|
624
657
|
return job.kind === "execute" && (job.status === "succeeded" ||
|
|
625
658
|
job.status === "failed" ||
|
|
@@ -866,16 +899,14 @@ export function createSubjectFilePreview(args) {
|
|
|
866
899
|
};
|
|
867
900
|
}
|
|
868
901
|
export function createCaseReview(args) {
|
|
869
|
-
const preferredSampleIndex =
|
|
870
|
-
const sampleMatchesCase = (sample) => sample.id === args.caseId
|
|
871
|
-
sample.id.startsWith(`${args.caseId}__`) ||
|
|
872
|
-
(sample.cases ?? []).some((entry) => entry.id === args.caseId || entry.id.startsWith(`${args.caseId}__`));
|
|
902
|
+
const preferredSampleIndex = uniqueExecutionSampleIndex(args.executions ?? []);
|
|
903
|
+
const sampleMatchesCase = (sample) => (sample.cases ?? []).some((entry) => entry.id === args.caseId);
|
|
873
904
|
const samples = args.subject.eval?.samples ?? [];
|
|
874
905
|
const sampleResult = samples.find((sample) => typeof preferredSampleIndex === "number" &&
|
|
875
906
|
sample.index === preferredSampleIndex &&
|
|
876
907
|
sampleMatchesCase(sample)) ?? samples.find(sampleMatchesCase);
|
|
877
|
-
const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId
|
|
878
|
-
if (!sampleResult && (args.
|
|
908
|
+
const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId);
|
|
909
|
+
if (!sampleResult && (args.executions?.length ?? 0) > 0) {
|
|
879
910
|
return {
|
|
880
911
|
subjectId: args.subject.id,
|
|
881
912
|
caseId: args.caseId,
|
|
@@ -884,7 +915,7 @@ export function createCaseReview(args) {
|
|
|
884
915
|
? { sampleIndex: preferredSampleIndex }
|
|
885
916
|
: {}),
|
|
886
917
|
metrics: {},
|
|
887
|
-
|
|
918
|
+
executions: args.executions ?? [],
|
|
888
919
|
criteria_results: [],
|
|
889
920
|
};
|
|
890
921
|
}
|
|
@@ -893,28 +924,21 @@ export function createCaseReview(args) {
|
|
|
893
924
|
}
|
|
894
925
|
const durationMs = typeof caseResult?.durationMs === "number"
|
|
895
926
|
? caseResult.durationMs
|
|
896
|
-
:
|
|
897
|
-
typeof sampleResult.durationMs === "number"
|
|
898
|
-
? sampleResult.durationMs
|
|
899
|
-
: !caseResult && typeof sampleResult.durationMs === "number"
|
|
900
|
-
? sampleResult.durationMs
|
|
901
|
-
: undefined;
|
|
902
|
-
const sampleStatus = sampleResult.status === "planned" ? undefined : sampleResult.status;
|
|
903
|
-
const status = caseResult?.status ?? sampleStatus;
|
|
927
|
+
: undefined;
|
|
904
928
|
return {
|
|
905
929
|
subjectId: args.subject.id,
|
|
906
|
-
caseId: caseResult?.id ??
|
|
930
|
+
caseId: caseResult?.id ?? args.caseId,
|
|
907
931
|
caseLabel: caseResult?.label ?? args.caseId,
|
|
908
932
|
sampleId: sampleResult.id,
|
|
909
933
|
sampleIndex: sampleResult.index,
|
|
910
|
-
...(status ? { status } : {}),
|
|
911
|
-
metrics: caseResult?.metrics ??
|
|
934
|
+
...(caseResult?.status ? { status: caseResult.status } : {}),
|
|
935
|
+
metrics: caseResult?.metrics ?? {},
|
|
912
936
|
...(typeof durationMs === "number" ? { durationMs } : {}),
|
|
913
937
|
...(caseResult?.source ? { source: caseResult.source } : {}),
|
|
914
|
-
...(
|
|
915
|
-
? { feedback: caseResult
|
|
938
|
+
...(caseResult?.feedback !== undefined
|
|
939
|
+
? { feedback: caseResult.feedback }
|
|
916
940
|
: {}),
|
|
917
|
-
|
|
941
|
+
executions: args.executions ?? [],
|
|
918
942
|
criteria_results: (caseResult?.criteria ?? []).map((criterion) => ({
|
|
919
943
|
criterion_id: criterion.criterion_id,
|
|
920
944
|
pass: criterion.pass,
|
|
@@ -924,9 +948,9 @@ export function createCaseReview(args) {
|
|
|
924
948
|
})),
|
|
925
949
|
};
|
|
926
950
|
}
|
|
927
|
-
function
|
|
928
|
-
const sampleIndices = new Set(
|
|
929
|
-
.map((
|
|
951
|
+
function uniqueExecutionSampleIndex(executions) {
|
|
952
|
+
const sampleIndices = new Set(executions
|
|
953
|
+
.map((execution) => execution.sampleIndex)
|
|
930
954
|
.filter((index) => typeof index === "number"));
|
|
931
955
|
if (sampleIndices.size !== 1) {
|
|
932
956
|
return null;
|
|
@@ -951,6 +975,7 @@ function parseAuthoredWorkbenchSourceSpec(source) {
|
|
|
951
975
|
name: resolved.subject.name,
|
|
952
976
|
description: resolved.subject.description,
|
|
953
977
|
files: { path: resolved.subject.files.path },
|
|
978
|
+
...(resolved.subject.prepare ? { prepare: { ...resolved.subject.prepare } } : {}),
|
|
954
979
|
run: runSpecFromInvocation(resolved.run),
|
|
955
980
|
},
|
|
956
981
|
...(resolved.optimizer
|
|
@@ -1101,11 +1126,18 @@ export async function executeWorkbenchExecutionJob(args, options) {
|
|
|
1101
1126
|
const runtimeArgs = adapterAuthProfiles.length > 0
|
|
1102
1127
|
? { ...args, adapterAuthProfiles }
|
|
1103
1128
|
: args;
|
|
1104
|
-
const
|
|
1129
|
+
const executionForRuntime = readWorkbenchExecutionSpec(runtimeArgs.job);
|
|
1130
|
+
const executor = workbenchExecutionExecutorForRuntimeInput(runtimeArgs);
|
|
1131
|
+
if (executor === "host") {
|
|
1132
|
+
return await withWorkbenchRuntimeControlServer(runtimeArgs, options, startedAt, async (adapterRuntimeEnv) => executeAdapterInCurrentRuntime({
|
|
1133
|
+
...runtimeArgs,
|
|
1134
|
+
adapterRuntimeEnv,
|
|
1135
|
+
}, executionForRuntime, startedAt, createWorkbenchExecutionCapability(executionForRuntime, { now: startedAt })));
|
|
1136
|
+
}
|
|
1105
1137
|
const fileStore = createWorkbenchSandboxFileStore(runtimeArgs);
|
|
1106
1138
|
const planeFactory = options.createSandboxPlaneForProvider ?? createSandboxBackendPlaneForProvider;
|
|
1107
1139
|
const plane = planeFactory(options.sandboxProvider, runtimeArgs, startedAt, fileStore);
|
|
1108
|
-
const validated = await executeValidatedSandboxExecution(plane,
|
|
1140
|
+
const validated = await executeValidatedSandboxExecution(plane, executionForRuntime, {
|
|
1109
1141
|
now: startedAt,
|
|
1110
1142
|
runnerId: resolveWorkbenchWorkerId([
|
|
1111
1143
|
process.env.WORKBENCH_WORKER_ID,
|
|
@@ -1121,6 +1153,215 @@ export async function executeWorkbenchExecutionJob(args, options) {
|
|
|
1121
1153
|
return failWorkbenchRunJob(args.job, startedAt, error);
|
|
1122
1154
|
}
|
|
1123
1155
|
}
|
|
1156
|
+
/**
 * Decides which executor should run the execution described by a runtime input.
 *
 * Falls back to "sandbox" when the input is already a runtime-control
 * operation, when the execution purpose maps to no adapter operation, or when
 * no adapter manifest matches the execution's adapter. Otherwise the executor
 * is whatever the matching manifest declares for the operation.
 *
 * @param {object} args Runtime input; reads `runtimeControlOperation`, `job`
 *   and the optional `adapterManifests` list.
 * @returns {string} Executor name.
 */
export function workbenchExecutionExecutorForRuntimeInput(args) {
    const SANDBOX_EXECUTOR = "sandbox";
    if (args.runtimeControlOperation) {
        return SANDBOX_EXECUTOR;
    }
    const { purpose, adapter } = readWorkbenchExecutionSpec(args.job);
    const operation = adapterOperationForExecutionPurpose(purpose);
    if (!operation) {
        return SANDBOX_EXECUTOR;
    }
    const matchingManifest = args.adapterManifests?.find((candidate) => candidate.id === adapter.use);
    if (!matchingManifest) {
        return SANDBOX_EXECUTOR;
    }
    return workbenchAdapterOperationExecutor(matchingManifest, operation);
}
|
|
1168
|
+
/**
 * Maps an execution purpose to the adapter operation that carries it out.
 *
 * @param {string} purpose Execution purpose.
 * @returns {"optimizer.improve" | "engine.run" | null} The operation name, or
 *   `null` for any purpose other than "improve" or "attempt".
 */
function adapterOperationForExecutionPurpose(purpose) {
    switch (purpose) {
        case "improve":
            return "optimizer.improve";
        case "attempt":
            return "engine.run";
        default:
            return null;
    }
}
|
|
1177
|
+
// Upper bound (512 MiB) on the number of request-body bytes the runtime-control
// server will buffer; readRuntimeControlBody rejects once this is exceeded.
const RUNTIME_CONTROL_MAX_BODY_BYTES = 512 * 1024 * 1024;
|
|
1178
|
+
/**
 * Runs `run` while a loopback-only runtime-control HTTP server is available.
 *
 * The server listens on 127.0.0.1 with an OS-assigned port and is protected by
 * a freshly generated random bearer token. The URL and token are handed to
 * `run` as an environment-variable record. The server is always shut down
 * afterwards, including when listening fails or `run` throws.
 *
 * @param {object} args Runtime input forwarded to the request handler.
 * @param {object} options Execution options forwarded to the request handler.
 * @param {*} startedAt Start timestamp forwarded to the request handler.
 * @param {(env: Record<string, string>) => Promise<*>} run Callback given the
 *   runtime-control URL and token environment entries.
 * @returns {Promise<*>} Whatever `run` resolves to.
 * @throws {Error} When the server does not expose a local TCP address, when
 *   listening fails, or when `run` rejects.
 */
async function withWorkbenchRuntimeControlServer(args, options, startedAt, run) {
    const { createServer } = await importNodeModule(nodeBuiltin("http"));
    const token = randomBytes(24).toString("base64url");
    const server = createServer((request, response) => {
        // The handler reports its own failures over HTTP, so the promise is
        // intentionally not awaited here.
        void handleWorkbenchRuntimeControlHttpRequest({
            request,
            response,
            token,
            args,
            options,
            startedAt,
        });
    });
    const closeServer = () => new Promise((resolve) => {
        // The callback also fires (with an error) when the server never
        // started listening, so this always settles.
        server.close(() => resolve());
        // Older Node versions keep close() pending while idle keep-alive
        // sockets stay open; drop them so shutdown cannot hang.
        server.closeIdleConnections?.();
    });
    try {
        // Listening happens inside the try block so that a failure to obtain
        // the bound address still releases the server.
        const url = await new Promise((resolve, reject) => {
            server.once("error", reject);
            server.listen(0, "127.0.0.1", () => {
                server.off("error", reject);
                const address = server.address();
                if (!address || typeof address === "string") {
                    reject(new Error("Workbench runtime-control server did not expose a local TCP address."));
                    return;
                }
                resolve(`http://127.0.0.1:${address.port}`);
            });
        });
        return await run({
            [WORKBENCH_RUNTIME_CONTROL_URL_ENV]: url,
            [WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV]: token,
        });
    }
    finally {
        await closeServer();
    }
}
|
|
1215
|
+
/**
 * Checks an Authorization header against the expected bearer token without
 * leaking, through timing, how many leading characters matched.
 *
 * @param {unknown} authorization Raw `authorization` header value.
 * @param {string} token Expected token (without the "Bearer " prefix).
 * @returns {boolean} True only for an exact `Bearer <token>` match.
 */
function runtimeControlAuthorizationMatches(authorization, token) {
    if (typeof authorization !== "string") {
        return false;
    }
    const presented = Buffer.from(authorization, "utf8");
    const expected = Buffer.from(`Bearer ${token}`, "utf8");
    // timingSafeEqual throws on length mismatch, so gate on length first.
    return presented.length === expected.length && timingSafeEqual(presented, expected);
}
/**
 * Serves one runtime-control HTTP request.
 *
 * Only `POST /v1/operation-sequence` with a matching bearer token is accepted.
 * The JSON body is normalized into an operation-sequence request, executed in
 * the sandbox, and the result written back as JSON.
 *
 * Responses: 404 unknown endpoint, 401 bad token, 400 malformed JSON or
 * invalid request shape, 500 any other failure, 200 success.
 *
 * @param {object} args
 * @param {object} args.request Incoming HTTP request.
 * @param {object} args.response HTTP response to write to.
 * @param {string} args.token Expected bearer token.
 * @param {object} args.args Runtime input forwarded to the sandbox execution.
 * @param {object} args.options Execution options forwarded likewise.
 * @param {*} args.startedAt Start timestamp forwarded likewise.
 * @returns {Promise<void>} Never rejects; failures are reported on the response.
 */
async function handleWorkbenchRuntimeControlHttpRequest(args) {
    const { request, response } = args;
    try {
        if (request.method !== "POST" || request.url !== "/v1/operation-sequence") {
            writeRuntimeControlJson(response, 404, { error: "Unknown Workbench runtime-control endpoint." });
            return;
        }
        if (!runtimeControlAuthorizationMatches(request.headers.authorization, args.token)) {
            writeRuntimeControlJson(response, 401, { error: "Workbench runtime-control token is invalid." });
            return;
        }
        const body = await readRuntimeControlBody(request);
        let controlRequest;
        try {
            controlRequest = normalizeRuntimeControlOperationSequenceRequest(JSON.parse(body));
        }
        catch (error) {
            // Unparseable or ill-shaped payloads are the caller's fault, not a
            // server failure.
            writeRuntimeControlJson(response, 400, {
                error: error instanceof Error ? error.message : String(error),
            });
            return;
        }
        const result = await executeRuntimeControlOperationSequenceInSandbox(args.args, args.options, args.startedAt, controlRequest);
        writeRuntimeControlJson(response, 200, result);
    }
    catch (error) {
        try {
            writeRuntimeControlJson(response, 500, {
                error: error instanceof Error ? error.stack ?? error.message : String(error),
            });
        }
        catch {
            // The caller does not await this promise, so a throw here would be
            // an unhandled rejection; tear down the connection instead.
            response.destroy?.();
        }
    }
}
|
|
1237
|
+
/**
 * Sends a pretty-printed JSON payload as the complete HTTP response.
 *
 * @param {object} response HTTP response to finish.
 * @param {number} statusCode HTTP status code to set.
 * @param {*} payload Value serialized with two-space indentation and a
 *   trailing newline.
 */
function writeRuntimeControlJson(response, statusCode, payload) {
    response.statusCode = statusCode;
    response.setHeader("content-type", "application/json");
    const serialized = JSON.stringify(payload, null, 2);
    response.end(`${serialized}\n`);
}
|
|
1242
|
+
/**
 * Buffers an HTTP request body and decodes it as UTF-8 text.
 *
 * @param {object} request Readable request stream emitting "data", "error"
 *   and "end" events.
 * @returns {Promise<string>} The full body. Rejects on a stream error, or when
 *   the body exceeds RUNTIME_CONTROL_MAX_BODY_BYTES (the request is then
 *   destroyed).
 */
function readRuntimeControlBody(request) {
    return new Promise((resolve, reject) => {
        const received = [];
        let totalBytes = 0;
        const onData = (chunk) => {
            totalBytes += chunk.length;
            if (totalBytes <= RUNTIME_CONTROL_MAX_BODY_BYTES) {
                received.push(chunk);
                return;
            }
            // Over the limit: fail the read first, then stop the stream.
            reject(new Error("Workbench runtime-control request body is too large."));
            request.destroy();
        };
        const onEnd = () => {
            resolve(Buffer.concat(received).toString("utf8"));
        };
        request.on("data", onData);
        request.on("error", reject);
        request.on("end", onEnd);
    });
}
|
|
1259
|
+
/**
 * Validates and normalizes a runtime-control operation-sequence request.
 *
 * @param {*} value Parsed request payload.
 * @returns {object} Object with normalized `operations`, plus `inputs` when
 *   normalization yields a value, and `prepare` / `collectWorkspace` only when
 *   they were supplied as booleans.
 * @throws {Error} When the payload is not a plain object or has no operations.
 */
function normalizeRuntimeControlOperationSequenceRequest(value) {
    const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isRecord) {
        throw new Error("Workbench runtime-control operation sequence request must be an object.");
    }
    const { operations, prepare, collectWorkspace } = value;
    if (!Array.isArray(operations) || operations.length === 0) {
        throw new Error("Workbench runtime-control operation sequence requires at least one operation.");
    }
    // Properties are assigned in the same order the response shape lists them.
    const normalized = {};
    const inputs = normalizeRuntimeControlInputs(value.inputs);
    if (inputs) {
        normalized.inputs = inputs;
    }
    normalized.operations = operations.map((entry, index) => normalizeRuntimeControlOperation(entry, `operations[${index}]`));
    if (typeof prepare === "boolean") {
        normalized.prepare = prepare;
    }
    if (typeof collectWorkspace === "boolean") {
        normalized.collectWorkspace = collectWorkspace;
    }
    return normalized;
}
|
|
1275
|
+
/**
 * Normalizes the optional `inputs` section of a runtime-control request.
 *
 * @param {*} value Raw `inputs` value.
 * @returns {object | undefined} `undefined` when no inputs were supplied;
 *   otherwise an object with a normalized file list for each recognized
 *   surface key that is present as an own property.
 * @throws {Error} When `value` is defined but is not a plain object.
 */
function normalizeRuntimeControlInputs(value) {
    if (value === undefined) {
        return undefined;
    }
    if (!value || typeof value !== "object" || Array.isArray(value)) {
        throw new Error("Workbench runtime-control inputs must be an object.");
    }
    // Listed in the order the keys appear on the normalized result.
    const surfaceKeys = ["subject", "case", "enginePrivate", "traces", "workspace", "output"];
    const normalized = {};
    for (const surface of surfaceKeys) {
        if (hasOwn(value, surface)) {
            normalized[surface] = normalizeRuntimeControlFiles(value[surface], `inputs.${surface}`);
        }
    }
    return normalized;
}
|
|
1304
|
+
/**
 * Normalizes one runtime-control file list.
 *
 * @param {unknown} value - Candidate file list; `undefined` yields an empty list.
 * @param {string} label - Dotted location used in error messages (e.g. `inputs.subject`).
 * @returns {object[]} Shallow copies of the entries with `path` passed through
 *   `normalizeRelativePath`.
 * @throws {Error} When `value` is not an array or an entry fails `isSurfaceSnapshotFile`.
 */
function normalizeRuntimeControlFiles(value, label) {
    if (value === undefined) {
        return [];
    }
    if (!Array.isArray(value)) {
        throw new Error(`Workbench runtime-control ${label} must be an array.`);
    }
    const toNormalizedFile = (entry, index) => {
        if (isSurfaceSnapshotFile(entry)) {
            return { ...entry, path: normalizeRelativePath(entry.path) };
        }
        throw new Error(`Workbench runtime-control ${label}[${index}] must be a surface snapshot file.`);
    };
    return value.map(toNormalizedFile);
}
|
|
1318
|
+
/**
 * Reports whether `key` is an own (non-inherited) property of `value`.
 *
 * Uses the prototype's `hasOwnProperty` so objects that shadow or lack the
 * method are still handled.
 *
 * @param {object} value - Object to inspect.
 * @param {PropertyKey} key - Property name to look for.
 * @returns {boolean} `true` only when the property is defined on `value` itself.
 */
function hasOwn(value, key) {
    const ownsKey = Reflect.apply(Object.prototype.hasOwnProperty, value, [key]);
    return ownsKey;
}
|
|
1321
|
+
/**
 * Validates a single runtime-control operation entry and returns a trimmed-down copy.
 *
 * @param {unknown} value - Raw operation entry.
 * @param {string} label - Location of the entry (e.g. `operations[0]`), used in errors.
 * @returns {object} `{ operation, invocation: { use, with, auth?, command? }, label? }`.
 *   `with` defaults to `{}`; `command` and `label` are kept (untrimmed) only when they
 *   are strings containing non-whitespace characters.
 * @throws {Error} When the entry, its operation name, its invocation, `use`, `with`
 *   or `auth` fail validation.
 */
function normalizeRuntimeControlOperation(value, label) {
    const isRecord = (candidate) => Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
    const hasText = (candidate) => typeof candidate === "string" && Boolean(candidate.trim());
    if (!isRecord(value)) {
        throw new Error(`Workbench runtime-control ${label} must be an object.`);
    }
    const operation = value.operation;
    const knownOperations = ["engine.resolve", "engine.run", "subject.run", "optimizer.improve"];
    if (!knownOperations.includes(operation)) {
        throw new Error(`Workbench runtime-control ${label}.operation is invalid.`);
    }
    const invocation = value.invocation;
    if (!isRecord(invocation)) {
        throw new Error(`Workbench runtime-control ${label}.invocation must be an object.`);
    }
    if (typeof invocation.use !== "string" || invocation.use.length === 0) {
        throw new Error(`Workbench runtime-control ${label}.invocation.use is required.`);
    }
    let withConfig;
    if (invocation.with === undefined) {
        withConfig = {};
    }
    else if (isJsonPayload(invocation.with)) {
        withConfig = invocation.with;
    }
    else {
        throw new Error(`Workbench runtime-control ${label}.invocation.with must be JSON.`);
    }
    if (invocation.auth !== undefined && !isJsonPayload(invocation.auth)) {
        throw new Error(`Workbench runtime-control ${label}.invocation.auth must be JSON.`);
    }
    const normalizedInvocation = { use: invocation.use, with: withConfig };
    if (invocation.auth !== undefined) {
        normalizedInvocation.auth = invocation.auth;
    }
    if (hasText(invocation.command)) {
        normalizedInvocation.command = invocation.command;
    }
    const normalized = { operation, invocation: normalizedInvocation };
    if (hasText(value.label)) {
        normalized.label = value.label;
    }
    return normalized;
}
|
|
1124
1365
|
async function explicitAdapterAuthProfilesForExecution(execution, args, loadLocalAdapterProfiles) {
|
|
1125
1366
|
const required = requiredAdapterAuthTargetsForExecution(execution, args);
|
|
1126
1367
|
if (required.length === 0) {
|
|
@@ -1155,7 +1396,7 @@ function adapterAuthTargetKey(target) {
|
|
|
1155
1396
|
/**
 * Public accessor for a job's execution purpose.
 *
 * @param {object} job - Job record handed to `readWorkbenchExecutionPurpose`.
 * @returns {*} Whatever `readWorkbenchExecutionPurpose` returns for the job.
 */
export function workbenchExecutionPurpose(job) {
    const purpose = readWorkbenchExecutionPurpose(job);
    return purpose;
}
|
|
1158
|
-
export async function
|
|
1399
|
+
export async function executeAdapterInCurrentRuntime(args, execution, startedAt, capability) {
|
|
1159
1400
|
const eventPublisher = createWorkbenchExecutionEventPublisher({
|
|
1160
1401
|
projectId: args.job.projectId,
|
|
1161
1402
|
runId: args.job.runId,
|
|
@@ -1174,10 +1415,10 @@ export async function executeAdapterInCurrentSandboxRuntime(args, execution, sta
|
|
|
1174
1415
|
};
|
|
1175
1416
|
try {
|
|
1176
1417
|
if (execution.purpose === "improve") {
|
|
1177
|
-
return await
|
|
1418
|
+
return await executeSubjectRevisionExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
|
|
1178
1419
|
}
|
|
1179
1420
|
if (execution.purpose === "attempt") {
|
|
1180
|
-
return await
|
|
1421
|
+
return await executeAttemptExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
|
|
1181
1422
|
}
|
|
1182
1423
|
throw new Error(`Unsupported execution purpose ${execution.purpose}.`);
|
|
1183
1424
|
}
|
|
@@ -1274,7 +1515,7 @@ function adapterAuthRequest(bundles, root, currentAdapterId) {
|
|
|
1274
1515
|
}
|
|
1275
1516
|
return entries;
|
|
1276
1517
|
}
|
|
1277
|
-
function
|
|
1518
|
+
function adapterAuthRequestForStep(args, adapterId) {
|
|
1278
1519
|
const profiles = (args.adapterAuthProfiles ?? [])
|
|
1279
1520
|
.map((bundle) => sanitizeWorkbenchAdapterAuthBundle(bundle));
|
|
1280
1521
|
if (profiles.length === 0) {
|
|
@@ -1295,12 +1536,19 @@ function adapterAuthProfilesForExecution(execution, args) {
|
|
|
1295
1536
|
}
|
|
1296
1537
|
/**
 * Lists the normalized adapter-auth targets an execution needs.
 *
 * @param {object} execution - Execution spec whose adapter invocations are inspected.
 * @param {object} args - Runtime arguments; `adapterManifests` defaults to an empty list.
 * @returns {object[]} Requirements collected for the execution's adapter invocations,
 *   each passed through `normalizeWorkbenchAdapterAuthTarget`.
 */
function requiredAdapterAuthTargetsForExecution(execution, args) {
    const manifests = args.adapterManifests ?? [];
    const invocations = adapterInvocationsForExecution(execution, args);
    const requirements = collectWorkbenchAdapterAuthRequirements(invocations, manifests);
    return requirements.map((target) => normalizeWorkbenchAdapterAuthTarget(target));
}
|
|
1301
|
-
function adapterInvocationsForExecution(execution,
|
|
1542
|
+
/**
 * Determines which adapter invocations take part in an execution.
 *
 * @param {object} execution - Execution spec (`purpose`, `adapter`).
 * @param {object} args - Runtime arguments; `runtimeControlOperation` takes precedence
 *   when present, otherwise `spec.run` is consulted for attempt executions.
 * @returns {object[]} De-duplicated invocations for runtime-control sequences and
 *   attempts; for any other purpose, a single-element list with `execution.adapter`.
 */
function adapterInvocationsForExecution(execution, args) {
    const sequence = args.runtimeControlOperation;
    if (sequence) {
        const invocations = sequence.operations.map(({ invocation }) => {
            const entry = { use: invocation.use, with: invocation.with ?? {} };
            if (invocation.auth !== undefined) {
                entry.auth = invocation.auth;
            }
            return entry;
        });
        return uniqueAdapterInvocations(invocations);
    }
    const isAttempt = execution.purpose === "attempt";
    return isAttempt
        ? uniqueAdapterInvocations([execution.adapter, args.spec.run])
        : [execution.adapter];
}
|
|
@@ -1341,7 +1589,7 @@ function completedJobFromSandboxResult(fallbackJob, startedAt, result) {
|
|
|
1341
1589
|
}
|
|
1342
1590
|
return attachSandboxMetadataToJob(failWorkbenchRunJob(fallbackJob, result.startedAt || startedAt, result.error ?? `Sandbox execution ${result.status}.`, result.finishedAt), asRuntimeRecord(result.metadata).sandbox);
|
|
1343
1591
|
}
|
|
1344
|
-
async function
|
|
1592
|
+
async function executeSubjectRevisionExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
|
|
1345
1593
|
const { workload, result } = await runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher);
|
|
1346
1594
|
if (result.error || (result.exitCode ?? 0) !== 0) {
|
|
1347
1595
|
return failWorkbenchRunJob(args.job, startedAt, result.error ?? `Adapter ${execution.adapter.use} exited with status ${result.exitCode}.`, result.finishedAt, result);
|
|
@@ -1382,7 +1630,7 @@ async function executeSubjectRevisionExecutionInSandbox(args, execution, started
|
|
|
1382
1630
|
},
|
|
1383
1631
|
};
|
|
1384
1632
|
}
|
|
1385
|
-
async function
|
|
1633
|
+
async function executeAttemptExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
|
|
1386
1634
|
const workload = createWorkbenchRunWorkload({
|
|
1387
1635
|
job: args.job,
|
|
1388
1636
|
spec: args.spec,
|
|
@@ -1391,7 +1639,7 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
|
|
|
1391
1639
|
engineCases: args.engineCases,
|
|
1392
1640
|
traceFiles: args.traceFiles,
|
|
1393
1641
|
});
|
|
1394
|
-
const workloadResult = await
|
|
1642
|
+
const workloadResult = await runHostedCommandExecutionSteps(args, workload, attemptStepsForExecution(execution, args.spec, args.adapterManifests), startedAt, {
|
|
1395
1643
|
capability,
|
|
1396
1644
|
eventPublisher,
|
|
1397
1645
|
});
|
|
@@ -1405,10 +1653,7 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
|
|
|
1405
1653
|
return failWorkbenchRunJob(args.job, startedAt, "Attempt engine must return a workbench-result result with a finite numeric score.", workloadResult.finishedAt, workloadResult);
|
|
1406
1654
|
}
|
|
1407
1655
|
const finishedAt = workloadResult.finishedAt ?? new Date().toISOString();
|
|
1408
|
-
const usage =
|
|
1409
|
-
workloadResult.usage,
|
|
1410
|
-
engineResult.usage,
|
|
1411
|
-
]);
|
|
1656
|
+
const usage = attemptUsageSummary(workloadResult.usage, engineResult.usage);
|
|
1412
1657
|
const sample = evaluateSample({
|
|
1413
1658
|
subjectId: workload.subjectId,
|
|
1414
1659
|
files: workloadResult.files,
|
|
@@ -1453,6 +1698,282 @@ async function executeAttemptExecutionInSandbox(args, execution, startedAt, capa
|
|
|
1453
1698
|
},
|
|
1454
1699
|
};
|
|
1455
1700
|
}
|
|
1701
|
+
/**
 * Runs a runtime-control operation sequence as hosted command steps in the
 * current runtime and returns the updated job record.
 *
 * @param {object} args - Sandbox request arguments; must carry `runtimeControlOperation`.
 * @param {object} execution - Unused; the execution spec is re-read from `args.job`.
 * @param {string} startedAt - Start timestamp copied onto the returned job.
 * @param {object} capability - Unused.
 * @returns {Promise<object>} Copy of `args.job` with `status`, `attempt`, timestamps,
 *   `output`, and an `error` when the sequence failed.
 * @throws {Error} When `args.runtimeControlOperation` is missing.
 */
export async function executeRuntimeControlOperationSequenceInCurrentRuntime(args, execution, startedAt, capability) {
    // Both parameters are part of the signature but deliberately not consumed here.
    void execution;
    void capability;
    const sequence = args.runtimeControlOperation;
    if (!sequence) {
        throw new Error("Runtime-control operation sequence is missing from the sandbox request.");
    }
    const childExecution = readWorkbenchExecutionSpec(args.job);
    const { job, spec, baseFiles, engineResolveFiles, engineCases, traceFiles } = args;
    const workload = createWorkbenchRunWorkload({ job, spec, baseFiles, engineResolveFiles, engineCases, traceFiles });
    // The adapter runtime env is removed from the arguments used for auth and steps.
    const runtimeArgs = { ...args };
    delete runtimeArgs.adapterRuntimeEnv;
    const adapterAuth = await materializeSandboxAdapterAuth(runtimeArgs, childExecution);
    let result;
    try {
        const stepArgs = { ...runtimeArgs };
        if (adapterAuth.root) {
            stepArgs.adapterAuthRoot = adapterAuth.root;
        }
        if (Object.keys(adapterAuth.env).length > 0) {
            stepArgs.adapterAuthEnv = adapterAuth.env;
        }
        const steps = sequence.operations.map((operation, index) => runtimeControlStepForOperation(operation, index, args.adapterManifests));
        result = await runHostedCommandExecutionSteps(stepArgs, workload, steps, startedAt, {
            runSubjectPrepare: sequence.prepare ?? false,
            workspaceFiles: sequence.inputs?.workspace ?? [],
            outputFiles: sequence.inputs?.output ?? [],
            collectWorkspace: sequence.collectWorkspace ?? false,
        });
    }
    finally {
        // Cleanup is best-effort: a failure here must not replace the step outcome.
        if (adapterAuth.cleanup) {
            await adapterAuth.cleanup().catch(() => undefined);
        }
    }
    const finishedAt = result.finishedAt ?? new Date().toISOString();
    const exitedCleanly = (result.exitCode ?? 0) === 0;
    const failed = Boolean(result.error) || !exitedCleanly;
    const failure = failed
        ? { error: result.error ?? `Runtime-control operation sequence exited with status ${result.exitCode}.` }
        : {};
    return {
        ...args.job,
        status: failed ? "failed" : "succeeded",
        attempt: Math.max(1, args.job.attempt),
        startedAt,
        finishedAt,
        updatedAt: finishedAt,
        ...failure,
        output: runtimeControlJobOutput(result, !failed),
    };
}
|
|
1752
|
+
/**
 * Runs a runtime-control operation sequence inside a freshly created sandbox.
 *
 * Builds the child request, prepares the sandbox plane, validates the allocation,
 * capability and sandbox handle, executes the request, and always destroys the
 * sandbox once it has been created.
 *
 * @param {object} args - Parent sandbox request arguments.
 * @param {object} options - `sandboxProvider` plus an optional
 *   `createSandboxPlaneForProvider` factory overriding the default backend plane.
 * @param {string} startedAt - Timestamp used as `now` for allocation/capability and results.
 * @param {object} request - Normalized runtime-control operation sequence.
 * @returns {Promise<object>} Normalized runtime-control result derived from the completed job.
 * @throws {Error} When scope validation fails or the plane rejects.
 */
async function executeRuntimeControlOperationSequenceInSandbox(args, options, startedAt, request) {
    const childArgs = createRuntimeControlSandboxInput(args, request);
    const execution = readWorkbenchExecutionSpec(childArgs.job);
    const fileStore = createWorkbenchSandboxFileStore(childArgs);
    const planeFactory = options.createSandboxPlaneForProvider ?? createSandboxBackendPlaneForProvider;
    const plane = planeFactory(options.sandboxProvider, childArgs, startedAt, fileStore);
    assertSandboxBackendSupportsNetworkPolicy(plane.backend, execution);
    const sandboxOptions = {
        now: startedAt,
        runnerId: resolveWorkbenchWorkerId([
            process.env.WORKBENCH_WORKER_ID,
            process.env.EC2_INSTANCE_ID,
            os.hostname(),
            process.env.HOSTNAME,
        ], "local-runner"),
        fileStore,
    };
    const inputs = await fileStore.materializeInputs(execution);
    // Planes without a prepare hook get a descriptor built from the execution spec.
    const environment = plane.prepareEnvironment
        ? await plane.prepareEnvironment(execution, sandboxOptions)
        : {
            backend: plane.backend.name,
            kind: execution.sandbox.kind,
            ref: execution.sandbox.ref,
        };
    const allocation = createWorkbenchSandboxAllocation(execution, {
        backend: plane.backend.name,
        runnerId: sandboxOptions.runnerId,
        now: startedAt,
    });
    const capability = createWorkbenchExecutionCapability(execution, { now: startedAt });
    assertRuntimeControlScope("Runtime-control sandbox allocation", collectSandboxAllocationScopeIssues(allocation, execution, { now: startedAt }));
    assertRuntimeControlScope("Runtime-control execution capability", collectExecutionCapabilityScopeIssues(capability, execution, { now: startedAt }));
    const sandbox = await plane.createSandbox({
        execution,
        environment,
        allocation,
        capability,
        inputs,
    }, sandboxOptions);
    let result;
    try {
        // Validate the handle inside the try block: the sandbox already exists at this
        // point, so a rejected handle must still be torn down instead of leaking.
        assertRuntimeControlScope("Runtime-control sandbox handle", collectSandboxHandleScopeIssues(sandbox, allocation, execution));
        result = await plane.exec({
            execution,
            environment,
            sandbox,
            allocation,
            capability,
            inputs,
        }, sandboxOptions);
    }
    finally {
        await plane.destroySandbox(sandbox, sandboxOptions);
    }
    const completedJob = completedJobFromSandboxResult(childArgs.job, startedAt, result);
    return runtimeControlResultFromCompletedJob(completedJob);
}
|
|
1810
|
+
/**
 * Derives the child sandbox request used to run a runtime-control sequence.
 *
 * The child job and execution get ids suffixed with `:runtime:<nonce>`, file inputs
 * come from the request when provided and from the parent workload otherwise, and
 * the adapter is taken from the last operation (falling back to the parent's adapter).
 *
 * @param {object} args - Parent sandbox request arguments.
 * @param {object} request - Normalized runtime-control operation sequence.
 * @returns {object} Copy of `args` with the child job, file lists, a single engine
 *   case and `runtimeControlOperation` set, and without `adapterRuntimeEnv` and
 *   `workspaceRoot`.
 */
function createRuntimeControlSandboxInput(args, request) {
    const parentExecution = readWorkbenchExecutionSpec(args.job);
    const { job, spec, baseFiles, engineResolveFiles, engineCases, traceFiles: parentTraceFiles } = args;
    const parentWorkload = createWorkbenchRunWorkload({
        job,
        spec,
        baseFiles,
        engineResolveFiles,
        engineCases,
        traceFiles: parentTraceFiles,
    });
    const idSuffix = `:runtime:${runtimeControlNonce()}`;
    const parentInput = asRuntimeRecord(args.job.input);
    const parentCase = parentWorkload.engineCase;
    const publicFiles = runtimeControlInputFiles(request.inputs, "case", parentCase ? engineCasePublicFiles(parentCase) : []);
    const privateFiles = runtimeControlInputFiles(request.inputs, "enginePrivate", parentCase ? engineCasePrivateFiles(parentCase) : []);
    const subjectFiles = runtimeControlInputFiles(request.inputs, "subject", parentWorkload.subjectFiles);
    const traceFiles = runtimeControlInputFiles(request.inputs, "traces", parentWorkload.traceFiles);
    // The last operation's invocation becomes the child execution's adapter.
    const lastInvocation = request.operations[request.operations.length - 1]?.invocation;
    let childAdapter = parentExecution.adapter;
    if (lastInvocation) {
        childAdapter = { use: lastInvocation.use, with: lastInvocation.with ?? {} };
        if (lastInvocation.auth !== undefined) {
            childAdapter.auth = lastInvocation.auth;
        }
    }
    const caseId = parentWorkload.caseId;
    const childExecution = {
        ...parentExecution,
        id: `${parentExecution.id}${idSuffix}`,
        outputs: [],
        adapter: childAdapter,
        metadata: {
            ...asRuntimeRecord(parentExecution.metadata),
            runtimeControl: true,
            caseId,
        },
    };
    const engineCase = {
        id: caseId,
        case: parentWorkload.engineCaseSpec ?? { version: 3, prompt: parentWorkload.prompt },
        files: { public: publicFiles, private: privateFiles },
    };
    const childJob = {
        ...args.job,
        id: `${args.job.id}${idSuffix}`,
        input: { ...parentInput, execution: childExecution, caseId },
    };
    const childArgs = {
        ...args,
        job: childJob,
        baseFiles: subjectFiles,
        engineResolveFiles: [...publicFiles, ...privateFiles],
        engineCases: [engineCase],
        traceFiles,
        runtimeControlOperation: request,
    };
    // These parent-only settings are not forwarded to the child request.
    delete childArgs.adapterRuntimeEnv;
    delete childArgs.workspaceRoot;
    return childArgs;
}
|
|
1879
|
+
/**
 * Picks the file list for one input slot, preferring the request over the fallback.
 *
 * @param {object | undefined} inputs - Request inputs, if any.
 * @param {string} key - Slot name to look up as an own property of `inputs`.
 * @param {object[]} fallback - Files used when the request does not define the slot.
 * @returns {object[]} Cloned file list; a slot that is present but nullish yields `[]`.
 */
function runtimeControlInputFiles(inputs, key, fallback) {
    const overridden = Boolean(inputs) && Object.prototype.hasOwnProperty.call(inputs, key);
    const source = overridden ? (inputs[key] ?? []) : fallback;
    return cloneSurfaceFiles(source);
}
|
|
1885
|
+
/**
 * Converts a normalized runtime-control operation into a sandbox command step.
 *
 * @param {object} operation - Normalized operation (`operation`, `invocation`, optional `label`).
 * @param {number} index - Zero-based position, used to build the default label.
 * @param {object[]} [manifests=[]] - Adapter manifests forwarded to `adapterProtocolCommandSpec`.
 * @returns {object} Step with `kind`, `label`, `operation`, `executor: "sandbox"`,
 *   `adapter` and `command`. The command is the trimmed explicit command when one is
 *   given, otherwise the one from `adapterProtocolCommandSpec`.
 */
function runtimeControlStepForOperation(operation, index, manifests = []) {
    const { invocation, operation: operationName } = operation;
    // A fresh adapter object is built for every consumer, so none of them share a reference.
    const buildAdapter = () => ({
        use: invocation.use,
        with: invocation.with ?? {},
        ...(invocation.auth !== undefined ? { auth: invocation.auth } : {}),
    });
    const explicitCommand = invocation.command?.trim();
    const command = explicitCommand
        || adapterProtocolCommandSpec(buildAdapter(), operationName, manifests).command;
    let kind = "engine";
    if (operationName === "subject.run") {
        kind = "subject";
    }
    else if (operationName === "optimizer.improve") {
        kind = "optimizer";
    }
    const defaultLabel = () => `${operationName.replace(".", "_")}_${index + 1}`;
    return {
        kind,
        label: operation.label ?? defaultLabel(),
        operation: operationName,
        executor: "sandbox",
        adapter: buildAdapter(),
        command,
    };
}
|
|
1909
|
+
/**
 * Converts a completed job record into a normalized runtime-control result.
 *
 * @param {object} job - Completed job (`output`, `status`, `error`).
 * @returns {object} Normalized result; it counts as ok only when the job status is
 *   `"succeeded"`, and `job.error` serves as the fallback error.
 */
function runtimeControlResultFromCompletedJob(job) {
    const output = asRuntimeRecord(job.output);
    const succeeded = job.status === "succeeded";
    return normalizeRuntimeControlResultOutput(output, succeeded, job.error);
}
|
|
1912
|
+
/**
 * Builds the job `output` payload for a runtime-control step result.
 *
 * @param {object} result - Result returned by the hosted command steps.
 * @param {boolean} ok - Whether the sequence is considered successful.
 * @returns {object} Normalized runtime-control result output.
 */
function runtimeControlJobOutput(result, ok) {
    const raw = { ok, files: result.files, fileChanges: result.fileChanges };
    // These fields are copied only when truthy.
    for (const key of ["operationResults", "workspaceFiles", "result", "usage"]) {
        if (result[key]) {
            raw[key] = result[key];
        }
    }
    // These fields are copied whenever they are defined, even if falsy.
    for (const key of ["summary", "feedback"]) {
        if (result[key] !== undefined) {
            raw[key] = result[key];
        }
    }
    if (result.error) {
        raw.error = result.error;
    }
    return normalizeRuntimeControlResultOutput(raw, ok, result.error);
}
|
|
1926
|
+
/**
 * Reduces an arbitrary output record to the runtime-control result shape.
 *
 * @param {object} output - Raw output record; unknown or mistyped fields are dropped.
 * @param {boolean} ok - Success flag from the caller; combined with `output.ok !== false`.
 * @param {string} [fallbackError] - Error used when `output.error` is not a string.
 * @returns {object} `{ ok, files, fileChanges, operationResults }` plus the optional
 *   `workspaceFiles`, `result`, `usage`, `summary`, `feedback` and `error` fields.
 *   `fileChanges` defaults to the paths of the accepted files.
 */
function normalizeRuntimeControlResultOutput(output, ok, fallbackError) {
    const isRecord = (candidate) => Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
    let files = [];
    if (Array.isArray(output.files)) {
        files = output.files.filter(isSurfaceSnapshotFile);
    }
    let workspaceFiles;
    if (Array.isArray(output.workspaceFiles)) {
        workspaceFiles = output.workspaceFiles.filter(isSurfaceSnapshotFile);
    }
    let operationResults = [];
    if (Array.isArray(output.operationResults)) {
        operationResults = output.operationResults.filter(isWorkbenchAdapterOperationResult);
    }
    const fileChanges = Array.isArray(output.fileChanges)
        ? output.fileChanges.filter((entry) => typeof entry === "string")
        : files.map((file) => file.path);
    const normalized = {
        ok: ok && output.ok !== false,
        files,
        fileChanges,
        operationResults,
    };
    if (workspaceFiles) {
        normalized.workspaceFiles = workspaceFiles;
    }
    if (isRecord(output.result)) {
        normalized.result = output.result;
    }
    if (isRecord(output.usage)) {
        normalized.usage = output.usage;
    }
    if (typeof output.summary === "string") {
        normalized.summary = output.summary;
    }
    if (output.feedback !== undefined && isJsonPayload(output.feedback)) {
        normalized.feedback = output.feedback;
    }
    // An error string on the output wins over the caller-supplied fallback.
    if (typeof output.error === "string") {
        normalized.error = output.error;
    }
    else if (fallbackError) {
        normalized.error = fallbackError;
    }
    return normalized;
}
|
|
1955
|
+
/**
 * Type guard for adapter operation results.
 *
 * @param {unknown} value - Candidate value.
 * @returns {boolean} `true` when `value` is a non-array object whose `protocol` is
 *   `workbench.adapter-result.v1` and whose `operation` is one of the four known
 *   operation names.
 */
function isWorkbenchAdapterOperationResult(value) {
    if (value === null || typeof value !== "object" || Array.isArray(value)) {
        return false;
    }
    if (value.protocol !== "workbench.adapter-result.v1") {
        return false;
    }
    const knownOperations = ["engine.resolve", "engine.run", "subject.run", "optimizer.improve"];
    return knownOperations.includes(value.operation);
}
|
|
1966
|
+
/**
 * Shallow-copies a list of surface files, normalizing each `path`.
 *
 * @param {object[]} files - Files to copy.
 * @returns {object[]} New array of new objects; `path` is passed through
 *   `normalizeRelativePath`, all other properties are copied as-is.
 */
function cloneSurfaceFiles(files) {
    const withNormalizedPath = (file) => {
        const copy = { ...file };
        copy.path = normalizeRelativePath(file.path);
        return copy;
    };
    return files.map((file) => withNormalizedPath(file));
}
|
|
1969
|
+
/**
 * Generates the random suffix used for runtime-control child ids.
 *
 * @returns {string} Six random bytes encoded as 12 lowercase hex characters.
 */
function runtimeControlNonce() {
    const entropy = randomBytes(6);
    return entropy.toString("hex");
}
|
|
1972
|
+
/**
 * Throws when a scope check reported any issues.
 *
 * @param {string} label - Name of the checked object, used as the message prefix.
 * @param {string[]} issues - Issues found by the scope check.
 * @throws {Error} When `issues` is non-empty; the message lists one issue per line.
 */
function assertRuntimeControlScope(label, issues) {
    const hasIssues = issues.length > 0;
    if (!hasIssues) {
        return;
    }
    throw new Error(`${label} failed validation:\n${issues.join("\n")}`);
}
|
|
1456
1977
|
async function runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher) {
|
|
1457
1978
|
const workload = createWorkbenchRunWorkload({
|
|
1458
1979
|
job: args.job,
|
|
@@ -1462,13 +1983,13 @@ async function runHostedProtocolExecutionResult(args, execution, startedAt, capa
|
|
|
1462
1983
|
engineCases: args.engineCases,
|
|
1463
1984
|
traceFiles: args.traceFiles,
|
|
1464
1985
|
});
|
|
1465
|
-
const result = await
|
|
1986
|
+
const result = await runHostedCommandExecutionSteps(args, workload, [protocolStepForExecution(execution, args.adapterManifests)], startedAt, {
|
|
1466
1987
|
capability,
|
|
1467
1988
|
eventPublisher,
|
|
1468
1989
|
});
|
|
1469
1990
|
return { workload, result };
|
|
1470
1991
|
}
|
|
1471
|
-
async function
|
|
1992
|
+
async function runHostedCommandExecutionSteps(args, workload, steps, startedAt, options = {}) {
|
|
1472
1993
|
const [{ execFile }, fs, os, path, { promisify }] = await Promise.all([
|
|
1473
1994
|
importNodeModule(nodeBuiltin("child_process")),
|
|
1474
1995
|
importNodeModule(nodeBuiltin("fs/promises")),
|
|
@@ -1489,9 +2010,22 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
|
|
|
1489
2010
|
const workspace = await createRuntimeWorkspaceRoot(args, fs, os, path, "workbench-execution-sandbox-");
|
|
1490
2011
|
try {
|
|
1491
2012
|
await stageWorkbenchRunWorkload(workspace.root, workload);
|
|
2013
|
+
if (options.workspaceFiles && options.workspaceFiles.length > 0) {
|
|
2014
|
+
await stageInitialWorkspaceFiles(workspace.root, options.workspaceFiles);
|
|
2015
|
+
}
|
|
2016
|
+
if (options.outputFiles && options.outputFiles.length > 0) {
|
|
2017
|
+
await writeSurfaceFiles(outputDir(workspace.root), options.outputFiles);
|
|
2018
|
+
}
|
|
2019
|
+
const execution = readWorkbenchExecutionSpec(workload.job);
|
|
2020
|
+
const hostAdapterIds = new Set(steps.flatMap((step) => step.executor === "host"
|
|
2021
|
+
? [step.adapter?.use ?? execution.adapter.use]
|
|
2022
|
+
: []));
|
|
2023
|
+
const hostAdapterRoots = hostAdapterIds.size > 0
|
|
2024
|
+
? await materializeHostAdapterRoots(workspace.root, args.adapterFiles ?? [], hostAdapterIds)
|
|
2025
|
+
: new Map();
|
|
1492
2026
|
let exitCode = 0;
|
|
1493
2027
|
let runtimeError;
|
|
1494
|
-
const
|
|
2028
|
+
const operationResults = [];
|
|
1495
2029
|
try {
|
|
1496
2030
|
if (!environmentVersion) {
|
|
1497
2031
|
throw new Error("environment is required for adapter command executions.");
|
|
@@ -1503,49 +2037,64 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
|
|
|
1503
2037
|
network: environmentVersion.spec.network,
|
|
1504
2038
|
}, null, 2)}\n`);
|
|
1505
2039
|
}
|
|
1506
|
-
const
|
|
2040
|
+
const stepTimeoutMs = environmentVersion
|
|
1507
2041
|
? environmentVersionTimeoutMs(environmentVersion)
|
|
1508
2042
|
: 5 * 60 * 1000;
|
|
1509
|
-
const
|
|
1510
|
-
|
|
1511
|
-
await
|
|
1512
|
-
|
|
1513
|
-
|
|
2043
|
+
const shouldRunSubjectPrepare = options.runSubjectPrepare ?? steps.some((step) => step.executor === "sandbox");
|
|
2044
|
+
if (shouldRunSubjectPrepare) {
|
|
2045
|
+
await runSubjectPrepareCommand({
|
|
2046
|
+
root: workspace.root,
|
|
2047
|
+
workload,
|
|
2048
|
+
execution,
|
|
2049
|
+
execFileAsync,
|
|
2050
|
+
timeoutMs: stepTimeoutMs,
|
|
2051
|
+
eventPublisher: options.eventPublisher,
|
|
2052
|
+
});
|
|
2053
|
+
}
|
|
2054
|
+
let enginePrivateStaged = false;
|
|
2055
|
+
for (const step of steps) {
|
|
2056
|
+
if (step.kind === "engine" && !enginePrivateStaged) {
|
|
2057
|
+
await stageWorkbenchEnginePrivateFiles(workspace.root, workload);
|
|
2058
|
+
enginePrivateStaged = true;
|
|
1514
2059
|
}
|
|
1515
|
-
|
|
1516
|
-
const
|
|
1517
|
-
|
|
1518
|
-
|
|
2060
|
+
await resetHostedWorkloadStepOutput(workspace.root);
|
|
2061
|
+
const adapterRequestPath = await writeWorkbenchAdapterRequest(workspace.root, workload, execution, step, adapterAuthRequestForStep(args, step.adapter?.use ?? execution.adapter.use), args.adapterManifests);
|
|
2062
|
+
const stepRole = stepEventRole(step);
|
|
2063
|
+
await publishCommandStepEvent(options.eventPublisher, {
|
|
2064
|
+
step: step.label,
|
|
1519
2065
|
status: "started",
|
|
1520
|
-
...(
|
|
2066
|
+
...(stepRole ? { role: stepRole } : {}),
|
|
1521
2067
|
});
|
|
1522
2068
|
try {
|
|
1523
|
-
if (!
|
|
1524
|
-
throw new Error(`Adapter
|
|
2069
|
+
if (!step.command) {
|
|
2070
|
+
throw new Error(`Adapter step ${step.label} is missing a command.`);
|
|
1525
2071
|
}
|
|
1526
|
-
const
|
|
2072
|
+
const adapterRoot = step.executor === "host"
|
|
2073
|
+
? hostAdapterRoots.get(step.adapter?.use ?? execution.adapter.use)
|
|
2074
|
+
: undefined;
|
|
2075
|
+
const command = createHostedWorkloadShellCommand(workspace.root, step.command, step.label, step.okExitCodes);
|
|
1527
2076
|
await execFileAsync("sh", ["-c", command], {
|
|
1528
|
-
cwd: workspace.root,
|
|
1529
|
-
env:
|
|
2077
|
+
cwd: adapterRoot ?? workspace.root,
|
|
2078
|
+
env: createHostedWorkloadAdapterEnv(workspace.root, adapterRequestPath, args.adapterAuthEnv, adapterRoot ? { adapterRoot } : undefined, args.adapterRuntimeEnv),
|
|
1530
2079
|
maxBuffer: 10 * 1024 * 1024,
|
|
1531
|
-
timeout:
|
|
2080
|
+
timeout: stepTimeoutMs,
|
|
1532
2081
|
});
|
|
1533
|
-
const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root),
|
|
1534
|
-
assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${
|
|
1535
|
-
|
|
1536
|
-
await
|
|
1537
|
-
|
|
2082
|
+
const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), step.operation);
|
|
2083
|
+
assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${step.adapter?.use ?? execution.adapter.use} ${step.operation}`);
|
|
2084
|
+
operationResults.push(operationResult);
|
|
2085
|
+
await publishCommandStepEvent(options.eventPublisher, {
|
|
2086
|
+
step: step.label,
|
|
1538
2087
|
status: "succeeded",
|
|
1539
|
-
...(
|
|
2088
|
+
...(stepRole ? { role: stepRole } : {}),
|
|
1540
2089
|
});
|
|
1541
2090
|
}
|
|
1542
2091
|
catch (error) {
|
|
1543
|
-
await
|
|
1544
|
-
|
|
2092
|
+
await publishCommandStepEvent(options.eventPublisher, {
|
|
2093
|
+
step: step.label,
|
|
1545
2094
|
status: "failed",
|
|
1546
2095
|
exitCode: readExitCode(error),
|
|
1547
2096
|
error: error instanceof Error ? error.message : String(error),
|
|
1548
|
-
...(
|
|
2097
|
+
...(stepRole ? { role: stepRole } : {}),
|
|
1549
2098
|
});
|
|
1550
2099
|
throw error;
|
|
1551
2100
|
}
|
|
@@ -1569,16 +2118,56 @@ async function runHostedCommandExecutionPhases(args, workload, phases, startedAt
|
|
|
1569
2118
|
startedAt,
|
|
1570
2119
|
});
|
|
1571
2120
|
}
|
|
1572
|
-
|
|
2121
|
+
const result = await readWorkbenchRunWorkloadResult(workspace.root, workload, {
|
|
1573
2122
|
exitCode,
|
|
1574
2123
|
startedAt,
|
|
1575
|
-
|
|
2124
|
+
operationResults,
|
|
1576
2125
|
});
|
|
2126
|
+
if (options.collectWorkspace) {
|
|
2127
|
+
result.workspaceFiles = await readMutableWorkspaceSnapshotFiles(workspace.root);
|
|
2128
|
+
}
|
|
2129
|
+
return result;
|
|
1577
2130
|
}
|
|
1578
2131
|
finally {
|
|
1579
2132
|
await workspace.cleanup();
|
|
1580
2133
|
}
|
|
1581
2134
|
}
|
|
2135
|
+
/**
 * Runs the subject's optional prepare command and publishes step events for it.
 *
 * @param {object} args
 * @param {string} args.root - Workspace root; used as `cwd` and to build the command/env.
 * @param {object} args.workload - Workload whose `spec.subject.prepare.command` is run.
 * @param {object} args.execution - Execution spec; `purpose` selects the event role.
 * @param {Function} args.execFileAsync - Promise-returning `execFile`.
 * @param {number} args.timeoutMs - Timeout passed to the child process.
 * @param {object} [args.eventPublisher] - Publisher passed to `publishCommandStepEvent`.
 * @returns {Promise<void>} Resolves immediately when no prepare command is configured.
 * @throws {Error} `Subject prepare command failed: ...`, with the underlying failure
 *   attached as `cause`.
 */
async function runSubjectPrepareCommand(args) {
    const command = args.workload.spec.subject.prepare?.command;
    if (!command) {
        return;
    }
    const role = args.execution.purpose === "improve" ? "optimizer" : "runner";
    await publishCommandStepEvent(args.eventPublisher, {
        step: "subject_prepare",
        status: "started",
        role,
    });
    try {
        const shellCommand = createHostedWorkloadShellCommand(args.root, command, "subject_prepare");
        await args.execFileAsync("sh", ["-c", shellCommand], {
            cwd: args.root,
            env: createHostedWorkloadPrepareEnv(args.root),
            maxBuffer: 10 * 1024 * 1024,
            timeout: args.timeoutMs,
        });
        await publishCommandStepEvent(args.eventPublisher, {
            step: "subject_prepare",
            status: "succeeded",
            role,
        });
    }
    catch (error) {
        const reason = error instanceof Error ? error.message : String(error);
        await publishCommandStepEvent(args.eventPublisher, {
            step: "subject_prepare",
            status: "failed",
            exitCode: readExitCode(error),
            error: reason,
            role,
        });
        // Keep the original failure reachable (stack, exit code, stderr) via `cause`.
        throw new Error(`Subject prepare command failed: ${reason}`, { cause: error });
    }
}
|
|
1582
2171
|
async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
|
|
1583
2172
|
if (args.workspaceRoot) {
|
|
1584
2173
|
await fs.mkdir(args.workspaceRoot, { recursive: true });
|
|
@@ -1614,19 +2203,22 @@ async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
|
|
|
1614
2203
|
},
|
|
1615
2204
|
};
|
|
1616
2205
|
}
|
|
1617
|
-
function
|
|
1618
|
-
if (
|
|
2206
|
+
/**
 * Maps an execution step's `kind` to the role name reported on step events.
 *
 * @param {{ kind: string }} step - Step whose `kind` is inspected.
 * @returns {"optimizer" | "runner" | "engine" | undefined} The role, or
 *   `undefined` for any other kind.
 */
function stepEventRole(step) {
    switch (step.kind) {
        case "optimizer":
            return "optimizer";
        case "subject":
            // Subject steps are reported under the "runner" role.
            return "runner";
        case "engine":
            return "engine";
        default:
            return undefined;
    }
}
|
|
1629
2218
|
function adapterOperationUsageSummary(result) {
|
|
2219
|
+
if (hasExplicitUsageRole(result.usage)) {
|
|
2220
|
+
return completeUsageSummary(result.usage);
|
|
2221
|
+
}
|
|
1630
2222
|
if (result.operation === "optimizer.improve") {
|
|
1631
2223
|
return assignUsageRole("optimizer", result.usage);
|
|
1632
2224
|
}
|
|
@@ -1638,11 +2230,16 @@ function adapterOperationUsageSummary(result) {
|
|
|
1638
2230
|
}
|
|
1639
2231
|
return result.usage;
|
|
1640
2232
|
}
|
|
1641
|
-
function
|
|
1642
|
-
|
|
1643
|
-
|
|
1644
|
-
|
|
1645
|
-
|
|
2233
|
+
/**
 * Builds the usage summary for an attempt from the workload's usage and the
 * engine result's usage.
 *
 * The workload usage is passed through `completeUsageSummary`. The result
 * usage is tagged with the "engine" role and merged in only when the
 * normalized workload usage has no `engine` entry of its own.
 *
 * @param {*} workloadUsage - Usage reported for the workload.
 * @param {*} resultUsage - Usage carried by the engine result.
 * @returns {*} Whatever `mergeUsageSummaries` returns for the two summaries.
 */
function attemptUsageSummary(workloadUsage, resultUsage) {
    const workloadSummary = completeUsageSummary(workloadUsage);
    let legacyEngineSummary;
    if (!workloadSummary?.engine) {
        legacyEngineSummary = assignUsageRole("engine", resultUsage);
    }
    return mergeUsageSummaries([workloadSummary, legacyEngineSummary]);
}
|
|
2240
|
+
/**
 * Reports whether a usage value, once passed through `completeUsageSummary`,
 * carries a truthy `optimizer`, `runner` or `engine` entry.
 *
 * @param {*} usage - Usage value to inspect.
 * @returns {boolean} `true` when at least one of the three role entries is truthy.
 */
function hasExplicitUsageRole(usage) {
    const summary = completeUsageSummary(usage);
    if (!summary) {
        return false;
    }
    return ["optimizer", "runner", "engine"].some((role) => Boolean(summary[role]));
}
|
|
1647
2244
|
function createSubjectPatchFromResult(result, spec) {
|
|
1648
2245
|
if (result.subjectPatch) {
|
|
@@ -1720,47 +2317,103 @@ export async function stageWorkbenchRunWorkload(root, workload) {
|
|
|
1720
2317
|
fs
|
|
1721
2318
|
.rm(runtimePrivateDir(root), { recursive: true, force: true })
|
|
1722
2319
|
.catch(() => undefined),
|
|
1723
|
-
fs
|
|
1724
|
-
.rm(runtimeLogsDir(root), { recursive: true, force: true })
|
|
1725
|
-
.catch(() => undefined),
|
|
1726
2320
|
]);
|
|
1727
2321
|
await fs.mkdir(inputDir(root), { recursive: true });
|
|
1728
2322
|
await fs.mkdir(outputDir(root), { recursive: true });
|
|
1729
2323
|
if (purpose === "attempt") {
|
|
1730
|
-
assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
|
|
1731
2324
|
await fs.mkdir(subjectDir(root), { recursive: true });
|
|
1732
2325
|
await fs.mkdir(caseDir(root), { recursive: true });
|
|
1733
|
-
await fs.mkdir(runtimeLogsAgentDir(root), { recursive: true });
|
|
1734
|
-
await fs.mkdir(runtimeLogsVerifierDir(root), { recursive: true });
|
|
1735
2326
|
const engineCase = requireWorkloadEngineCase(workload, "Attempt staging");
|
|
1736
2327
|
await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
|
|
1737
|
-
await writeSurfaceFiles(caseDir(root),
|
|
1738
|
-
await writeSurfaceFiles(root, workload.subjectFiles);
|
|
2328
|
+
await writeSurfaceFiles(caseDir(root), engineCasePublicFiles(engineCase));
|
|
1739
2329
|
return;
|
|
1740
2330
|
}
|
|
1741
2331
|
if (purpose === "improve") {
|
|
1742
|
-
assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
|
|
1743
2332
|
await fs.mkdir(subjectDir(root), { recursive: true });
|
|
1744
2333
|
await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
|
|
1745
|
-
await writeSurfaceFiles(root, workload.subjectFiles);
|
|
1746
2334
|
await fs.mkdir(tracesDir(root), { recursive: true });
|
|
1747
2335
|
await writeSurfaceFiles(tracesDir(root), workload.traceFiles);
|
|
1748
2336
|
}
|
|
1749
2337
|
}
|
|
1750
|
-
async function
|
|
2338
|
+
/**
 * Writes the engine case's private files into the engine-private runtime
 * directory. Only acts for workloads whose execution purpose is "attempt";
 * any other purpose is a no-op.
 *
 * @param {string} root - Workspace root.
 * @param {object} workload - Workload providing the purpose and engine case.
 * @returns {Promise<void>}
 */
async function stageWorkbenchEnginePrivateFiles(root, workload) {
    const isAttempt = readWorkloadExecutionPurpose(workload) === "attempt";
    if (!isAttempt) {
        return;
    }
    const fs = await importNodeModule(nodeBuiltin("fs/promises"));
    const enginePrivateRoot = runtimeEnginePrivateDir(root);
    await fs.mkdir(enginePrivateRoot, { recursive: true });
    // Resolve the engine case only after the directory exists, as before.
    const engineCase = requireWorkloadEngineCase(workload, "Engine-private staging");
    await writeSurfaceFiles(enginePrivateRoot, engineCasePrivateFiles(engineCase));
}
|
|
2346
|
+
/**
 * Writes the given files into the workspace root, skipping every file whose
 * path `isMutableWorkspaceSnapshotPath` rejects.
 *
 * @param {string} root - Workspace root to write into.
 * @param {Array<{ path: string }>} files - Candidate files.
 * @returns {Promise<void>}
 */
async function stageInitialWorkspaceFiles(root, files) {
    const mutableFiles = files.filter((candidate) => isMutableWorkspaceSnapshotPath(candidate.path));
    await writeSurfaceFiles(root, mutableFiles);
}
|
|
2349
|
+
/**
 * Reads the files under `root` and returns those accepted by
 * `isMutableWorkspaceSnapshotPath`, ordered by path via `localeCompare`.
 *
 * @param {string} root - Directory to read.
 * @returns {Promise<Array<{ path: string }>>} Filtered, path-sorted files.
 */
async function readMutableWorkspaceSnapshotFiles(root) {
    const surfaceFiles = await readSurfaceFiles(root);
    const mutableFiles = surfaceFiles.filter((entry) => isMutableWorkspaceSnapshotPath(entry.path));
    // `filter` produced a fresh array, so sorting it in place is safe.
    mutableFiles.sort((a, b) => a.path.localeCompare(b.path));
    return mutableFiles;
}
|
|
2354
|
+
/**
 * Decides whether a workspace-relative path may be part of a mutable
 * workspace snapshot.
 *
 * After `normalizeRelativePath`, a path is rejected when it is empty, starts
 * with `../`, or is (or lies under) one of the reserved top-level entries
 * `input`, `private`, `output` or `.workbench`.
 *
 * @param {string} filePath - Path to classify.
 * @returns {boolean} `true` when the path is allowed in a snapshot.
 */
function isMutableWorkspaceSnapshotPath(filePath) {
    const reservedRoots = ["input", "private", "output", ".workbench"];
    const normalized = normalizeRelativePath(filePath);
    if (!normalized || normalized.startsWith("../")) {
        return false;
    }
    return !reservedRoots.some((reserved) => normalized === reserved || normalized.startsWith(`${reserved}/`));
}
|
|
2367
|
+
/**
 * Copies each requested adapter's files into
 * `<root>/.workbench/adapters/<adapterId>` and returns the resolved location
 * of every adapter directory.
 *
 * Source roots come from `hostAdapterSourceRoots`; a file belongs to an
 * adapter when `adapterFilePathWithinRoot` yields a non-null relative path.
 * Any existing target directory is removed first (removal errors are
 * ignored) and then recreated.
 *
 * @param {string} root - Workspace root.
 * @param {Array<{ path: string }>} adapterFiles - All available adapter files.
 * @param {Set<string>} adapterIds - Ids of the adapters to materialize.
 * @returns {Promise<Map<string, string>>} Adapter id to `realpath` of its
 *   directory; empty when there are no files or no ids.
 */
async function materializeHostAdapterRoots(root, adapterFiles, adapterIds) {
    const nothingToDo = adapterFiles.length === 0 || adapterIds.size === 0;
    if (nothingToDo) {
        return new Map();
    }
    const fs = await importNodeModule(nodeBuiltin("fs/promises"));
    const path = await importNodeModule(nodeBuiltin("path"));
    const materialized = new Map();
    for (const [adapterId, sourceRoot] of hostAdapterSourceRoots(adapterFiles, adapterIds)) {
        const targetRoot = path.join(root, ".workbench", "adapters", adapterId);
        // Re-root every file that lives under this adapter's source directory.
        const rerooted = [];
        for (const file of adapterFiles) {
            const relativePath = adapterFilePathWithinRoot(file.path, sourceRoot);
            if (relativePath !== null) {
                rerooted.push({ ...file, path: relativePath });
            }
        }
        await fs.rm(targetRoot, { recursive: true, force: true }).catch(() => undefined);
        await fs.mkdir(targetRoot, { recursive: true });
        await writeSurfaceFiles(targetRoot, rerooted);
        materialized.set(adapterId, await fs.realpath(targetRoot));
    }
    return materialized;
}
|
|
2390
|
+
/**
 * Finds, for each requested adapter id, the directory that contains its
 * `workbench.adapter.yaml` manifest.
 *
 * Only files whose final path segment is exactly `workbench.adapter.yaml`
 * are treated as manifests. A plain suffix check would also accept names such
 * as `old-workbench.adapter.yaml` and derive a bogus root like `dir/old-`.
 * When several manifests declare the same id, the last one in `adapterFiles`
 * wins.
 *
 * @param {Array<{ path: string, content: * }>} adapterFiles - Candidate files;
 *   manifest `content` is handed to `parseWorkbenchAdapterManifest`.
 * @param {Set<string>} adapterIds - Adapter ids of interest.
 * @returns {Map<string, string>} Adapter id to source root, where `""` means
 *   the manifest sits at the top level.
 */
function hostAdapterSourceRoots(adapterFiles, adapterIds) {
    const manifestName = "workbench.adapter.yaml";
    const roots = new Map();
    for (const file of adapterFiles) {
        const normalized = normalizeRelativePath(file.path);
        const isManifest = normalized === manifestName || normalized.endsWith(`/${manifestName}`);
        if (!isManifest) {
            continue;
        }
        const manifest = parseWorkbenchAdapterManifest(file.content);
        if (!adapterIds.has(manifest.id)) {
            continue;
        }
        // Drop the manifest file name and any trailing slashes to get its directory.
        const sourceRoot = normalized.slice(0, -manifestName.length).replace(/\/+$/u, "");
        roots.set(manifest.id, sourceRoot);
    }
    return roots;
}
|
|
2408
|
+
/**
 * Expresses an adapter file path relative to its adapter source root.
 *
 * @param {string} filePath - File path, normalized with `normalizeRelativePath`.
 * @param {string} sourceRoot - Adapter source root; a falsy value means the
 *   normalized path is returned unchanged.
 * @returns {string | null} The path below `sourceRoot`, or `null` when the
 *   file does not live under `<sourceRoot>/`.
 */
function adapterFilePathWithinRoot(filePath, sourceRoot) {
    const relative = normalizeRelativePath(filePath);
    if (!sourceRoot) {
        return relative;
    }
    const prefix = `${sourceRoot}/`;
    return relative.startsWith(prefix) ? relative.slice(prefix.length) : null;
}
|
|
1765
2418
|
async function readHostedRunFailureResult(root, workload, options) {
|
|
1766
2419
|
const traceFiles = await readRuntimeTraceFiles(root, workload);
|
|
@@ -1788,16 +2441,16 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
|
|
|
1788
2441
|
const primaryOperation = purpose === "improve"
|
|
1789
2442
|
? "optimizer.improve"
|
|
1790
2443
|
: "engine.run";
|
|
1791
|
-
const primaryResult = [...(options.
|
|
2444
|
+
const primaryResult = [...(options.operationResults ?? [])]
|
|
1792
2445
|
.reverse()
|
|
1793
2446
|
.find((result) => result.operation === primaryOperation);
|
|
1794
2447
|
const resultPayload = jsonRecord(primaryResult?.value);
|
|
1795
2448
|
const usage = mergeUsageSummaries([
|
|
1796
2449
|
options.usage,
|
|
1797
|
-
...(options.
|
|
2450
|
+
...(options.operationResults ?? []).map(adapterOperationUsageSummary),
|
|
1798
2451
|
]);
|
|
1799
|
-
const metrics =
|
|
1800
|
-
const cases =
|
|
2452
|
+
const metrics = normalizeResultMetrics(resultPayload.metrics);
|
|
2453
|
+
const cases = normalizeResultCases(resultPayload.cases);
|
|
1801
2454
|
const includeResultScoring = purpose === "attempt";
|
|
1802
2455
|
const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
|
|
1803
2456
|
const subjectPatch = purpose === "improve" ? primaryResult?.value : undefined;
|
|
@@ -1809,6 +2462,7 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
|
|
|
1809
2462
|
return {
|
|
1810
2463
|
files,
|
|
1811
2464
|
fileChanges: declaredChanges,
|
|
2465
|
+
...(options.operationResults ? { operationResults: [...options.operationResults] } : {}),
|
|
1812
2466
|
...(subjectPatch ? { subjectPatch } : {}),
|
|
1813
2467
|
...(engineResult ? { result: engineResult } : {}),
|
|
1814
2468
|
...(includeResultScoring && metrics ? { metrics } : {}),
|
|
@@ -1835,10 +2489,10 @@ async function readRuntimeTraceFiles(root, workload) {
|
|
|
1835
2489
|
const path = await importNodeModule(nodeBuiltin("path"));
|
|
1836
2490
|
const traceRoot = path.join(outputDir(root), ".workbench", "traces", workload.job.id);
|
|
1837
2491
|
const purpose = readWorkloadExecutionPurpose(workload);
|
|
1838
|
-
const outputTraceRoot =
|
|
2492
|
+
const outputTraceRoot = workbenchTraceExecutionDirectory({
|
|
1839
2493
|
sequence: 1,
|
|
1840
2494
|
runId: workload.job.runId,
|
|
1841
|
-
|
|
2495
|
+
purpose,
|
|
1842
2496
|
});
|
|
1843
2497
|
return (await readSurfaceFiles(traceRoot)).map((file) => ({
|
|
1844
2498
|
...file,
|
|
@@ -1868,13 +2522,13 @@ function createHostedWorkloadShellCommand(root, command, prefix = "", okExitCode
|
|
|
1868
2522
|
'exit "$status"',
|
|
1869
2523
|
].join("; ");
|
|
1870
2524
|
}
|
|
1871
|
-
async function
|
|
2525
|
+
/**
 * Deletes the adapter operation result file from the workspace's output
 * directory. Removal is best-effort: a rejected `rm` is ignored.
 *
 * @param {string} root - Workspace root.
 * @returns {Promise<void>}
 */
async function resetHostedWorkloadStepOutput(root) {
    const fsPromises = await importNodeModule(nodeBuiltin("fs/promises"));
    const resultPath = workbenchAdapterOperationResultPath(outputDir(root));
    const removal = fsPromises.rm(resultPath, { force: true });
    await removal.catch(() => undefined);
}
|
|
1877
|
-
async function writeWorkbenchAdapterRequest(root, workload, execution,
|
|
2531
|
+
async function writeWorkbenchAdapterRequest(root, workload, execution, step, auth, manifests) {
|
|
1878
2532
|
const [fs, path] = await Promise.all([
|
|
1879
2533
|
importNodeModule(nodeBuiltin("fs/promises")),
|
|
1880
2534
|
importNodeModule(nodeBuiltin("path")),
|
|
@@ -1882,13 +2536,13 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
|
|
|
1882
2536
|
const requestPath = path.join(root, ".workbench", "request.json");
|
|
1883
2537
|
await fs.mkdir(path.dirname(requestPath), { recursive: true });
|
|
1884
2538
|
const casePrompt = workload.engineCaseSpec?.prompt;
|
|
1885
|
-
const adapter =
|
|
2539
|
+
const adapter = step.adapter ?? execution.adapter;
|
|
1886
2540
|
const subjectCommand = adapterProtocolCommandSpec(workload.spec.run, "subject.run", manifests).command;
|
|
1887
2541
|
await fs.writeFile(requestPath, `${JSON.stringify({
|
|
1888
2542
|
protocol: "workbench.adapter.v3",
|
|
1889
2543
|
id: execution.id,
|
|
1890
2544
|
jobId: workload.job.id,
|
|
1891
|
-
operation:
|
|
2545
|
+
operation: step.operation,
|
|
1892
2546
|
invocation: {
|
|
1893
2547
|
use: adapter.use,
|
|
1894
2548
|
with: adapterConfigRecord(adapter, manifests),
|
|
@@ -1903,6 +2557,7 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
|
|
|
1903
2557
|
subject: {
|
|
1904
2558
|
id: workload.subjectId,
|
|
1905
2559
|
path: workload.spec.subject.files.path,
|
|
2560
|
+
...(workload.spec.subject.prepare ? { prepare: { ...workload.spec.subject.prepare } } : {}),
|
|
1906
2561
|
run: {
|
|
1907
2562
|
...workload.spec.run,
|
|
1908
2563
|
command: subjectCommand,
|
|
@@ -1923,14 +2578,12 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, phase, au
|
|
|
1923
2578
|
},
|
|
1924
2579
|
paths: {
|
|
1925
2580
|
workspace: root,
|
|
1926
|
-
cwd: root,
|
|
1927
2581
|
output: outputDir(root),
|
|
1928
2582
|
result: workbenchAdapterOperationResultPath(outputDir(root)),
|
|
1929
2583
|
subject: subjectDir(root),
|
|
1930
2584
|
...(workload.engineCaseSpec ? { case: caseDir(root) } : {}),
|
|
1931
2585
|
traces: tracesDir(root),
|
|
1932
|
-
...(
|
|
1933
|
-
logs: runtimeLogsDir(root),
|
|
2586
|
+
...(step.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
|
|
1934
2587
|
},
|
|
1935
2588
|
}, null, 2)}\n`);
|
|
1936
2589
|
return requestPath;
|
|
@@ -1945,7 +2598,29 @@ function requireOptimizerEdits(spec) {
|
|
|
1945
2598
|
}
|
|
1946
2599
|
return edits;
|
|
1947
2600
|
}
|
|
1948
|
-
function
|
|
2601
|
+
/**
 * Builds the environment for an adapter process.
 *
 * Starts from `createHostedWorkloadBaseEnv()`, adds the request/output/result
 * locations, and, when `options.adapterRoot` is set, exposes the adapter and
 * workspace roots and puts the adapter's `node_modules/.bin` first on `PATH`.
 * `adapterEnv` is applied next and `runtimeEnv` last, so runtime values win
 * over adapter values, and both win over everything set here.
 *
 * @param {string} root - Workspace root.
 * @param {string} adapterRequestPath - Path exported as `WORKBENCH_ADAPTER_REQUEST`.
 * @param {Record<string, string>} [adapterEnv] - Adapter-provided variables.
 * @param {{ adapterRoot?: string }} [options] - Optional adapter root directory.
 * @param {Record<string, string>} [runtimeEnv] - Runtime-provided variables (highest precedence).
 * @returns {Record<string, string>} The assembled environment.
 */
function createHostedWorkloadAdapterEnv(root, adapterRequestPath, adapterEnv = {}, options = {}, runtimeEnv = {}) {
    const env = createHostedWorkloadBaseEnv();
    env.WORKBENCH_ADAPTER_REQUEST = adapterRequestPath;
    env.WORKBENCH_OUTPUT = outputDir(root);
    env.WORKBENCH_RESULT = workbenchAdapterOperationResultPath(outputDir(root));
    if (options.adapterRoot) {
        env.WORKBENCH_ADAPTER_ROOT = options.adapterRoot;
        env.WORKBENCH_WORKSPACE_ROOT = root;
        // Use the platform delimiter, matching how the base env builds PATH.
        env.PATH = [
            `${options.adapterRoot}/node_modules/.bin`,
            env.PATH,
        ].filter(Boolean).join(path.delimiter);
    }
    Object.assign(env, adapterEnv);
    Object.assign(env, runtimeEnv);
    return env;
}
|
|
2618
|
+
/**
 * Builds the environment for the subject prepare command: the base hosted
 * workload environment plus `WORKBENCH_OUTPUT`, set to the workspace's
 * output directory.
 *
 * @param {string} root - Workspace root.
 * @returns {Record<string, string>} The environment object.
 */
function createHostedWorkloadPrepareEnv(root) {
    return Object.assign(createHostedWorkloadBaseEnv(), {
        WORKBENCH_OUTPUT: outputDir(root),
    });
}
|
|
2623
|
+
function createHostedWorkloadBaseEnv() {
|
|
1949
2624
|
const env = {};
|
|
1950
2625
|
for (const [key, value] of Object.entries(process.env)) {
|
|
1951
2626
|
if (typeof value === "string") {
|
|
@@ -1957,20 +2632,52 @@ function createHostedWorkloadPhaseEnv(root, adapterRequestPath, adapterEnv = {})
|
|
|
1957
2632
|
delete env[key];
|
|
1958
2633
|
}
|
|
1959
2634
|
}
|
|
1960
|
-
const runtimeBins = [
|
|
2635
|
+
const runtimeBins = uniquePathEntries([
|
|
2636
|
+
...nodeModuleBinDirsForAncestors(process.cwd()),
|
|
2637
|
+
...nodeModuleBinDirsForAncestors(path.dirname(fileURLToPath(import.meta.url))),
|
|
2638
|
+
"/app/node_modules/.bin",
|
|
1961
2639
|
"/workbench-runtime/node_modules/.bin",
|
|
1962
2640
|
"/workbench-runtime/products/workbench/node_modules/.bin",
|
|
1963
|
-
]
|
|
1964
|
-
|
|
1965
|
-
|
|
1966
|
-
|
|
1967
|
-
|
|
1968
|
-
|
|
1969
|
-
|
|
1970
|
-
|
|
1971
|
-
|
|
2641
|
+
]);
|
|
2642
|
+
env.PATH = uniquePathEntries([
|
|
2643
|
+
path.dirname(process.execPath),
|
|
2644
|
+
"/usr/local/sbin",
|
|
2645
|
+
"/usr/local/bin",
|
|
2646
|
+
"/usr/sbin",
|
|
2647
|
+
"/usr/bin",
|
|
2648
|
+
"/sbin",
|
|
2649
|
+
"/bin",
|
|
2650
|
+
...runtimeBins,
|
|
2651
|
+
...(process.env.PATH ? process.env.PATH.split(path.delimiter) : []),
|
|
2652
|
+
]).join(path.delimiter);
|
|
1972
2653
|
return env;
|
|
1973
2654
|
}
|
|
2655
|
+
/**
 * Lists `node_modules/.bin` directories for `start` and its ancestors,
 * nearest first.
 *
 * The walk stops at the filesystem root or after 12 directories, whichever
 * comes first. No check is made that the directories exist.
 *
 * @param {string} start - Directory to start from; resolved to an absolute path.
 * @returns {string[]} Candidate `.bin` directories.
 */
function nodeModuleBinDirsForAncestors(start) {
    const maxLevels = 12;
    const binDirs = [];
    let directory = path.resolve(start);
    while (binDirs.length < maxLevels) {
        binDirs.push(path.join(directory, "node_modules", ".bin"));
        const parentDirectory = path.dirname(directory);
        if (parentDirectory === directory) {
            // `dirname` of the root is the root itself: nothing left to climb.
            break;
        }
        directory = parentDirectory;
    }
    return binDirs;
}
|
|
2668
|
+
/**
 * Trims each entry, drops the ones that are empty after trimming, and
 * removes duplicates while keeping the first occurrence's position.
 *
 * @param {Iterable<string>} entries - Raw path entries.
 * @returns {string[]} Trimmed, de-duplicated entries in first-seen order.
 */
function uniquePathEntries(entries) {
    // A Set iterates in insertion order, so first occurrences keep their place.
    const unique = new Set();
    for (const rawEntry of entries) {
        const candidate = rawEntry.trim();
        if (candidate) {
            unique.add(candidate);
        }
    }
    return [...unique];
}
|
|
1974
2681
|
function readWorkloadExecutionPurpose(workload) {
|
|
1975
2682
|
const purpose = workbenchExecutionPurpose(workload.job);
|
|
1976
2683
|
if (purpose === "improve" || purpose === "attempt") {
|
|
@@ -2005,35 +2712,6 @@ function runtimePrivateDir(root) {
|
|
|
2005
2712
|
/**
 * Returns the `engine` subdirectory of the runtime private directory.
 *
 * @param {string} root - Workspace root.
 * @returns {string} `<runtimePrivateDir(root)>/engine`.
 */
function runtimeEnginePrivateDir(root) {
    const privateRoot = runtimePrivateDir(root);
    return `${privateRoot}/engine`;
}
|
|
2008
|
-
function runtimeLogsDir(root) {
|
|
2009
|
-
return `${root}/logs`;
|
|
2010
|
-
}
|
|
2011
|
-
function runtimeLogsAgentDir(root) {
|
|
2012
|
-
return `${runtimeLogsDir(root)}/agent`;
|
|
2013
|
-
}
|
|
2014
|
-
function runtimeLogsVerifierDir(root) {
|
|
2015
|
-
return `${runtimeLogsDir(root)}/verifier`;
|
|
2016
|
-
}
|
|
2017
|
-
function assertMutableWorkspaceFiles(files, label) {
|
|
2018
|
-
const reserved = files
|
|
2019
|
-
.map((file) => normalizeRelativePath(file.path))
|
|
2020
|
-
.filter(isRuntimeReservedWorkspacePath);
|
|
2021
|
-
if (reserved.length > 0) {
|
|
2022
|
-
throw new Error(`${label} cannot target runtime-reserved workspace paths: ${reserved.join(", ")}.`);
|
|
2023
|
-
}
|
|
2024
|
-
}
|
|
2025
|
-
function isRuntimeReservedWorkspacePath(normalizedPath) {
|
|
2026
|
-
return normalizedPath === ".workbench" ||
|
|
2027
|
-
normalizedPath.startsWith(".workbench/") ||
|
|
2028
|
-
normalizedPath === "input" ||
|
|
2029
|
-
normalizedPath.startsWith("input/") ||
|
|
2030
|
-
normalizedPath === "output" ||
|
|
2031
|
-
normalizedPath.startsWith("output/") ||
|
|
2032
|
-
normalizedPath === "logs" ||
|
|
2033
|
-
normalizedPath.startsWith("logs/") ||
|
|
2034
|
-
normalizedPath === "private" ||
|
|
2035
|
-
normalizedPath.startsWith("private/");
|
|
2036
|
-
}
|
|
2037
2715
|
async function writeSurfaceFiles(root, files) {
|
|
2038
2716
|
const fs = await importNodeModule(nodeBuiltin("fs/promises"));
|
|
2039
2717
|
const path = await importNodeModule(nodeBuiltin("path"));
|
|
@@ -2097,7 +2775,7 @@ function encodeSurfaceSnapshotContent(body, utf8Decoder) {
|
|
|
2097
2775
|
};
|
|
2098
2776
|
}
|
|
2099
2777
|
}
|
|
2100
|
-
function
|
|
2778
|
+
function normalizeResultMetrics(value) {
|
|
2101
2779
|
if (!value || typeof value !== "object" || Array.isArray(value)) {
|
|
2102
2780
|
return undefined;
|
|
2103
2781
|
}
|
|
@@ -2109,7 +2787,7 @@ function normalizeRewardMetrics(value) {
|
|
|
2109
2787
|
}
|
|
2110
2788
|
return Object.keys(metrics).length > 0 ? metrics : undefined;
|
|
2111
2789
|
}
|
|
2112
|
-
function
|
|
2790
|
+
function normalizeResultCases(value) {
|
|
2113
2791
|
if (!Array.isArray(value)) {
|
|
2114
2792
|
return undefined;
|
|
2115
2793
|
}
|
|
@@ -2122,7 +2800,7 @@ function normalizeRewardCases(value) {
|
|
|
2122
2800
|
if (!id) {
|
|
2123
2801
|
return [];
|
|
2124
2802
|
}
|
|
2125
|
-
const metrics =
|
|
2803
|
+
const metrics = normalizeResultMetrics(record.metrics) ?? {};
|
|
2126
2804
|
const status = record.status === "completed" || record.status === "error"
|
|
2127
2805
|
? record.status
|
|
2128
2806
|
: undefined;
|
|
@@ -2146,9 +2824,7 @@ function normalizeRewardCases(value) {
|
|
|
2146
2824
|
: undefined;
|
|
2147
2825
|
const pass = typeof criterionRecord.pass === "boolean"
|
|
2148
2826
|
? criterionRecord.pass
|
|
2149
|
-
:
|
|
2150
|
-
? score >= 0.5
|
|
2151
|
-
: undefined;
|
|
2827
|
+
: undefined;
|
|
2152
2828
|
if (!criterionId || score === undefined || pass === undefined) {
|
|
2153
2829
|
return [];
|
|
2154
2830
|
}
|
|
@@ -2261,13 +2937,13 @@ function evaluateSample(args) {
|
|
|
2261
2937
|
if (typeof sampleScore !== "number" || !Number.isFinite(sampleScore)) {
|
|
2262
2938
|
throw new Error("Evaluation sample requires an engine result with a finite numeric score.");
|
|
2263
2939
|
}
|
|
2264
|
-
const cases = args.workload.cases?.length ? args.workload.cases : undefined;
|
|
2265
2940
|
const metrics = args.workload.metrics ?? {
|
|
2266
2941
|
score: sampleScore,
|
|
2267
2942
|
};
|
|
2268
2943
|
if (metrics.score === undefined) {
|
|
2269
2944
|
metrics.score = sampleScore;
|
|
2270
2945
|
}
|
|
2946
|
+
const cases = args.workload.cases?.length ? args.workload.cases : undefined;
|
|
2271
2947
|
const feedback = {
|
|
2272
2948
|
...(args.workload.summary !== undefined
|
|
2273
2949
|
? { summary: args.workload.summary }
|
|
@@ -2295,7 +2971,7 @@ function evaluateSample(args) {
|
|
|
2295
2971
|
feedback,
|
|
2296
2972
|
};
|
|
2297
2973
|
}
|
|
2298
|
-
function normalizeSampleJobOutput(value
|
|
2974
|
+
function normalizeSampleJobOutput(value) {
|
|
2299
2975
|
if (!value || typeof value !== "object" || Array.isArray(value)) {
|
|
2300
2976
|
return null;
|
|
2301
2977
|
}
|
|
@@ -2314,9 +2990,6 @@ function normalizeSampleJobOutput(value, fallbackFiles = []) {
|
|
|
2314
2990
|
!Number.isFinite(record.attemptIndex)) {
|
|
2315
2991
|
return null;
|
|
2316
2992
|
}
|
|
2317
|
-
const sampleFiles = files.length > 0
|
|
2318
|
-
? files
|
|
2319
|
-
: fallbackFiles.map((file) => ({ ...file }));
|
|
2320
2993
|
return {
|
|
2321
2994
|
subjectId: record.subjectId,
|
|
2322
2995
|
attemptIndex: record.attemptIndex,
|
|
@@ -2324,10 +2997,10 @@ function normalizeSampleJobOutput(value, fallbackFiles = []) {
|
|
|
2324
2997
|
fileChanges: Array.isArray(record.fileChanges)
|
|
2325
2998
|
? record.fileChanges.filter((entry) => typeof entry === "string")
|
|
2326
2999
|
: [],
|
|
2327
|
-
files
|
|
3000
|
+
files,
|
|
2328
3001
|
traces: Array.isArray(record.traces)
|
|
2329
3002
|
? record.traces.filter((entry) => typeof entry === "string")
|
|
2330
|
-
: traceFilePaths(
|
|
3003
|
+
: traceFilePaths(files),
|
|
2331
3004
|
};
|
|
2332
3005
|
}
|
|
2333
3006
|
function normalizeEvaluationSampleOutputs(args) {
|
|
@@ -2498,8 +3171,16 @@ function compareSampleOutputs(left, right) {
|
|
|
2498
3171
|
}
|
|
2499
3172
|
return left.sample.id.localeCompare(right.sample.id);
|
|
2500
3173
|
}
|
|
2501
|
-
function createEvaluationRecord(subjectId, rawSamples) {
|
|
2502
|
-
const samples = mergeEvaluationSampleRecords(rawSamples)
|
|
3174
|
+
function createEvaluationRecord(subjectId, subjectName, rawSamples) {
|
|
3175
|
+
const samples = mergeEvaluationSampleRecords(rawSamples).map((sample) => subjectName
|
|
3176
|
+
? {
|
|
3177
|
+
...sample,
|
|
3178
|
+
subject: {
|
|
3179
|
+
...sample.subject,
|
|
3180
|
+
label: subjectName,
|
|
3181
|
+
},
|
|
3182
|
+
}
|
|
3183
|
+
: sample);
|
|
2503
3184
|
const startedAt = minTimestamp(samples.flatMap((sample) => (sample.startedAt ? [sample.startedAt] : [])));
|
|
2504
3185
|
const finishedAt = maxTimestamp(samples.flatMap((sample) => (sample.finishedAt ? [sample.finishedAt] : [])));
|
|
2505
3186
|
const durationValues = samples.flatMap((sample) => typeof sample.durationMs === "number" ? [sample.durationMs] : []);
|
|
@@ -2513,6 +3194,7 @@ function createEvaluationRecord(subjectId, rawSamples) {
|
|
|
2513
3194
|
subject: {
|
|
2514
3195
|
id: subjectId,
|
|
2515
3196
|
kind: "subject",
|
|
3197
|
+
...(subjectName ? { label: subjectName } : {}),
|
|
2516
3198
|
},
|
|
2517
3199
|
status: samples.length > 0 && completedSampleCount === samples.length
|
|
2518
3200
|
? "completed"
|
|
@@ -2533,6 +3215,10 @@ function createEvaluationRecord(subjectId, rawSamples) {
|
|
|
2533
3215
|
samples,
|
|
2534
3216
|
};
|
|
2535
3217
|
}
|
|
3218
|
+
/**
 * Normalizes a subject display name by trimming it.
 *
 * @param {string | null | undefined} value - Raw display name.
 * @returns {string | null} The trimmed name, or `null` when the value is
 *   nullish or empty after trimming.
 */
function normalizedSubjectDisplayName(value) {
    const trimmed = value?.trim();
    if (!trimmed) {
        return null;
    }
    return trimmed;
}
|
|
2536
3222
|
function aggregateSampleMetrics(samples) {
|
|
2537
3223
|
const metricNames = new Set(samples.flatMap((sample) => Object.keys(sample.metrics ?? {})));
|
|
2538
3224
|
if (metricNames.size === 0) {
|
|
@@ -2563,14 +3249,14 @@ function mergeEvaluationSampleRecords(samples) {
|
|
|
2563
3249
|
function mergeEvaluationSampleGroup(group) {
|
|
2564
3250
|
const first = group[0];
|
|
2565
3251
|
if (group.length === 1) {
|
|
2566
|
-
return
|
|
3252
|
+
return first;
|
|
2567
3253
|
}
|
|
2568
3254
|
const startedAt = minTimestamp(group.flatMap((sample) => (sample.startedAt ? [sample.startedAt] : [])));
|
|
2569
3255
|
const finishedAt = maxTimestamp(group.flatMap((sample) => (sample.finishedAt ? [sample.finishedAt] : [])));
|
|
2570
3256
|
const durationMs = startedAt && finishedAt
|
|
2571
3257
|
? Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt))
|
|
2572
3258
|
: undefined;
|
|
2573
|
-
const cases = group.flatMap((sample) =>
|
|
3259
|
+
const cases = group.flatMap((sample) => sample.cases ?? []);
|
|
2574
3260
|
const metrics = aggregateSampleGroupMetrics(group);
|
|
2575
3261
|
const usage = mergeUsageSummaries(group.map((sample) => sample.usage));
|
|
2576
3262
|
const errors = group.flatMap((sample) => sample.error ? [sample.error] : []);
|
|
@@ -2588,22 +3274,6 @@ function mergeEvaluationSampleGroup(group) {
|
|
|
2588
3274
|
...(cases.length > 0 ? { cases } : {}),
|
|
2589
3275
|
};
|
|
2590
3276
|
}
|
|
2591
|
-
function normalizeSingleCaseDurations(sample) {
|
|
2592
|
-
if (!sample.cases) {
|
|
2593
|
-
return sample;
|
|
2594
|
-
}
|
|
2595
|
-
const cases = normalizeCaseDurations(sample);
|
|
2596
|
-
return cases.length === sample.cases.length
|
|
2597
|
-
? { ...sample, cases }
|
|
2598
|
-
: sample;
|
|
2599
|
-
}
|
|
2600
|
-
function normalizeCaseDurations(sample) {
|
|
2601
|
-
return (sample.cases ?? []).map((caseResult) => (typeof caseResult.durationMs === "number" ||
|
|
2602
|
-
sample.cases?.length !== 1 ||
|
|
2603
|
-
typeof sample.durationMs !== "number"
|
|
2604
|
-
? caseResult
|
|
2605
|
-
: { ...caseResult, durationMs: sample.durationMs }));
|
|
2606
|
-
}
|
|
2607
3277
|
function aggregateSampleGroupMetrics(group) {
|
|
2608
3278
|
const metricNames = new Set(group.flatMap((sample) => Object.keys(sample.metrics ?? {})));
|
|
2609
3279
|
if (metricNames.size === 0) {
|