@workbench-ai/workbench-core 0.0.46
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapter-auth.d.ts +63 -0
- package/dist/adapter-auth.d.ts.map +1 -0
- package/dist/adapter-auth.js +244 -0
- package/dist/execution-events.d.ts +53 -0
- package/dist/execution-events.d.ts.map +1 -0
- package/dist/execution-events.js +195 -0
- package/dist/execution-graph.d.ts +27 -0
- package/dist/execution-graph.d.ts.map +1 -0
- package/dist/execution-graph.js +126 -0
- package/dist/execution-jobs.d.ts +70 -0
- package/dist/execution-jobs.d.ts.map +1 -0
- package/dist/execution-jobs.js +229 -0
- package/dist/execution-outputs.d.ts +9 -0
- package/dist/execution-outputs.d.ts.map +1 -0
- package/dist/execution-outputs.js +393 -0
- package/dist/execution-phases.d.ts +21 -0
- package/dist/execution-phases.d.ts.map +1 -0
- package/dist/execution-phases.js +262 -0
- package/dist/execution-runtime-types.d.ts +35 -0
- package/dist/execution-runtime-types.d.ts.map +1 -0
- package/dist/execution-runtime-types.js +1 -0
- package/dist/execution-scheduler.d.ts +31 -0
- package/dist/execution-scheduler.d.ts.map +1 -0
- package/dist/execution-scheduler.js +241 -0
- package/dist/execution-traces.d.ts +16 -0
- package/dist/execution-traces.d.ts.map +1 -0
- package/dist/execution-traces.js +164 -0
- package/dist/execution-usage.d.ts +12 -0
- package/dist/execution-usage.d.ts.map +1 -0
- package/dist/execution-usage.js +433 -0
- package/dist/generic-spec.d.ts +113 -0
- package/dist/generic-spec.d.ts.map +1 -0
- package/dist/generic-spec.js +656 -0
- package/dist/index.d.ts +160 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +2858 -0
- package/dist/model-prices-litellm.d.ts +9674 -0
- package/dist/model-prices-litellm.d.ts.map +1 -0
- package/dist/model-prices-litellm.js +9668 -0
- package/dist/runtime-utils.d.ts +18 -0
- package/dist/runtime-utils.d.ts.map +1 -0
- package/dist/runtime-utils.js +108 -0
- package/dist/sandbox-backends/docker.d.ts +5 -0
- package/dist/sandbox-backends/docker.d.ts.map +1 -0
- package/dist/sandbox-backends/docker.js +568 -0
- package/dist/sandbox-backends/index.d.ts +37 -0
- package/dist/sandbox-backends/index.d.ts.map +1 -0
- package/dist/sandbox-backends/index.js +79 -0
- package/dist/sandbox-backends/names.d.ts +6 -0
- package/dist/sandbox-backends/names.d.ts.map +1 -0
- package/dist/sandbox-backends/names.js +14 -0
- package/dist/sandbox-backends/template-images.d.ts +4 -0
- package/dist/sandbox-backends/template-images.d.ts.map +1 -0
- package/dist/sandbox-backends/template-images.js +48 -0
- package/dist/sandbox-inputs.d.ts +27 -0
- package/dist/sandbox-inputs.d.ts.map +1 -0
- package/dist/sandbox-inputs.js +220 -0
- package/dist/sandbox-plane.d.ts +89 -0
- package/dist/sandbox-plane.d.ts.map +1 -0
- package/dist/sandbox-plane.js +327 -0
- package/dist/subject-patch.d.ts +8 -0
- package/dist/subject-patch.d.ts.map +1 -0
- package/dist/subject-patch.js +63 -0
- package/dist/trace-files.d.ts +18 -0
- package/dist/trace-files.d.ts.map +1 -0
- package/dist/trace-files.js +94 -0
- package/environments/libreoffice-agent/Dockerfile +13 -0
- package/environments/libreoffice-python/Dockerfile +11 -0
- package/environments/node-22/Dockerfile +3 -0
- package/environments/python-3.12/Dockerfile +8 -0
- package/package.json +42 -0
- package/worker/sandbox-adapter-runner.cjs +275 -0
package/dist/index.js
ADDED
|
@@ -0,0 +1,2858 @@
|
|
|
1
|
+
import { createHash } from "node:crypto";
|
|
2
|
+
import os from "node:os";
|
|
3
|
+
import path from "node:path";
|
|
4
|
+
import YAML from "yaml";
|
|
5
|
+
import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
|
|
6
|
+
import { BENCHMARK_SPEC_FILE, engineCaseEnginePrivateFiles, engineCaseFilesForRuntimeInput, engineCaseSubjectVisibleFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchSubjectManifestPath, } from "./generic-spec.js";
|
|
7
|
+
import { attachSandboxMetadataToJob, createWorkbenchSandboxFileStore, isSurfaceSnapshotFile, readWorkbenchExecutionSpec, } from "./sandbox-inputs.js";
|
|
8
|
+
import { asRuntimeRecord, importNodeModule, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
|
|
9
|
+
import { executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
10
|
+
import { createSandboxBackendPlaneForProvider, } from "./sandbox-backends/index.js";
|
|
11
|
+
import { applyWorkbenchSubjectPatch } from "./subject-patch.js";
|
|
12
|
+
import { assignUsageRole, completeUsageSummary, mergeUsageSummaries, normalizeUsageSummary, usageStats, } from "./execution-usage.js";
|
|
13
|
+
import { traceFilePaths, workbenchTracePhaseDirectory, } from "./trace-files.js";
|
|
14
|
+
import { engineCaseForCase, } from "./execution-jobs.js";
|
|
15
|
+
import { createWorkbenchExecutionEventPublisher, publishCommandPhaseEvent, } from "./execution-events.js";
|
|
16
|
+
import { readWorkbenchExecutionPurpose } from "./execution-phases.js";
|
|
17
|
+
import { adapterAuthEnv, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
|
|
18
|
+
export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCaseEnginePrivateFiles, engineCaseFilesForRuntimeInput, engineCaseSubjectVisibleFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchSubjectManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
|
|
19
|
+
export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
|
|
20
|
+
export { adapterAuthEnv, createWorkbenchAdapterAuthBundle, defaultWorkbenchAdapterAuthStoreRoot, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, parseWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
|
|
21
|
+
export { asRuntimeRecord, importNodeModule, nodeBuiltin, normalizeWorkbenchWorkerId, normalizeRuntimeRegistry, quoteShellArg, resolveDockerRuntimeImageRef, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
|
|
22
|
+
export { assignUsageRole, extractExecutionUsageFromTrace, mergeUsageSummaries, } from "./execution-usage.js";
|
|
23
|
+
export { createWorkbenchProgressStdoutParser, publishWorkbenchProgressStdoutEnvelope, } from "./execution-events.js";
|
|
24
|
+
export { resolveSandboxTemplateImage, } from "./sandbox-backends/template-images.js";
|
|
25
|
+
export { readOutputTraceFiles, workbenchTracePhaseDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
|
|
26
|
+
export { assertWorkbenchAdapterOperationSupport, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterOperationIssues, collectWorkbenchAdapterOperationRequirements, ensureWorkbenchAdapterOutputDir, WORKBENCH_ADAPTER_RESULT_FILE, normalizeWorkbenchAdapterOperationRequest, normalizeWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, writeWorkbenchAdapterOperationResult, } from "@workbench-ai/workbench-protocol";
|
|
27
|
+
export { applyWorkbenchSubjectPatch, } from "./subject-patch.js";
|
|
28
|
+
export { createWorkbenchSandboxFileStore, createSandboxAdapterRequest, executionResultFromCompletedSandboxJob, materializeWorkbenchSandboxInput, readWorkbenchExecutionSpec, sanitizeWorkbenchExecutionJobForSandbox, } from "./sandbox-inputs.js";
|
|
29
|
+
export { compileWorkbenchExecutionGraph, } from "./execution-graph.js";
|
|
30
|
+
export { createBaselineSubjectExecution, createBaselineSubjectJob, createWorkbenchExecutionJob, expectedWorkbenchRunJobCount, engineCaseForCase, engineCaseIds, attemptJobCountForRunSpec, workbenchExecutionJobPurpose, MAX_WORKBENCH_RUN_BUDGET, planWorkbenchExecutionJobsForPurpose, validateWorkbenchRunEnvelope, workbenchExecutionJobId, } from "./execution-jobs.js";
|
|
31
|
+
export { addCapacity, capacityFits, runWorkbenchExecutionDag, subtractCapacity, workbenchJobDependencies, workbenchJobHostCost, workbenchJobResources, } from "./execution-scheduler.js";
|
|
32
|
+
export { assertWorkbenchExecutionIsolation, collectWorkbenchExecutionIsolationIssues, validateWorkbenchExecutionOutputPayloads, } from "./execution-outputs.js";
|
|
33
|
+
export { collectSandboxAllocationScopeIssues, collectExecutionCapabilityScopeIssues, collectSandboxHandleScopeIssues, createWorkbenchSandboxAllocation, createWorkbenchSandboxExecutionMetadata, createWorkbenchExecutionCapability, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
34
|
+
export { buildSubjectCasePhaseRefs, buildWorkbenchTracePhases, isWorkbenchPhaseActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-phases.js";
|
|
35
|
+
export { finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, } from "./execution-traces.js";
|
|
36
|
+
export { DOCKER_SANDBOX_BACKEND, assertSandboxHostHealthForProvider, createDockerSandboxBackendDescriptor, createDockerSandboxPlane, resolveWorkbenchSandboxProviderName, sandboxProviderAdmissionForResources, sandboxProviderDefaultMaxConcurrentJobs, sandboxProviderLeaseScope, sandboxHostHealthExpectationForProvider, } from "./sandbox-backends/index.js";
|
|
37
|
+
/**
 * Built-in environment versions shipped with the package.
 *
 * Every entry is marked `sourceType: "builtin"` / `status: "ready"` and uses
 * the same timestamp for `createdAt` and `updatedAt`. Each entry gets its own
 * `spec` and `resources` objects (nothing is shared between entries).
 */
export const DEFAULT_ENVIRONMENT_VERSIONS = (() => {
    const INITIAL_RELEASE = "2026-04-23T00:00:00.000Z";
    const AGENT_RELEASE = "2026-04-29T00:00:00.000Z";
    // Spreads the identity fields first so the resulting key order is
    // id, environmentId, name, imageRef, sourceHash, sourceType, status,
    // createdAt, updatedAt, spec.
    const builtinVersion = (identity, releasedAt, spec) => ({
        ...identity,
        sourceType: "builtin",
        status: "ready",
        createdAt: releasedAt,
        updatedAt: releasedAt,
        spec,
    });
    return [
        builtinVersion({
            id: "envv_python_3_12",
            environmentId: "env_python",
            name: "Python 3.12",
            imageRef: "docker://workbench/workbench-python-3.12:envv_python_3_12",
            sourceHash: "builtin:python-3.12",
        }, INITIAL_RELEASE, {
            base: "workbench/python-3.12",
            resources: { cpu: 2, memoryGb: 4, diskGb: 10, timeoutMinutes: 20 },
            network: "off",
        }),
        builtinVersion({
            id: "envv_libreoffice_python",
            environmentId: "env_libreoffice_python",
            name: "LibreOffice + Python",
            imageRef: "docker://workbench/workbench-libreoffice-python:envv_libreoffice_python",
            sourceHash: "builtin:libreoffice-python",
        }, INITIAL_RELEASE, {
            base: "workbench/python-3.12",
            resources: { cpu: 2, memoryGb: 4, diskGb: 10, timeoutMinutes: 30 },
            network: "off",
        }),
        builtinVersion({
            id: "envv_libreoffice_agent",
            environmentId: "env_libreoffice_agent",
            name: "LibreOffice + Agent",
            imageRef: "docker://workbench/workbench-libreoffice-agent:envv_libreoffice_agent",
            sourceHash: "builtin:libreoffice-agent",
        }, AGENT_RELEASE, {
            base: "workbench/python-3.12",
            resources: { cpu: 4, memoryGb: 8, diskGb: 20, timeoutMinutes: 60 },
            network: "on",
        }),
        builtinVersion({
            id: "envv_node_22",
            environmentId: "env_node",
            name: "Node 22",
            imageRef: "docker://workbench/workbench-node-22:envv_node_22",
            sourceHash: "builtin:node-22",
        }, INITIAL_RELEASE, {
            base: "workbench/node-22",
            resources: { cpu: 2, memoryGb: 4, diskGb: 10, timeoutMinutes: 20 },
            network: "off",
        }),
    ];
})();
|
|
123
|
+
/**
 * Built-in environments shipped with the package. Each one is flagged
 * `builtIn: true` and names its current version through `currentVersionId`.
 */
export const DEFAULT_ENVIRONMENTS = (() => {
    const INITIAL_RELEASE = "2026-04-23T00:00:00.000Z";
    const AGENT_RELEASE = "2026-04-29T00:00:00.000Z";
    // Key order of the produced records: id, name, description,
    // currentVersionId, builtIn, createdAt, updatedAt.
    const builtinEnvironment = (id, name, description, currentVersionId, releasedAt) => ({
        id,
        name,
        description,
        currentVersionId,
        builtIn: true,
        createdAt: releasedAt,
        updatedAt: releasedAt,
    });
    return [
        builtinEnvironment(
            "env_python",
            "Python",
            "Python runtime for scripts, data processing, and simple evaluators.",
            "envv_python_3_12",
            INITIAL_RELEASE,
        ),
        builtinEnvironment(
            "env_libreoffice_python",
            "LibreOffice + Python",
            "Python runtime with soffice for document, spreadsheet, and PDF-heavy evaluations.",
            "envv_libreoffice_python",
            INITIAL_RELEASE,
        ),
        builtinEnvironment(
            "env_libreoffice_agent",
            "LibreOffice + Agent",
            "Agent runtime with soffice and Python libraries for spreadsheet-heavy skill and rubric evaluations.",
            "envv_libreoffice_agent",
            AGENT_RELEASE,
        ),
        builtinEnvironment(
            "env_node",
            "Node",
            "Node runtime for JavaScript and TypeScript subjects.",
            "envv_node_22",
            INITIAL_RELEASE,
        ),
    ];
})();
|
|
161
|
+
/**
 * Builds the document descriptor for an authored workbench source.
 *
 * @param {object} args
 * @param {string} args.sourceYaml - Authored YAML text.
 * @param {string} [args.path] - Document path; defaults to "benchmark.yaml".
 * @param {Array} [args.sourceFiles] - Explicit source files, forwarded to
 *   `authoredSourceFilesForDocument`.
 * @param {Array} [args.cases] - Case inputs, summarized via `summarizeCaseInputs`.
 * @returns {object} `{ path, exists, source_yaml, source_files, spec, cases }`;
 *   `exists` is true when the YAML contains anything other than whitespace.
 */
export function loadAuthoredWorkbenchSourceDocument(args) {
    // The spec is parsed before anything else, so a parser failure surfaces
    // ahead of any other work.
    const parsedSpec = parseAuthoredWorkbenchSourceSpec(args.sourceYaml);
    const documentPath = args.path ?? "benchmark.yaml";
    const hasContent = args.sourceYaml.trim().length > 0;
    const sourceFiles = authoredSourceFilesForDocument(args);
    const caseSummaries = summarizeCaseInputs(args.cases ?? []);
    return {
        path: documentPath,
        exists: hasContent,
        source_yaml: args.sourceYaml,
        source_files: sourceFiles,
        spec: parsedSpec,
        cases: caseSummaries,
    };
}
|
|
172
|
+
/**
 * Picks the authored YAML files for a document.
 *
 * Explicit `args.sourceFiles` win when at least one of them is utf8-encoded
 * and sits at an authored-source YAML path; only `path` and `content` are
 * kept. Otherwise the files are derived by splitting `args.sourceYaml`.
 *
 * @param {object} args - Same argument object as the document loader.
 * @returns {Array<{path: string, content: string}>}
 */
function authoredSourceFilesForDocument(args) {
    const authored = [];
    for (const candidate of args.sourceFiles ?? []) {
        if (candidate.encoding === "utf8" && isAuthoredSourceYamlPath(candidate.path)) {
            authored.push({ path: candidate.path, content: candidate.content });
        }
    }
    return authored.length > 0 ? authored : splitAuthoredSourceYaml(args.sourceYaml);
}
|
|
185
|
+
/**
 * Splits a combined authored YAML document into per-file YAML sources.
 *
 * The `benchmark`, `subject` and `optimizer` top-level keys map to
 * BENCHMARK_SPEC_FILE, "subjects/current/subject.yaml" and
 * "optimizers/current.yaml" respectively. Sections that are not plain
 * records are omitted; a document that is not a plain record yields [].
 *
 * @param {string} sourceYaml
 * @returns {Array<{path: string, content: string}>} Each content ends with exactly one newline.
 */
function splitAuthoredSourceYaml(sourceYaml) {
    const isPlainRecord = (candidate) => Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
    const document = YAML.parse(sourceYaml);
    if (!isPlainRecord(document)) {
        return [];
    }
    const sections = [
        [BENCHMARK_SPEC_FILE, document.benchmark],
        ["subjects/current/subject.yaml", splitSubjectSourceRecord(document.subject)],
        ["optimizers/current.yaml", splitOptimizerSourceRecord(document.optimizer)],
    ];
    const files = [];
    for (const [filePath, section] of sections) {
        if (isPlainRecord(section)) {
            files.push({
                path: filePath,
                content: `${YAML.stringify(section).trimEnd()}\n`,
            });
        }
    }
    return files;
}
|
|
205
|
+
/**
 * Prepares the `subject` section for its own YAML file: works on a copy,
 * drops the `benchmark` and `path` keys, and rewrites adapter sources
 * relative to "subjects". Values that are not plain records are returned
 * unchanged.
 *
 * NOTE(review): the caller writes this record to
 * "subjects/current/subject.yaml" while paths are rewritten relative to
 * "subjects" — confirm that base directory is intended.
 *
 * @param {*} value
 * @returns {*} The rewritten copy, or `value` itself when it is not a record.
 */
function splitSubjectSourceRecord(value) {
    const subject = cloneYamlRecord(value);
    if (!subject) {
        return value;
    }
    for (const droppedKey of ["benchmark", "path"]) {
        delete subject[droppedKey];
    }
    rewriteAdapterSources(subject, "subjects");
    return subject;
}
|
|
215
|
+
/**
 * Prepares the `optimizer` section for its own YAML file: works on a copy
 * and rewrites adapter sources relative to "optimizers". Values that are
 * not plain records are returned unchanged.
 *
 * @param {*} value
 * @returns {*} The rewritten copy, or `value` itself when it is not a record.
 */
function splitOptimizerSourceRecord(value) {
    const optimizer = cloneYamlRecord(value);
    if (optimizer) {
        rewriteAdapterSources(optimizer, "optimizers");
        return optimizer;
    }
    return value;
}
|
|
223
|
+
/**
 * Deep-copies a plain record by round-tripping it through YAML.
 *
 * @param {*} value
 * @returns {object|null} The copy, or null when `value` is falsy, an array,
 *   or not an object.
 */
function cloneYamlRecord(value) {
    const isPlainRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isPlainRecord) {
        return null;
    }
    return YAML.parse(YAML.stringify(value));
}
|
|
228
|
+
/**
 * Rewrites local adapter source paths in `record.adapters` (in place) so
 * they are relative to `yamlDir`.
 *
 * Entries that are not strings, or that start with an `npm:` / `git:`
 * scheme (case-insensitive, ignoring surrounding whitespace), are kept as
 * they are. Does nothing when `record.adapters` is not an array.
 *
 * @param {object} record - Record whose `adapters` list is replaced.
 * @param {string} yamlDir - Directory the rewritten paths are relative to.
 * @returns {void}
 */
function rewriteAdapterSources(record, yamlDir) {
    if (!Array.isArray(record.adapters)) {
        return;
    }
    record.adapters = record.adapters.map((entry) => {
        if (typeof entry !== "string") {
            return entry;
        }
        // Classify and rewrite the same trimmed text; relativising the
        // untrimmed entry would embed stray whitespace in the path.
        const source = entry.trim();
        return /^(?:npm|git):/iu.test(source)
            ? entry
            : sourcePathRelativeTo(yamlDir, source);
    });
}
|
|
236
|
+
/**
 * Expresses `sourcePath` relative to `yamlDir` using POSIX separators.
 *
 * Backslashes are converted to "/" and one leading "./" is removed before
 * the path is relativised. Absolute paths are returned as they are (after
 * separator normalization): `path.posix.relative` resolves both arguments
 * against the current working directory, so relativising an absolute path
 * would make the result depend on where the process was started.
 *
 * @param {string} yamlDir - Directory the result is relative to.
 * @param {string} sourcePath - Authored source path.
 * @returns {string} The relative path, or "." when both locations coincide.
 */
function sourcePathRelativeTo(yamlDir, sourcePath) {
    const normalized = sourcePath.replace(/\\/gu, "/").replace(/^\.\//u, "");
    if (path.posix.isAbsolute(normalized)) {
        return normalized;
    }
    const relative = path.posix.relative(yamlDir, normalized);
    return relative || ".";
}
|
|
241
|
+
/**
 * Tells whether `filePath` is one of the authored YAML sources: the
 * benchmark spec file, a subject manifest path, or a `.yaml`/`.yml` file
 * directly inside `optimizers/` (extension matched case-insensitively).
 *
 * @param {string} filePath
 * @returns {boolean}
 */
function isAuthoredSourceYamlPath(filePath) {
    if (filePath === BENCHMARK_SPEC_FILE) {
        return true;
    }
    if (isWorkbenchSubjectManifestPath(filePath)) {
        return true;
    }
    return /^optimizers\/[^/]+\.ya?ml$/iu.test(filePath);
}
|
|
246
|
+
/**
 * One-line description of the optimizer configured in `spec.improve`.
 *
 * @param {object} spec
 * @returns {string} `adapter:<use>` or "optimizer not configured".
 */
function formatOptimizerSummary(spec) {
    if (!spec.improve) {
        return "optimizer not configured";
    }
    return `adapter:${spec.improve.use}`;
}
|
|
249
|
+
/**
 * One-line description of the engine-run adapter in `spec.engineRun`.
 *
 * @param {object} spec - Must carry an `engineRun` object.
 * @returns {string} `adapter:<use>`.
 */
function formatEngineRunSummary(spec) {
    const { use } = spec.engineRun;
    return `adapter:${use}`;
}
|
|
252
|
+
/**
 * Maps a runtime's egress setting to the environment network flag.
 *
 * @param {object} runtime
 * @returns {"off"|"on"} "off" only when `runtime.network.egress` is exactly
 *   "none"; every other value, including a missing setting, yields "on".
 */
function environmentNetwork(runtime) {
    if (runtime.network?.egress === "none") {
        return "off";
    }
    return "on";
}
|
|
256
|
+
/**
 * Extracts the numeric resource limits from `runtime.resources`.
 *
 * Only `cpu`, `memoryGb`, `diskGb` and `timeoutMinutes` are considered, and
 * only when their value has type number; the result lists them in that
 * order.
 *
 * @param {object} runtime
 * @returns {object|undefined} The numeric limits, or undefined when none is present.
 */
function environmentResources(runtime) {
    const declared = runtime.resources ?? {};
    const numericEntries = ["cpu", "memoryGb", "diskGb", "timeoutMinutes"]
        .filter((key) => typeof declared[key] === "number")
        .map((key) => [key, declared[key]]);
    return numericEntries.length > 0 ? Object.fromEntries(numericEntries) : undefined;
}
|
|
273
|
+
function adapterProtocolCommandSpec(adapter, operation, manifests = []) {
|
|
274
|
+
if (!/^[a-z][a-z0-9-]*$/u.test(adapter.use)) {
|
|
275
|
+
throw new Error(`Adapter id ${adapter.use} cannot be mapped to an executable command.`);
|
|
276
|
+
}
|
|
277
|
+
const manifest = manifests.find((entry) => entry.id === adapter.use);
|
|
278
|
+
return {
|
|
279
|
+
use: "command",
|
|
280
|
+
command: manifest ? workbenchAdapterOperationCommand(manifest, operation) : adapterCommandName(adapter.use),
|
|
281
|
+
};
|
|
282
|
+
}
|
|
283
|
+
/**
 * Describes the protocol phase for an execution.
 *
 * The operation is "optimizer.improve" for the "improve" purpose and
 * "subject.run" for every other purpose.
 *
 * @param {{purpose: string, adapter: object}} execution
 * @param {Array} manifests
 * @returns {{kind: *, label: string, operation: string, adapter: object, command: *}}
 */
function protocolPhaseForExecution(execution, manifests) {
    const { purpose, adapter } = execution;
    const kind = executionPurposeRole(purpose);
    let operation = "subject.run";
    if (purpose === "improve") {
        operation = "optimizer.improve";
    }
    const { command } = adapterProtocolCommandSpec(adapter, operation, manifests);
    return { kind, label: purpose, operation, adapter, command };
}
|
|
295
|
+
/**
 * Lists the phases of an attempt execution: a single "engine" phase that
 * runs the "engine.run" operation of the execution's adapter.
 *
 * @param {{adapter: object}} execution
 * @param {*} spec - Accepted but not used.
 * @param {Array} manifests
 * @returns {Array<object>} A one-element array holding the engine phase.
 */
function attemptPhasesForExecution(execution, spec, manifests) {
    void spec;
    const { adapter } = execution;
    const { command } = adapterProtocolCommandSpec(adapter, "engine.run", manifests);
    return [
        {
            kind: "engine",
            label: "engine",
            operation: "engine.run",
            adapter,
            command,
        },
    ];
}
|
|
306
|
+
/**
 * Produces a copy of the adapter's `with` configuration in which every
 * invocation found at a manifest slot carries a resolved `command`.
 *
 * The adapter's own manifest (matched by id) supplies the slots. A slot
 * whose value is an array has each record element replaced; a slot whose
 * value is a record is replaced as a whole. Without a manifest or slots the
 * plain copy is returned.
 *
 * @param {{use: string, with: *}} adapter
 * @param {Array<object>} [manifests]
 * @returns {*} The configuration copy (the original `adapter.with` is not modified).
 */
function adapterConfigRecord(adapter, manifests = []) {
    const config = cloneJsonRecord(jsonRecord(adapter.with));
    const slots = manifests.find((entry) => entry.id === adapter.use)?.slots;
    if (!slots) {
        return config;
    }
    for (const slot of Object.values(slots)) {
        const slotValue = jsonPointerValue(config, slot.path);
        if (Array.isArray(slotValue)) {
            // Array slots are patched element by element, in place.
            slotValue.forEach((item, index) => {
                const invocation = jsonRecord(item);
                if (invocation) {
                    slotValue[index] = invocationWithCommand(invocation, slot.operation, manifests);
                }
            });
            continue;
        }
        const invocation = jsonRecord(slotValue);
        if (invocation) {
            setJsonPointerValue(config, slot.path, invocationWithCommand(invocation, slot.operation, manifests));
        }
    }
    return config;
}
|
|
330
|
+
/**
 * Returns a copy of `invocation` with a `command` resolved for `operation`.
 *
 * The command comes from the manifest whose id equals `invocation.use`, or
 * from `adapterCommandName` when no such manifest exists. Invocations whose
 * `use` is not a non-empty string are returned untouched (same object).
 *
 * @param {object} invocation
 * @param {string} operation
 * @param {Array<{id: string}>} manifests
 * @returns {object}
 */
function invocationWithCommand(invocation, operation, manifests) {
    const adapterId = invocation.use;
    if (typeof adapterId !== "string" || adapterId === "") {
        return invocation;
    }
    const manifest = manifests.find((entry) => entry.id === adapterId);
    const command = manifest
        ? workbenchAdapterOperationCommand(manifest, operation)
        : adapterCommandName(adapterId);
    return { ...invocation, command };
}
|
|
341
|
+
/**
 * Deep-copies a JSON-compatible value by serializing and re-parsing it.
 *
 * `JSON.stringify` yields `undefined` (not a string) for `undefined`,
 * functions and symbols; feeding that to `JSON.parse` throws a SyntaxError,
 * so such inputs are answered with `undefined` instead.
 *
 * @param {*} value
 * @returns {*} The copy, or undefined when `value` has no JSON representation.
 */
function cloneJsonRecord(value) {
    const serialized = JSON.stringify(value);
    return serialized === undefined ? undefined : JSON.parse(serialized);
}
|
|
344
|
+
/**
 * Reads the value addressed by a JSON pointer.
 *
 * @param {*} root - Value to start from.
 * @param {string} pointer - Pointer, split with `jsonPointerSegments`.
 * @returns {*} The addressed value; undefined as soon as a segment has to be
 *   looked up on something that is falsy or not an object. An empty pointer
 *   yields `root` itself.
 */
function jsonPointerValue(root, pointer) {
    return jsonPointerSegments(pointer).reduce(
        // Once the walk leaves object territory it stays undefined for the
        // remaining segments.
        (node, key) => (node && typeof node === "object" ? node[key] : undefined),
        root,
    );
}
|
|
354
|
+
/**
 * Writes `value` at the location addressed by a JSON pointer.
 *
 * Nothing is written when an intermediate segment does not lead to a plain
 * (non-array) object, or when the final segment is missing or empty.
 * Missing intermediate containers are not created.
 *
 * @param {object} root - Object to modify in place.
 * @param {string} pointer - Pointer, split with `jsonPointerSegments`.
 * @param {*} value
 * @returns {void}
 */
function setJsonPointerValue(root, pointer, value) {
    const segments = jsonPointerSegments(pointer);
    const leafKey = segments.at(-1);
    let container = root;
    for (let depth = 0; depth < segments.length - 1; depth += 1) {
        const child = container[segments[depth]];
        const isPlainObject = Boolean(child) && typeof child === "object" && !Array.isArray(child);
        if (!isPlainObject) {
            return;
        }
        container = child;
    }
    if (leafKey) {
        container[leafKey] = value;
    }
}
|
|
369
|
+
/**
 * Splits a JSON pointer into its unescaped reference tokens.
 *
 * One leading "/" is dropped, the rest is split on "/", and in every token
 * "~1" becomes "/" before "~0" becomes "~".
 *
 * @param {string} pointer
 * @returns {string[]} The tokens; an empty pointer yields [].
 */
function jsonPointerSegments(pointer) {
    if (pointer === "") {
        return [];
    }
    const tokens = [];
    for (const rawToken of pointer.replace(/^\//u, "").split("/")) {
        // "~1" must be decoded first so that "~01" ends up as "~1", not "/".
        tokens.push(rawToken.replace(/~1/gu, "/").replace(/~0/gu, "~"));
    }
    return tokens;
}
|
|
378
|
+
/**
 * Folds the jobs of one workbench run into subject records, per-subject
 * evaluation results and the selected subject.
 *
 * Each succeeded "improve" job whose output normalizes to a subject
 * revision produces one subject record, processed in ascending
 * `attemptIndex`. The "attempt" jobs that name the same subject id supply
 * its samples: succeeded jobs with a normalized output become regular
 * samples; failed jobs, and succeeded jobs without a usable output, are
 * turned into error samples.
 *
 * @param {object} args
 * @param {Array<object>} args.jobs - All jobs of the run.
 * @param {object} args.spec - Resolved spec (read for the summaries and the fallback fingerprint).
 * @param {string} args.runId
 * @param {string} args.startedAt - Used as `createdAt` of the produced records.
 * @param {*} args.benchmarkFingerprint
 * @param {*} [args.subjectFingerprint] - Overrides the fingerprint computed from the revision files.
 * @param {number} args.existingSubjectCount - Ordinal offset for new subjects.
 * @param {Array<object>} [args.subjectSourceFiles]
 * @param {Array<object>} [args.benchmarkSourceFiles]
 * @param {object|null} [args.previousSubject]
 * @returns {{subjects: Array, subjectFiles: object, evaluations: Array,
 *   activeSubjectId: (string|null), selectedSubject: *,
 *   completedJobCount: number, failedJobCount: number}}
 */
export function materializeWorkbenchRunResult(args) {
    const completed = args.jobs.filter((job) => job.status === "succeeded");
    const failedJobCount = args.jobs.filter((job) => job.status === "failed").length;
    // `completed` already holds exactly the succeeded jobs.
    const completedJobCount = completed.length;
    const subjectRevisions = completed
        .filter((job) => workbenchExecutionPurpose(job) === "improve")
        .map((job) => normalizeSubjectRevisionJobOutput(job.output))
        .filter((output) => output !== null)
        .sort((left, right) => left.attemptIndex - right.attemptIndex);
    const evaluationJobs = args.jobs.filter((job) => workbenchExecutionPurpose(job) === "attempt");
    // Group attempt jobs (any status) by the subject they evaluated. The
    // subject id is taken from the job output, then its input, then the job.
    const evaluationsBySubject = new Map();
    for (const job of evaluationJobs) {
        const subjectId = readJobString(job.output, "subjectId") ??
            readJobString(job.input, "subjectId") ??
            job.subjectId;
        if (!subjectId) {
            continue;
        }
        const grouped = evaluationsBySubject.get(subjectId);
        if (grouped) {
            // Append in place rather than copying the group on every insert.
            grouped.push(job);
        }
        else {
            evaluationsBySubject.set(subjectId, [job]);
        }
    }
    const subjects = [];
    const subjectFiles = {};
    const evaluations = [];
    for (const subjectRevision of subjectRevisions) {
        const subjectId = subjectRevision.subjectId;
        const attemptIndex = subjectRevision.attemptIndex;
        const subjectJobs = evaluationsBySubject.get(subjectId) ?? [];
        const succeededEvaluationJobs = subjectJobs.filter((job) => job.status === "succeeded");
        const outputs = normalizeEvaluationSampleOutputs({
            jobs: succeededEvaluationJobs,
            allJobs: completed,
        })
            .sort((left, right) => compareSampleOutputs(left.output, right.output));
        const outputJobIds = new Set(outputs.flatMap(({ jobs }) => jobs.map((job) => job.id)));
        const completedSampleKeys = new Set(outputs
            .map(({ output }) => evaluationSampleGroupKeyFromOutput(output))
            .filter((key) => key !== null));
        // Failed jobs plus succeeded jobs that contributed to no output.
        const errorSampleJobs = [
            ...subjectJobs.filter((job) => job.status === "failed"),
            ...succeededEvaluationJobs.filter((job) => !outputJobIds.has(job.id)),
        ];
        const errorSamples = errorEvaluationSamplesFromJobs(errorSampleJobs, subjectId, attemptIndex, completedSampleKeys);
        const samples = [
            ...outputs.map(({ jobs, output }) => withJobUsage(output.sample, completed, jobs[0])),
            ...errorSamples,
        ].sort((left, right) => left.index - right.index || left.id.localeCompare(right.id));
        const evalRecord = createEvaluationRecord(subjectId, samples);
        const usage = mergeUsageSummaries([
            subjectRevision.usage,
            ...samples.map((sample) => sample.usage),
        ]);
        // Reuse the record built above instead of rebuilding an identical one
        // from the same subject id and samples.
        const metrics = evaluationMeanMetrics(evalRecord);
        const evaluationTraces = [
            ...outputs.flatMap(({ output }) => output.traces),
            ...errorSampleJobs.flatMap(jobTracePaths),
        ].sort();
        // A revision that names itself as its base is treated as having no base.
        const baseId = subjectRevision.baseId && subjectRevision.baseId !== subjectId
            ? subjectRevision.baseId
            : null;
        // Computed per subject so every record owns its metadata objects.
        const sourceMeta = subjectSourceMetadata(args.subjectSourceFiles);
        const benchmarkMeta = benchmarkSourceMetadata(args.benchmarkSourceFiles);
        const meta = {
            attemptIndex,
            sampleCount: evalRecord.sampleCount,
            optimizer: formatOptimizerSummary(args.spec),
            engineRun: formatEngineRunSummary(args.spec),
            strategy: "greedy",
            traces: {
                improve: subjectRevision.traces,
                evaluations: evaluationTraces,
            },
        };
        if (sourceMeta) {
            meta.source = sourceMeta;
        }
        if (benchmarkMeta) {
            meta.benchmark = benchmarkMeta;
        }
        const record = {
            id: subjectId,
            ordinal: args.existingSubjectCount + subjects.length,
            benchmarkFingerprint: args.benchmarkFingerprint,
            subjectFingerprint: args.subjectFingerprint ?? materializedSubjectFingerprint(args.spec, subjectRevision.files),
            createdAt: args.startedAt,
            ...(baseId ? { baseId } : {}),
            referenceIds: [],
            status: evalRecord.completedSampleCount > 0 ? "evaluated" : "eval_error",
            fileChanges: subjectRevision.fileChanges,
            ...(metrics ? { metrics } : {}),
            ...(usage ? { usage } : {}),
            eval: evalRecord,
            ...(subjectRevision.prompt ? { prompt: subjectRevision.prompt } : {}),
            meta,
        };
        subjects.push(record);
        evaluations.push(createEvaluationResultRecord({
            runId: args.runId,
            benchmarkFingerprint: args.benchmarkFingerprint,
            createdAt: args.startedAt,
            subject: record,
            evaluation: evalRecord,
        }));
        subjectFiles[subjectId] = materializedSubjectFiles({
            subjectRevisionFiles: subjectRevision.files,
        });
    }
    const selectedSubject = selectSubject({
        subjects,
        previousSubject: args.previousSubject ?? null,
    });
    return {
        subjects,
        subjectFiles,
        evaluations,
        // Fall back to the previous subject when nothing was selected.
        activeSubjectId: selectedSubject?.id ?? args.previousSubject?.id ?? null,
        selectedSubject,
        completedJobCount,
        failedJobCount,
    };
}
|
|
500
|
+
/**
 * Collects the subject manifest files (`subjects/<name>/subject.yaml` or
 * `.yml`, matched case-insensitively) as source metadata, ordered by path.
 *
 * @param {Array<object>} [files]
 * @returns {{files: Array<object>}|null} null when no manifest file is present.
 *   `encoding` defaults to "utf8" and `executable` to false.
 */
function subjectSourceMetadata(files) {
    const manifestPattern = /^subjects\/[^/]+\/subject\.ya?ml$/iu;
    const manifests = [];
    for (const file of files ?? []) {
        if (!manifestPattern.test(file.path)) {
            continue;
        }
        manifests.push({
            path: file.path,
            kind: file.kind,
            encoding: file.encoding ?? "utf8",
            content: file.content,
            executable: file.executable ?? false,
        });
    }
    if (manifests.length === 0) {
        return null;
    }
    manifests.sort((left, right) => left.path.localeCompare(right.path));
    return { files: manifests };
}
|
|
513
|
+
/**
 * Collects the files whose path equals BENCHMARK_SPEC_FILE as source
 * metadata, ordered by path.
 *
 * @param {Array<object>} [files]
 * @returns {{files: Array<object>}|null} null when no such file is present.
 *   `encoding` defaults to "utf8" and `executable` to false.
 */
function benchmarkSourceMetadata(files) {
    const specFiles = [];
    for (const file of files ?? []) {
        if (file.path !== BENCHMARK_SPEC_FILE) {
            continue;
        }
        specFiles.push({
            path: file.path,
            kind: file.kind,
            encoding: file.encoding ?? "utf8",
            content: file.content,
            executable: file.executable ?? false,
        });
    }
    if (specFiles.length === 0) {
        return null;
    }
    specFiles.sort((left, right) => left.path.localeCompare(right.path));
    return { files: specFiles };
}
|
|
526
|
+
/**
 * Computes the SHA-256 fingerprint (hex) of a materialized subject.
 *
 * The digest covers two fixed header strings, the JSON form of `spec.run`,
 * and then, for every file kept by `filterSubjectSourceFiles` in path
 * order: path, encoding (default "utf8"), content and an executable flag
 * ("1"/"0"), each introduced or separated by NUL bytes.
 *
 * NOTE(review): files are ordered with `localeCompare` and no explicit
 * locale, so the order — and with it the digest — presumably follows the
 * host's default collation; confirm fingerprints are stable across hosts.
 *
 * @param {object} spec
 * @param {Array<object>} files
 * @returns {string} Hex digest.
 */
function materializedSubjectFingerprint(spec, files) {
    const digest = createHash("sha256");
    digest.update("workbench-subject-v1\0");
    digest.update("materialized\0runner\0");
    digest.update(JSON.stringify(spec.run));
    const orderedFiles = filterSubjectSourceFiles(files)
        .slice()
        .sort((left, right) => left.path.localeCompare(right.path));
    for (const file of orderedFiles) {
        const parts = [
            "\0file\0",
            file.path,
            "\0",
            file.encoding ?? "utf8",
            "\0",
            file.content,
            "\0",
            file.executable ? "1" : "0",
        ];
        // Parts are fed one by one (not concatenated) so each value reaches
        // the hash exactly as it is.
        for (const part of parts) {
            digest.update(part);
        }
    }
    return digest.digest("hex");
}
|
|
543
|
+
/**
 * Returns shallow copies of the subject source files of a revision, one per
 * path (the last file seen for a path wins), ordered by path.
 *
 * @param {{subjectRevisionFiles: Array<object>}} args
 * @returns {Array<object>}
 */
function materializedSubjectFiles(args) {
    const latestByPath = new Map();
    for (const sourceFile of filterSubjectSourceFiles(args.subjectRevisionFiles)) {
        latestByPath.set(sourceFile.path, { ...sourceFile });
    }
    const uniqueFiles = Array.from(latestByPath.values());
    uniqueFiles.sort((left, right) => left.path.localeCompare(right.path));
    return uniqueFiles;
}
|
|
550
|
+
/**
 * Builds the stored record for one subject's evaluation within a run.
 *
 * Optional summary fields (`metrics`, `durationMs`, `usage`, `error`) are only
 * copied to the top level when present; the full `evaluation` is always embedded.
 *
 * @param {object} args
 * @param {string} args.runId
 * @param {{ id: string, subjectFingerprint: string }} args.subject
 * @param {string} args.benchmarkFingerprint
 * @param {string} args.createdAt
 * @param {object} args.evaluation
 * @returns {object} The evaluation result record.
 */
function createEvaluationResultRecord(args) {
    const { evaluation, subject } = args;
    // A duration of exactly 0 ms is a real measurement and must not be dropped
    // by a plain truthiness check.
    const hasDuration = evaluation.durationMs === 0 || Boolean(evaluation.durationMs);
    return {
        id: evaluationResultId(args.runId, subject.id),
        runId: args.runId,
        benchmarkFingerprint: args.benchmarkFingerprint,
        subjectFingerprint: subject.subjectFingerprint,
        subjectId: subject.id,
        createdAt: args.createdAt,
        updatedAt: evaluation.finishedAt ?? args.createdAt,
        status: evaluation.status,
        sampleCount: evaluation.sampleCount,
        completedSampleCount: evaluation.completedSampleCount,
        errorSampleCount: evaluation.errorSampleCount,
        ...(evaluation.metrics ? { metrics: evaluation.metrics } : {}),
        ...(hasDuration ? { durationMs: evaluation.durationMs } : {}),
        ...(evaluation.usage ? { usage: evaluation.usage } : {}),
        ...(evaluation.error ? { error: evaluation.error } : {}),
        evaluation,
    };
}
|
|
571
|
+
/**
 * Derives an evaluation result id of the form `eval_<run>_<subject>`.
 *
 * Each part has runs of non-alphanumeric characters collapsed to `_`, leading
 * and trailing underscores removed, and is truncated to its last 24 characters.
 *
 * @param {string} runId
 * @param {string} subjectId
 * @returns {string}
 */
function evaluationResultId(runId, subjectId) {
    const toIdPart = (raw) => {
        const collapsed = raw.replace(/[^a-z0-9]+/giu, "_");
        const trimmed = collapsed.replace(/^_+|_+$/gu, "");
        return trimmed.slice(-24);
    };
    return ["eval", toIdPart(runId), toIdPart(subjectId)].join("_");
}
|
|
576
|
+
/**
 * Returns the entries of `args.files` whose path is not flagged by
 * isWorkbenchInternalOutputPath.
 *
 * @param {{ files: Array<{ path: string }> }} args
 * @returns {Array<object>} The same file objects (not copies), internal paths removed.
 */
export function selectExecutionOutputFilesForInspection(args) {
    const isInspectable = ({ path }) => !isWorkbenchInternalOutputPath(path);
    return args.files.filter(isInspectable);
}
|
|
579
|
+
/**
 * Reports whether a path (after normalizeRelativePath) is one of the internal
 * output locations matched here: the `.workbench` tree, a fixed set of result
 * and sandbox file names, or `<prefix>_stdout.log` / `<prefix>_stderr.log` /
 * `<prefix>_exit_code` where the prefix is lowercase letters and dashes.
 *
 * @param {string} filePath
 * @returns {boolean}
 */
export function isWorkbenchInternalOutputPath(filePath) {
    const normalized = normalizeRelativePath(filePath);
    if (normalized === ".workbench" || normalized.startsWith(".workbench/")) {
        return true;
    }
    const reservedNames = [
        "workbench-result.json",
        "sandbox-environment.json",
        "sandbox_error.log",
        "exit_code",
    ];
    if (reservedNames.includes(normalized)) {
        return true;
    }
    return /^[a-z-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized);
}
|
|
589
|
+
/**
 * Assembles the trace input file set for a run.
 *
 * For every terminal execution job of `args.runId` (ordered by
 * compareTraceInputJobs) this emits:
 *   - copies of the job's output files under `.workbench/traces/`,
 *   - `events/<jobId>.ndjson` with the job's events sorted by `at` (only when
 *     the job has events),
 *   - `jobs/<jobId>.json` with the job summary.
 * A final `manifest.json` lists every job summary with its `summary_path`.
 *
 * @param {{ runId: string, jobs: Array<object>, events: Array<object> }} args
 * @returns {Array<object>} Files de-duplicated and sorted by dedupeSurfaceFiles.
 */
export function createSubjectRevisionTraceInputFiles(args) {
    // Index the run's events by job once instead of re-scanning the whole
    // event list for every job.
    const eventsByJobId = new Map();
    for (const event of args.events) {
        if (event.runId !== args.runId) {
            continue;
        }
        const bucket = eventsByJobId.get(event.jobId);
        if (bucket) {
            bucket.push(event);
        }
        else {
            eventsByJobId.set(event.jobId, [event]);
        }
    }
    const jobs = args.jobs
        .filter((job) => job.runId === args.runId && isTerminalExecutionJob(job))
        .sort(compareTraceInputJobs);
    const files = [];
    const manifestJobs = [];
    for (const job of jobs) {
        const rawTraceFiles = completedJobOutputFiles(job)
            .filter((file) => normalizeRelativePath(file.path).startsWith(".workbench/traces/"));
        // Push one by one: spreading a very large array into push() can exceed
        // the engine's argument limit.
        for (const file of rawTraceFiles) {
            files.push({ ...file });
        }
        const events = [...(eventsByJobId.get(job.id) ?? [])]
            .sort((left, right) => left.at.localeCompare(right.at));
        const eventPath = `events/${job.id}.ndjson`;
        if (events.length > 0) {
            files.push(textSurfaceFile(eventPath, `${events.map((event) => JSON.stringify(event)).join("\n")}\n`));
        }
        const summaryPath = `jobs/${job.id}.json`;
        const summary = subjectRevisionTraceJobSummary(job, {
            eventPath: events.length > 0 ? eventPath : null,
            rawTracePaths: rawTraceFiles.map((file) => file.path).sort(),
        });
        files.push(textSurfaceFile(summaryPath, `${JSON.stringify(summary, null, 2)}\n`));
        manifestJobs.push({
            ...summary,
            summary_path: summaryPath,
        });
    }
    files.push(textSurfaceFile("manifest.json", `${JSON.stringify({
        run_id: args.runId,
        jobs: manifestJobs,
    }, null, 2)}\n`));
    return dedupeSurfaceFiles(files);
}
|
|
623
|
+
/**
 * True for jobs of kind "execute" whose status is "succeeded", "failed" or
 * "cancelled".
 *
 * @param {{ kind: string, status: string }} job
 * @returns {boolean}
 */
function isTerminalExecutionJob(job) {
    if (job.kind !== "execute") {
        return false;
    }
    switch (job.status) {
        case "succeeded":
        case "failed":
        case "cancelled":
            return true;
        default:
            return false;
    }
}
|
|
628
|
+
/**
 * Sort comparator for trace input jobs. Keys, in priority order: attempt index
 * (missing counts as -1), purpose sort key, sample index (missing counts as
 * -1), case id (missing counts as ""), then job id.
 *
 * @param {object} left
 * @param {object} right
 * @returns {number}
 */
function compareTraceInputJobs(left, right) {
    const numberOrMinusOne = (job, key) => readOptionalJobNumber(job.input, key) ?? -1;
    // Attempt indices are read for both sides up front, as before.
    const leftAttempt = numberOrMinusOne(left, "attemptIndex");
    const rightAttempt = numberOrMinusOne(right, "attemptIndex");
    const byAttempt = leftAttempt - rightAttempt;
    if (byAttempt) {
        return byAttempt;
    }
    const byPurpose = purposeSortKey(workbenchExecutionPurpose(left)) - purposeSortKey(workbenchExecutionPurpose(right));
    if (byPurpose) {
        return byPurpose;
    }
    const bySample = numberOrMinusOne(left, "sampleIndex") - numberOrMinusOne(right, "sampleIndex");
    if (bySample) {
        return bySample;
    }
    const byCase = (readJobString(left.input, "caseId") ?? "").localeCompare(readJobString(right.input, "caseId") ?? "");
    if (byCase) {
        return byCase;
    }
    return left.id.localeCompare(right.id);
}
|
|
637
|
+
/**
 * Maps an execution purpose to its sort rank: "improve" is 0, "attempt" is 1,
 * anything else is 3.
 *
 * @param {string|undefined|null} purpose
 * @returns {number}
 */
function purposeSortKey(purpose) {
    switch (purpose) {
        case "improve":
            return 0;
        case "attempt":
            return 1;
        default:
            return 3;
    }
}
|
|
646
|
+
/**
 * Extracts shallow copies of the entries in `job.output.files` that pass
 * isSurfaceSnapshotFile. Returns an empty array when `files` is not an array.
 *
 * @param {{ output: unknown }} job
 * @returns {Array<object>}
 */
function completedJobOutputFiles(job) {
    const { files: candidates } = jsonRecord(job.output);
    const snapshots = [];
    if (Array.isArray(candidates)) {
        for (const candidate of candidates) {
            if (!isSurfaceSnapshotFile(candidate)) {
                continue;
            }
            snapshots.push({ ...candidate });
        }
    }
    return snapshots;
}
|
|
659
|
+
/**
 * Builds the JSON summary written for one job in the trace input set.
 *
 * `started_at`, `finished_at` and `error` are only present when the job has a
 * truthy value for them. `output` is the job output reduced by
 * summarizeJobOutputForTrace.
 *
 * @param {object} job
 * @param {{ eventPath: string|null, rawTracePaths: string[] }} paths
 * @returns {object}
 */
function subjectRevisionTraceJobSummary(job, paths) {
    const output = jsonRecord(job.output);
    const { input } = job;
    const summary = {
        job_id: job.id,
        purpose: workbenchExecutionPurpose(job) ?? "unknown",
        status: job.status,
        subject_id: job.subjectId ?? readJobString(input, "subjectId"),
        attempt_index: readOptionalJobNumber(input, "attemptIndex"),
        sample_index: readOptionalJobNumber(input, "sampleIndex"),
        case_id: readJobString(input, "caseId"),
        created_at: job.createdAt,
    };
    // Properties are appended in the same order the summary is serialized in.
    if (job.startedAt) {
        summary.started_at = job.startedAt;
    }
    if (job.finishedAt) {
        summary.finished_at = job.finishedAt;
    }
    if (job.error) {
        summary.error = job.error;
    }
    summary.traces = jobTracePaths(job);
    summary.event_path = paths.eventPath;
    summary.raw_trace_paths = Array.from(paths.rawTracePaths);
    summary.output = summarizeJobOutputForTrace(output);
    return summary;
}
|
|
679
|
+
/**
 * Produces a copy of a job output record without its `files` and `fileSet`
 * entries. `subjectPatch` is kept only when it has at least one key, and then
 * without its own `files` entry.
 *
 * @param {object} output
 * @returns {object}
 */
function summarizeJobOutputForTrace(output) {
    const summary = { ...output };
    delete summary.files;
    delete summary.fileSet;
    delete summary.subjectPatch;
    const patch = jsonRecord(output.subjectPatch);
    if (Object.keys(patch).length === 0) {
        return summary;
    }
    const patchSummary = { ...patch };
    delete patchSummary.files;
    summary.subjectPatch = patchSummary;
    return summary;
}
|
|
690
|
+
/**
 * Creates a non-executable UTF-8 text file descriptor.
 *
 * @param {string} path
 * @param {string} content
 * @returns {{ path: string, kind: "text", encoding: "utf8", content: string, executable: false }}
 */
function textSurfaceFile(path, content) {
    const descriptor = { path, kind: "text", encoding: "utf8", content, executable: false };
    return descriptor;
}
|
|
699
|
+
/**
 * Normalizes every file path with normalizeRelativePath, keeps the last file
 * seen for each normalized path, and returns the copies sorted by path.
 *
 * @param {Array<object>} files
 * @returns {Array<object>}
 */
function dedupeSurfaceFiles(files) {
    const latestByPath = new Map();
    for (const entry of files) {
        const normalizedPath = normalizeRelativePath(entry.path);
        latestByPath.set(normalizedPath, { ...entry, path: normalizedPath });
    }
    return Array.from(latestByPath.values()).sort((a, b) => a.path.localeCompare(b.path));
}
|
|
709
|
+
/**
 * Collects all project source files into one de-duplicated, path-sorted list.
 *
 * Order of collection (later entries win on path collisions): spec files (or a
 * synthesized `benchmark.yaml` from `input.specSource`), subject files and
 * engine resolve files (each placed under their root path), adapter files,
 * dockerfiles, and finally the single `input.dockerfile` when both its path
 * and content are given.
 *
 * @param {object} input
 * @returns {Array<object>}
 */
export function buildWorkbenchProjectSourceFiles(input) {
    const copyOf = (file) => ({ ...file });
    const collected = input.specFiles
        ? input.specFiles.map(copyOf)
        : [textSurfaceFile("benchmark.yaml", input.specSource ?? "")];
    collected.push(...prefixProjectSourceFiles(input.subjectFiles, input.subjectFilesPath));
    collected.push(...prefixProjectSourceFiles(input.engineResolveFiles, input.engineResolveFilesPath));
    collected.push(...(input.adapterFiles ?? []).map(copyOf));
    collected.push(...(input.dockerfiles ?? []).map(copyOf));
    // `!= null` covers both null and undefined dockerfile content.
    if (input.dockerfilePath && input.dockerfile != null) {
        collected.push(textSurfaceFile(input.dockerfilePath, input.dockerfile));
    }
    return dedupeSurfaceFiles(collected);
}
|
|
724
|
+
/**
 * Returns `spec.environment.dockerfile` when it is a non-empty string,
 * otherwise the default "environment/Dockerfile".
 *
 * @param {{ environment: { dockerfile?: unknown } }} spec
 * @returns {string}
 */
export function readWorkbenchSpecDockerfilePath(spec) {
    const { dockerfile } = spec.environment;
    if (typeof dockerfile !== "string" || dockerfile.length === 0) {
        return "environment/Dockerfile";
    }
    return dockerfile;
}
|
|
730
|
+
/**
 * Returns copies of `files` with each path placed under `rootPath`. Paths that
 * already equal the normalized root or start with `<root>/` are kept as is
 * (after normalization).
 *
 * @param {Array<object>} files
 * @param {string} rootPath
 * @returns {Array<object>}
 */
function prefixProjectSourceFiles(files, rootPath) {
    const root = normalizeRelativePath(rootPath);
    const rootPrefix = `${root}/`;
    return files.map((entry) => {
        const entryPath = normalizeRelativePath(entry.path);
        const alreadyRooted = entryPath === root || entryPath.startsWith(rootPrefix);
        return {
            ...entry,
            path: alreadyRooted ? entryPath : `${rootPrefix}${entryPath}`,
        };
    });
}
|
|
742
|
+
/**
 * Reports whether a path counts as subject source: everything except the
 * `.workbench` entry, anything below `.workbench/`, and `workbench-result.json`
 * (compared after normalizeRelativePath).
 *
 * @param {string} filePath
 * @returns {boolean}
 */
export function isSubjectSourceFilePath(filePath) {
    const normalized = normalizeRelativePath(filePath);
    const isExcluded = normalized === ".workbench" ||
        normalized.startsWith(".workbench/") ||
        normalized === "workbench-result.json";
    return !isExcluded;
}
|
|
748
|
+
/**
 * Returns shallow copies of the files accepted by isSubjectSourceFilePath,
 * in their original order.
 *
 * @param {Array<object>} files
 * @returns {Array<object>}
 */
export function filterSubjectSourceFiles(files) {
    return files.flatMap((entry) => (isSubjectSourceFilePath(entry.path) ? [{ ...entry }] : []));
}
|
|
753
|
+
/**
 * Builds the lineage graph for a set of subject summaries.
 *
 * Summaries are ordered by `createdAt`, then `id`. Each becomes a node flagged
 * `active` when its id equals `args.activeId`; edges come from
 * buildLineageEdges and only reference summaries present in the input.
 *
 * @param {{ summaries: Array<object>, activeId: string }} args
 * @returns {{ activeId: string, nodes: Array<object>, edges: Array<object> }}
 */
export function buildSubjectLineage(args) {
    const { activeId } = args;
    const ordered = [...args.summaries].sort((a, b) => a.createdAt.localeCompare(b.createdAt) || a.id.localeCompare(b.id));
    const knownIds = new Set();
    const nodes = [];
    for (const summary of ordered) {
        knownIds.add(summary.id);
        nodes.push({ id: summary.id, active: activeId === summary.id, summary });
    }
    const edges = [];
    for (const summary of ordered) {
        edges.push(...buildLineageEdges(summary, knownIds));
    }
    return { activeId, nodes, edges };
}
|
|
769
|
+
/**
 * Converts loosely shaped file entries into full file descriptors.
 *
 * Paths go through normalizeRelativePath, content is coerced to a string
 * (nullish becomes ""), `kind` is "binary" only for base64 encoding, encoding
 * defaults to "utf8" and `executable` is true only when strictly `true`. The
 * last entry for a path wins; the result is sorted by path.
 *
 * @param {Array<object>} files
 * @returns {Array<object>}
 */
export function normalizeSurfaceFiles(files) {
    const latestByPath = new Map();
    for (const entry of files) {
        const path = normalizeRelativePath(entry.path);
        const content = String(entry.content ?? "");
        const kind = entry.encoding === "base64" ? "binary" : "text";
        const encoding = entry.encoding ?? "utf8";
        const executable = entry.executable === true;
        latestByPath.set(path, { path, kind, encoding, content, executable });
    }
    return Array.from(latestByPath.values()).sort((a, b) => a.path.localeCompare(b.path));
}
|
|
784
|
+
/**
 * Returns shallow copies of the files matching at least one include glob.
 * With no patterns (missing or empty `include`) every file is returned.
 *
 * @param {Array<object>} files
 * @param {string[]|undefined|null} include Glob patterns, normalized before compilation.
 * @returns {Array<object>}
 */
export function filterSurfaceFilesByInclude(files, include) {
    const copyOf = (entry) => ({ ...entry });
    if (include && include.length !== 0) {
        const matchers = include.map((pattern) => globPatternToRegExp(normalizeRelativePath(pattern)));
        const isIncluded = (entry) => {
            return matchers.some((matcher) => matcher.test(normalizeRelativePath(entry.path)));
        };
        return files.filter(isIncluded).map(copyOf);
    }
    return files.map(copyOf);
}
|
|
793
|
+
/**
 * Compiles a glob pattern into an anchored, unicode-mode RegExp.
 *
 * Supported tokens: a `**` followed by `/` matches zero or more leading directories,
 * a bare `**` matches anything, `*` matches within one path segment, `?`
 * matches one non-slash character. Everything else is matched literally.
 *
 * @param {string} pattern
 * @returns {RegExp}
 */
function globPatternToRegExp(pattern) {
    let source = "^";
    let cursor = 0;
    while (cursor < pattern.length) {
        if (pattern.startsWith("**/", cursor)) {
            source += "(?:.*/)?";
            cursor += 3;
        }
        else if (pattern.startsWith("**", cursor)) {
            source += ".*";
            cursor += 2;
        }
        else if (pattern[cursor] === "*") {
            source += "[^/]*";
            cursor += 1;
        }
        else if (pattern[cursor] === "?") {
            source += "[^/]";
            cursor += 1;
        }
        else {
            source += escapeRegExp(pattern[cursor]);
            cursor += 1;
        }
    }
    return new RegExp(`${source}$`, "u");
}
|
|
823
|
+
/**
 * Escapes regular-expression metacharacters in `value` by prefixing each with
 * a backslash, so the result matches `value` literally.
 *
 * @param {string} value
 * @returns {string}
 */
function escapeRegExp(value) {
    const withBackslash = (metaCharacter) => `\\${metaCharacter}`;
    return value.replace(/[\\^$.*+?()[\]{}|]/gu, withBackslash);
}
|
|
826
|
+
/**
 * Summarizes subject files for display, sorted by path.
 *
 * Files listed in `changedPaths` are reported as "added" with an addition
 * count equal to their number of text lines (at least 1); all others are
 * "unchanged" with zero additions. Deletions are always 0.
 *
 * A file without an explicit `encoding` is treated as UTF-8 text, matching the
 * `encoding ?? "utf8"` default used by the other file helpers in this module;
 * previously such files were counted as having no text at all.
 *
 * @param {Array<object>} files
 * @param {string[]} [changedPaths] Defaults to every file path.
 * @returns {Array<object>}
 */
export function summarizeSubjectFiles(files, changedPaths = files.map((file) => file.path)) {
    const changed = new Set(changedPaths);
    return [...files]
        .sort((left, right) => left.path.localeCompare(right.path))
        .map((file) => {
            const previewKind = resolvePreviewKind(file.path);
            const text = (file.encoding ?? "utf8") === "utf8" ? file.content : "";
            const lines = text.length === 0 ? [] : text.split(/\r?\n/u);
            const isChanged = changed.has(file.path);
            return {
                path: file.path,
                old_path: null,
                status: isChanged ? "added" : "unchanged",
                mime_type: detectMimeType(file.path),
                preview_kind: previewKind,
                additions: isChanged ? Math.max(lines.length, 1) : 0,
                deletions: 0,
            };
        });
}
|
|
845
|
+
/**
 * Builds a source preview for one file.
 *
 * @param {{ view: string, path: string, files: Array<object> }} args
 * @returns {object} Preview with `source` set and `diff` / `rendered_html` null.
 * @throws {Error} When `args.view` is "diff", or when no file has the normalized path.
 */
export function createSubjectFilePreview(args) {
    const { view } = args;
    if (view === "diff") {
        throw new Error("Diff previews require explicit before and after file content.");
    }
    const wantedPath = normalizeRelativePath(args.path);
    const match = args.files.find(({ path }) => path === wantedPath);
    if (!match) {
        throw new Error(`File ${args.path} was not found.`);
    }
    const { content, encoding } = match;
    return {
        path: match.path,
        view,
        mime_type: detectMimeType(match.path),
        preview_kind: resolvePreviewKind(match.path),
        diff: null,
        source: { content, encoding },
        rendered_html: null,
    };
}
|
|
868
|
+
/**
 * Builds the review payload for one case of a subject's evaluation.
 *
 * An id matches the case when it equals `args.caseId` or starts with
 * `<caseId>__`. A sample matches when its own id or any of its case ids
 * matches. If all phases carrying a numeric sample index agree on one index,
 * a matching sample with that index is preferred; otherwise the first matching
 * sample is used.
 *
 * When no sample matches but phases were supplied, a placeholder review with
 * empty metrics and criteria is returned.
 *
 * @param {{ caseId: string, subject: object, phases?: Array<object> }} args
 * @returns {object}
 * @throws {Error} When no sample matches and no phases were supplied.
 */
export function createCaseReview(args) {
    const { caseId, subject } = args;
    const phases = args.phases ?? [];
    const preferredSampleIndex = uniquePhaseSampleIndex(phases);
    const hasPreferredIndex = typeof preferredSampleIndex === "number";
    const idMatchesCase = (id) => id === caseId || id.startsWith(`${caseId}__`);
    const sampleMatchesCase = (sample) => idMatchesCase(sample.id) ||
        (sample.cases ?? []).some((entry) => idMatchesCase(entry.id));
    const samples = subject.eval?.samples ?? [];
    const preferredSample = hasPreferredIndex
        ? samples.find((sample) => sample.index === preferredSampleIndex && sampleMatchesCase(sample))
        : undefined;
    const sampleResult = preferredSample ?? samples.find(sampleMatchesCase);
    if (!sampleResult) {
        if (phases.length > 0) {
            // Phases exist without a scored sample: return a placeholder review.
            return {
                subjectId: subject.id,
                caseId,
                caseLabel: caseId,
                ...(hasPreferredIndex ? { sampleIndex: preferredSampleIndex } : {}),
                metrics: {},
                phases,
                criteria_results: [],
            };
        }
        throw new Error(`Case ${caseId} was not found on subject ${subject.id}.`);
    }
    const caseResult = sampleResult.cases?.find((entry) => idMatchesCase(entry.id));
    // Prefer the case's own duration; fall back to the sample's duration only
    // when the sample has exactly one case or no case entry matched.
    let durationMs;
    if (typeof caseResult?.durationMs === "number") {
        durationMs = caseResult.durationMs;
    }
    else if (typeof sampleResult.durationMs === "number" &&
        (sampleResult.cases?.length === 1 || !caseResult)) {
        durationMs = sampleResult.durationMs;
    }
    const sampleStatus = sampleResult.status === "planned" ? undefined : sampleResult.status;
    const status = caseResult?.status ?? sampleStatus;
    const feedback = caseResult?.feedback ?? sampleResult.feedback;
    const criteriaResults = (caseResult?.criteria ?? []).map((criterion) => ({
        criterion_id: criterion.criterion_id,
        pass: criterion.pass,
        score: criterion.score,
        errors: criterion.errors ?? [],
        ...(criterion.rationale ? { rationale: criterion.rationale } : {}),
    }));
    return {
        subjectId: subject.id,
        caseId: caseResult?.id ?? sampleResult.id,
        caseLabel: caseResult?.label ?? caseId,
        sampleId: sampleResult.id,
        sampleIndex: sampleResult.index,
        ...(status ? { status } : {}),
        metrics: caseResult?.metrics ?? sampleResult.metrics ?? {},
        ...(typeof durationMs === "number" ? { durationMs } : {}),
        ...(caseResult?.source ? { source: caseResult.source } : {}),
        ...(feedback !== undefined ? { feedback } : {}),
        phases,
        criteria_results: criteriaResults,
    };
}
|
|
927
|
+
/**
 * Returns the sample index shared by all phases that carry a numeric
 * `sampleIndex`, or null when there is none or more than one distinct value.
 *
 * @param {Array<{ sampleIndex?: unknown }>} phases
 * @returns {number|null}
 */
function uniquePhaseSampleIndex(phases) {
    const distinct = new Set();
    for (const { sampleIndex } of phases) {
        if (typeof sampleIndex === "number") {
            distinct.add(sampleIndex);
        }
    }
    if (distinct.size !== 1) {
        return null;
    }
    return distinct.values().next().value;
}
|
|
937
|
+
/**
 * Parses a source YAML into the authored (version 3) spec shape.
 *
 * @param {string} source
 * @returns {object|null} The authored spec, or null when validation of the source fails.
 */
function parseAuthoredWorkbenchSourceSpec(source) {
    if (!validateWorkbenchResolvedSourceYamlInternal(source).ok) {
        return null;
    }
    const resolved = resolveWorkbenchResolvedSourceYamlInternal(source);
    const authored = {
        version: 3,
        benchmark: {
            name: resolved.benchmark.name,
            description: resolved.benchmark.description,
            engine: authoredAdapterSpecFromInvocation(resolved.engine),
        },
        subject: {
            name: resolved.subject.name,
            description: resolved.subject.description,
            files: { path: resolved.subject.files.path },
            run: runSpecFromInvocation(resolved.run),
        },
    };
    const { optimizer } = resolved;
    if (optimizer) {
        // The optimizer section is only emitted when the resolved spec has one.
        authored.optimizer = {
            name: optimizer.name,
            ...(optimizer.description ? { description: optimizer.description } : {}),
            edits: [...optimizer.edits],
            improve: improveSpecFromInvocation(resolved.improve),
        };
    }
    return authored;
}
|
|
968
|
+
/**
 * Converts the resolved `improve` invocation into its authored adapter spec.
 * Delegates to authoredAdapterSpecFromInvocation.
 *
 * @param {object} invocation
 * @returns {object}
 */
function improveSpecFromInvocation(invocation) {
    const improveSpec = authoredAdapterSpecFromInvocation(invocation);
    return improveSpec;
}
|
|
971
|
+
/**
 * Converts the resolved `run` invocation into its authored adapter spec.
 * Delegates to authoredAdapterSpecFromInvocation.
 *
 * @param {object} invocation
 * @returns {object}
 */
function runSpecFromInvocation(invocation) {
    const runSpec = authoredAdapterSpecFromInvocation(invocation);
    return runSpec;
}
|
|
974
|
+
/**
 * Builds the authored adapter spec for an invocation: always `use`, plus
 * `auth` when it is defined and `with` when the config record has keys.
 *
 * @param {{ use: string, auth?: unknown, with?: unknown }} invocation
 * @returns {object}
 */
function authoredAdapterSpecFromInvocation(invocation) {
    const config = jsonRecord(invocation.with);
    const spec = { use: invocation.use };
    if (invocation.auth !== undefined) {
        spec.auth = invocation.auth;
    }
    if (Object.keys(config).length > 0) {
        spec.with = config;
    }
    return spec;
}
|
|
982
|
+
/**
 * Groups input files by their top-level directory and summarizes each group
 * as a case.
 *
 * Files whose normalized path has no directory component (no `/`, or a
 * leading `/`) are ignored. Cases are sorted by directory name using the
 * default string sort and numbered `case-001`, `case-002`, ... in that order.
 *
 * File counts are accumulated in a single pass over `files` rather than
 * re-scanning the list for every case.
 *
 * @param {Array<{ path: string }>} files
 * @returns {Array<{ id: string, slug: string, path: string, name: string, fileCount: number }>}
 */
function summarizeCaseInputs(files) {
    const fileCountByCase = new Map();
    for (const file of files) {
        const normalized = normalizeRelativePath(file.path);
        const slash = normalized.indexOf("/");
        if (slash <= 0) {
            continue;
        }
        const caseDir = normalized.slice(0, slash);
        fileCountByCase.set(caseDir, (fileCountByCase.get(caseDir) ?? 0) + 1);
    }
    return [...fileCountByCase.keys()].sort().map((caseDir, index) => ({
        id: `case-${String(index + 1).padStart(3, "0")}`,
        slug: caseDir.replace(/\W+/gu, "-"),
        path: caseDir,
        name: caseDir,
        fileCount: fileCountByCase.get(caseDir),
    }));
}
|
|
1009
|
+
/**
 * Returns the lineage edges originating from a summary's base.
 *
 * Produces a single "anchor" edge from `summary.baseId` to `summary.id` when
 * the base is set, differs from the summary itself, and is a known id;
 * otherwise an empty array.
 *
 * @param {{ id: string, baseId?: string }} summary
 * @param {Set<string>} summaryIds
 * @returns {Array<{ id: string, kind: "anchor", sourceId: string, targetId: string }>}
 */
function buildLineageEdges(summary, summaryIds) {
    const { id, baseId } = summary;
    const hasKnownBase = Boolean(baseId) && baseId !== id && summaryIds.has(baseId);
    if (!hasKnownBase) {
        return [];
    }
    return [
        {
            id: `anchor:${baseId}:${id}`,
            kind: "anchor",
            sourceId: baseId,
            targetId: id,
        },
    ];
}
|
|
1021
|
+
/**
 * Derives the workload description for an execution job.
 *
 * For "improve" jobs the sample index is fixed to 0, the case id to "current",
 * and no engine case is selected. For other purposes, sample index and case id
 * are required job inputs and the engine case is looked up from
 * `args.engineCases`.
 *
 * @param {object} args Job, spec, base files, engine cases and optional trace files.
 * @returns {object} The workload passed on to the runtime.
 * @throws {Error} When the job has no execution purpose or no subject id; the
 *   required-input readers are also called here and are presumably the source
 *   of errors for missing inputs (their behavior is defined elsewhere).
 */
export function createWorkbenchRunWorkload(args) {
    const { job, spec } = args;
    const purpose = workbenchExecutionPurpose(job);
    if (!purpose) {
        throw new Error(`Unsupported runtime job kind: ${job.kind}`);
    }
    const subjectId = readJobString(job.input, "subjectId") ?? job.subjectId;
    if (!subjectId) {
        throw new Error(`${purpose} execution job is missing subjectId.`);
    }
    const attemptIndex = readRequiredJobNumber(job.input, "attemptIndex", `${purpose} execution job`);
    let sampleIndex = 0;
    let caseId = "current";
    let engineCase;
    if (purpose !== "improve") {
        sampleIndex = readRequiredJobNumber(job.input, "sampleIndex", `${purpose} execution job`);
        caseId = readRequiredJobString(job.input, "caseId", `${purpose} execution job`);
        engineCase = engineCaseForCase(args.engineCases, caseId);
    }
    const engineResolveFiles = engineCase
        ? engineCaseFilesForRuntimeInput({ spec, engineCase })
        : [];
    const engineCaseSpec = engineCase?.case;
    const initial = createInitialSubjectFiles({
        baseFiles: args.baseFiles,
        spec,
        attemptIndex,
    });
    const traceFiles = (args.traceFiles ?? []).map((file) => ({ ...file }));
    return {
        job,
        spec,
        subjectId,
        attemptIndex,
        sampleIndex,
        subjectFiles: initial.files,
        caseId,
        engineResolveFiles,
        traceFiles,
        ...(engineCase ? { engineCase } : {}),
        ...(engineCaseSpec ? { engineCaseSpec } : {}),
        prompt: initial.prompt,
        changedPaths: initial.changedPaths,
        baseId: readJobString(job.input, "baseId"),
    };
}
|
|
1066
|
+
/**
 * Prepares the starting subject files and prompt for an attempt.
 *
 * The files are copies of `args.baseFiles`; when there are none, an empty file
 * is created at the first editable path (if any). If the first editable path
 * is still absent from the result, an empty text file is added for it.
 *
 * @param {{ baseFiles: Array<object>, spec: object, attemptIndex: number }} args
 * @returns {{ files: Array<object>, changedPaths: string[], prompt: string }}
 */
function createInitialSubjectFiles(args) {
    const [editPath] = optimizerEdits(args.spec).map(normalizeRelativePath);
    let files;
    if (args.baseFiles.length > 0) {
        files = args.baseFiles.map((file) => ({ ...file }));
    }
    else if (editPath) {
        files = normalizeSurfaceFiles([{ path: editPath, content: "" }]);
    }
    else {
        files = [];
    }
    const promptLines = [
        `Run the subject workload for benchmark: ${args.spec.benchmark.description}`,
        `Attempt ${args.attemptIndex + 1} uses ${formatOptimizerSummary(args.spec)}; the improve adapter may edit the subject before Workbench scores it.`,
    ];
    const prompt = promptLines.join("\n");
    const filesByPath = new Map();
    for (const file of files) {
        filesByPath.set(file.path, file);
    }
    // Guarantee the editable file exists so there is something to edit.
    if (editPath && !filesByPath.has(editPath)) {
        filesByPath.set(editPath, {
            path: editPath,
            kind: "text",
            encoding: "utf8",
            executable: false,
            content: "",
        });
    }
    const sortedFiles = Array.from(filesByPath.values()).sort((a, b) => a.path.localeCompare(b.path));
    return { files: sortedFiles, changedPaths: [], prompt };
}
|
|
1096
|
+
/**
 * Runs one execution job through a sandbox plane and returns the finished job.
 *
 * Any error raised after the execution spec has been read is converted into a
 * failed job via failWorkbenchRunJob rather than rejecting.
 *
 * @param {object} args Job runtime arguments (`job`, optional `now`, optional `adapterAuthProfiles`, ...).
 * @param {object} options `sandboxProvider`, optional `createSandboxPlaneForProvider`
 *   and optional `loadLocalAdapterAuthProfiles`.
 * @returns {Promise<object>} The completed or failed job.
 */
export async function executeWorkbenchExecutionJob(args, options) {
    const { job } = args;
    const startedAt = job.startedAt ?? args.now ?? new Date().toISOString();
    const execution = readWorkbenchExecutionSpec(job);
    try {
        const allowLocalProfiles = Boolean(options.loadLocalAdapterAuthProfiles);
        const adapterAuthProfiles = await explicitAdapterAuthProfilesForExecution(execution, args, allowLocalProfiles);
        // Only attach the profiles when at least one was resolved.
        const runtimeArgs = adapterAuthProfiles.length === 0
            ? args
            : { ...args, adapterAuthProfiles };
        const executionForSandbox = readWorkbenchExecutionSpec(runtimeArgs.job);
        const fileStore = createWorkbenchSandboxFileStore(runtimeArgs);
        const createPlane = options.createSandboxPlaneForProvider ?? createSandboxBackendPlaneForProvider;
        const plane = createPlane(options.sandboxProvider, runtimeArgs, startedAt, fileStore);
        const runnerIdCandidates = [
            process.env.WORKBENCH_WORKER_ID,
            process.env.EC2_INSTANCE_ID,
            os.hostname(),
            process.env.HOSTNAME,
        ];
        const { result } = await executeValidatedSandboxExecution(plane, executionForSandbox, {
            now: startedAt,
            runnerId: resolveWorkbenchWorkerId(runnerIdCandidates, "local-runner"),
            fileStore,
        });
        return completedJobFromSandboxResult(runtimeArgs.job, startedAt, result);
    }
    catch (failure) {
        return failWorkbenchRunJob(job, startedAt, failure);
    }
}
|
|
1124
|
+
/**
 * Resolves one adapter auth bundle per required auth target of an execution.
 *
 * Bundles supplied in `args.adapterAuthProfiles` (after sanitizing) take
 * precedence. Targets that were not supplied are loaded from the local auth
 * store, but only when `loadLocalAdapterProfiles` is true.
 *
 * Previously, as soon as any target was missing, every target was reloaded
 * from the local store and the supplied bundles were ignored, so a target that
 * had been supplied but was absent locally produced a spurious
 * ADAPTER_AUTH_REQUIRED error. Now only the missing targets hit the store.
 *
 * @param {object} execution
 * @param {object} args
 * @param {boolean} loadLocalAdapterProfiles
 * @returns {Promise<Array<object>>} Bundles in the order of the required targets;
 *   empty when the execution requires no adapter auth.
 * @throws {Error} ADAPTER_AUTH_REQUIRED for the first required target without a bundle.
 */
async function explicitAdapterAuthProfilesForExecution(execution, args, loadLocalAdapterProfiles) {
    const required = requiredAdapterAuthTargetsForExecution(execution, args);
    if (required.length === 0) {
        return [];
    }
    const providedByTarget = new Map();
    for (const bundle of args.adapterAuthProfiles ?? []) {
        const sanitized = sanitizeWorkbenchAdapterAuthBundle(bundle);
        providedByTarget.set(adapterAuthTargetKey(sanitized), sanitized);
    }
    const hasMissing = required.some((target) => !providedByTarget.has(adapterAuthTargetKey(target)));
    // Open the local store only when it is both needed and allowed.
    const store = hasMissing && loadLocalAdapterProfiles
        ? localWorkbenchAdapterAuthStore()
        : null;
    const resolved = await Promise.all(required.map(async (target) => {
        const provided = providedByTarget.get(adapterAuthTargetKey(target));
        if (provided || !store) {
            return provided;
        }
        return await store.get(target);
    }));
    const unresolvedIndex = resolved.findIndex((bundle) => !bundle);
    if (unresolvedIndex >= 0) {
        const target = required[unresolvedIndex];
        throw new Error(`ADAPTER_AUTH_REQUIRED: ${target.adapterId}${target.slot ? `/${target.slot}` : ""} disconnected. Run workbench auth connect ${target.adapterId}${target.slot ? `/${target.slot}` : ""}.`);
    }
    return resolved;
}
|
|
1152
|
+
/**
 * Builds the lookup key `<adapterId>/<slot>/<profile>` for an auth target or
 * bundle; a missing slot is written as `_`.
 *
 * @param {{ adapterId: string, slot?: string|null, profile: string }} target
 * @returns {string}
 */
function adapterAuthTargetKey(target) {
    const { adapterId, slot, profile } = target;
    const slotSegment = slot ?? "_";
    return `${adapterId}/${slotSegment}/${profile}`;
}
|
|
1155
|
+
/**
 * Public accessor for a job's execution purpose. Delegates to
 * readWorkbenchExecutionPurpose and returns its result unchanged.
 *
 * @param {object} job
 * @returns {*} Whatever readWorkbenchExecutionPurpose returns for the job.
 */
export function workbenchExecutionPurpose(job) {
    const purpose = readWorkbenchExecutionPurpose(job);
    return purpose;
}
|
|
1158
|
+
/**
 * Executes an adapter for the given execution inside the current sandbox
 * runtime.
 *
 * Adapter auth is materialized first and exposed to the runtime as
 * `adapterAuthRoot` / `adapterAuthEnv` when present. "improve" and "attempt"
 * purposes are dispatched to their executors; any other purpose, and any error
 * from the executors, yields a failed job. Auth cleanup and the event flush
 * always run afterwards and their own failures are deliberately ignored.
 *
 * @param {object} args
 * @param {object} execution
 * @param {string} startedAt
 * @param {*} capability
 * @returns {Promise<object>}
 */
export async function executeAdapterInCurrentSandboxRuntime(args, execution, startedAt, capability) {
    const { job } = args;
    const eventPublisher = createWorkbenchExecutionEventPublisher({
        projectId: job.projectId,
        runId: job.runId,
        jobId: job.id,
        executionId: execution.id,
        attempt: Math.max(1, job.attempt),
        target: args.progress,
    });
    const adapterAuth = await materializeSandboxAdapterAuth(args, execution);
    const runtimeInput = { ...args };
    if (adapterAuth.root) {
        runtimeInput.adapterAuthRoot = adapterAuth.root;
    }
    if (Object.keys(adapterAuth.env).length > 0) {
        runtimeInput.adapterAuthEnv = adapterAuth.env;
    }
    try {
        switch (execution.purpose) {
            case "improve":
                return await executeSubjectRevisionExecutionInSandbox(runtimeInput, execution, startedAt, capability, eventPublisher);
            case "attempt":
                return await executeAttemptExecutionInSandbox(runtimeInput, execution, startedAt, capability, eventPublisher);
            default:
                throw new Error(`Unsupported execution purpose ${execution.purpose}.`);
        }
    }
    catch (failure) {
        return failWorkbenchRunJob(job, startedAt, failure);
    }
    finally {
        // Best-effort teardown: neither step may mask the job result.
        if (adapterAuth.cleanup) {
            await adapterAuth.cleanup().catch(() => undefined);
        }
        await eventPublisher.flush().catch(() => undefined);
    }
}
|
|
1194
|
+
/**
 * Materializes adapter auth for an execution.
 *
 * Environment variables from every bundle are merged into `env`. When at least
 * one bundle carries files, they are written below a fresh temporary directory
 * (inside `args.workdir`, or the OS temp dir) and a `cleanup` callback that
 * removes the directory is returned along with its `root`.
 *
 * If writing the files fails, the temporary directory is removed before the
 * error is rethrown so partially written credentials are not left on disk.
 *
 * @param {object} args
 * @param {object} execution
 * @returns {Promise<{ env: object, root?: string, cleanup?: () => Promise<void> }>}
 */
async function materializeSandboxAdapterAuth(args, execution) {
    const adapterProfiles = adapterAuthProfilesForExecution(execution, args);
    if (adapterProfiles.length === 0) {
        return { env: {} };
    }
    const env = {};
    for (const bundle of adapterProfiles) {
        Object.assign(env, adapterAuthEnv(bundle));
    }
    const adapterFileBundles = adapterProfiles.filter((bundle) => bundle.files.length > 0);
    if (adapterFileBundles.length === 0) {
        return { env };
    }
    const [fs, os, path] = await Promise.all([
        importNodeModule(nodeBuiltin("fs/promises")),
        importNodeModule(nodeBuiltin("os")),
        importNodeModule(nodeBuiltin("path")),
    ]);
    const base = args.workdir ?? os.tmpdir();
    await fs.mkdir(base, { recursive: true });
    const root = await fs.mkdtemp(path.join(base, "workbench-adapter-auth-"));
    const cleanup = async () => {
        await fs.rm(root, { recursive: true, force: true });
    };
    try {
        await materializeAdapterAuthProfiles(adapterFileBundles, root, fs, path);
    }
    catch (error) {
        // Removal is best effort; the write failure is the error worth reporting.
        await cleanup().catch(() => undefined);
        throw error;
    }
    return { root, env, cleanup };
}
|
|
1226
|
+
/**
 * Writes the files of each auth bundle to
 * `<root>/<adapterId>/<slot or "_">/<profile>/<file.path>`.
 *
 * Base64 files are decoded before writing; files are created with
 * `file.mode`, defaulting to 0o600.
 *
 * Paths come from the bundles, so every destination is checked to stay inside
 * its parent directory before anything is written; `..` segments that would
 * escape the auth root are rejected.
 *
 * @param {Array<object>} bundles Bundles with `adapterId`, optional `slot`, `profile` and `files`.
 * @param {string} root Directory to write below.
 * @param {object} fs Object providing promise-based `mkdir` and `writeFile`.
 * @param {object} path Node `path` module.
 * @returns {Promise<void>}
 * @throws {Error} When a bundle or file path resolves outside its parent directory.
 */
async function materializeAdapterAuthProfiles(bundles, root, fs, path) {
    const isStrictlyInside = (parent, candidate) => {
        const relative = path.relative(parent, candidate);
        return relative !== "" &&
            relative !== ".." &&
            !relative.startsWith(`..${path.sep}`) &&
            !path.isAbsolute(relative);
    };
    for (const bundle of bundles) {
        const targetRoot = path.join(root, bundle.adapterId, bundle.slot ?? "_", bundle.profile);
        if (!isStrictlyInside(root, targetRoot)) {
            throw new Error(`Adapter auth profile ${bundle.adapterId}/${bundle.slot ?? "_"}/${bundle.profile} escapes the adapter auth root.`);
        }
        for (const file of bundle.files) {
            const targetPath = path.join(targetRoot, file.path);
            if (!isStrictlyInside(targetRoot, targetPath)) {
                throw new Error(`Adapter auth file path "${file.path}" escapes its profile directory.`);
            }
            await fs.mkdir(path.dirname(targetPath), { recursive: true });
            const data = file.encoding === "base64"
                ? Buffer.from(file.content, "base64")
                : file.content;
            await fs.writeFile(targetPath, data, { mode: file.mode ?? 0o600 });
        }
    }
}
|
|
1238
|
+
/**
 * Builds the secret-free auth description for an adapter request.
 *
 * Each bundle becomes a descriptor with `method`, `profile`, an `env` map
 * whose values are the literal string "materialized", and, when the bundle
 * has files, `filesRoot` (only if `root` is set) plus the file paths and
 * encodings. Descriptors are grouped by adapter id and slot ("default" when
 * the bundle has no slot).
 *
 * @param {Array<object>} bundles - Auth bundles to describe.
 * @param {string|undefined} root - Directory the files were written under.
 * @param {string|undefined} currentAdapterId - Adapter whose descriptors are
 *   also listed under `self`; when falsy every descriptor is listed there.
 * @returns {{self?: object, adapters?: object}} Empty sections are omitted.
 */
function adapterAuthRequest(bundles, root, currentAdapterId) {
    const ownSlots = {};
    const byAdapter = {};
    for (const bundle of bundles) {
        const slotKey = bundle.slot ?? "default";
        const descriptor = {
            method: bundle.method,
            profile: bundle.profile,
        };
        if (bundle.env && bundle.env.length > 0) {
            descriptor.env = Object.fromEntries(bundle.env.map((variable) => [variable.name, "materialized"]));
        }
        if (bundle.files.length > 0) {
            if (root) {
                descriptor.filesRoot = `${root}/${bundle.adapterId}/${bundle.slot ?? "_"}/${bundle.profile}`;
            }
            descriptor.files = bundle.files.map(({ path, encoding }) => ({ path, encoding }));
        }
        byAdapter[bundle.adapterId] = {
            ...(byAdapter[bundle.adapterId] ?? {}),
            [slotKey]: descriptor,
        };
        const belongsToCurrent = !currentAdapterId || bundle.adapterId === currentAdapterId;
        if (belongsToCurrent) {
            ownSlots[slotKey] = descriptor;
        }
    }
    const request = {};
    if (Object.keys(ownSlots).length > 0) {
        request.self = ownSlots;
    }
    if (Object.keys(byAdapter).length > 0) {
        request.adapters = byAdapter;
    }
    return request;
}
|
|
1277
|
+
/**
 * Resolves the auth section to place in the adapter request of one phase.
 *
 * @param {object} args - Run arguments; reads `adapterAuthProfiles`,
 *   `adapterAuthRoot` and `adapterAuthRequest`.
 * @param {string} adapterId - Adapter that runs the phase.
 * @returns {object|undefined} A request built from the sanitized profiles, or
 *   `args.adapterAuthRequest` unchanged when there are no profiles.
 */
function adapterAuthRequestForPhase(args, adapterId) {
    const configured = args.adapterAuthProfiles ?? [];
    const sanitized = configured.map((bundle) => sanitizeWorkbenchAdapterAuthBundle(bundle));
    return sanitized.length > 0
        ? adapterAuthRequest(sanitized, args.adapterAuthRoot, adapterId)
        : args.adapterAuthRequest;
}
|
|
1285
|
+
/**
 * Selects the sanitized auth bundles that an execution actually requires.
 *
 * A bundle is kept when some required target has the same `adapterId` and
 * `profile`, and either names no slot or names the bundle's slot.
 *
 * @param {object} execution - Execution spec.
 * @param {object} args - Run arguments; reads `adapterAuthProfiles`.
 * @returns {Array<object>} Matching bundles, in their original order.
 */
function adapterAuthProfilesForExecution(execution, args) {
    const sanitized = (args.adapterAuthProfiles ?? [])
        .map((bundle) => sanitizeWorkbenchAdapterAuthBundle(bundle));
    if (sanitized.length === 0) {
        return [];
    }
    const requiredTargets = requiredAdapterAuthTargetsForExecution(execution, args);
    const satisfies = (bundle, target) => {
        if (bundle.adapterId !== target.adapterId) {
            return false;
        }
        if (bundle.profile !== target.profile) {
            return false;
        }
        return target.slot === undefined || bundle.slot === target.slot;
    };
    return sanitized.filter((bundle) => requiredTargets.some((target) => satisfies(bundle, target)));
}
|
|
1296
|
+
/**
 * Lists the normalized auth targets needed by the adapters of an execution.
 *
 * @param {object} execution - Execution spec.
 * @param {object} args - Run arguments; reads `spec` and `adapterManifests`
 *   (treated as an empty list when absent).
 * @returns {Array<object>} Normalized auth targets.
 */
function requiredAdapterAuthTargetsForExecution(execution, args) {
    const knownManifests = args.adapterManifests ?? [];
    const invocations = adapterInvocationsForExecution(execution, args.spec);
    const requirements = collectWorkbenchAdapterAuthRequirements(invocations, knownManifests);
    return requirements.map((requirement) => normalizeWorkbenchAdapterAuthTarget(requirement));
}
|
|
1301
|
+
/**
 * Lists the adapter invocations that take part in an execution.
 *
 * @param {object} execution - Execution spec; reads `purpose` and `adapter`.
 * @param {object} spec - Benchmark spec; `spec.run` is only read for
 *   "attempt" executions.
 * @returns {Array<object>} For attempts, the de-duplicated pair of the
 *   execution adapter and `spec.run`; otherwise just the execution adapter.
 */
function adapterInvocationsForExecution(execution, spec) {
    return execution.purpose === "attempt"
        ? uniqueAdapterInvocations([execution.adapter, spec.run])
        : [execution.adapter];
}
|
|
1307
|
+
/**
 * Removes duplicate invocations, keeping the first occurrence of each.
 *
 * Two invocations count as duplicates when their `JSON.stringify` output is
 * identical, so property order matters.
 *
 * @param {Iterable<object>} invocations - Invocations to de-duplicate.
 * @returns {Array<object>} First occurrences, in encounter order.
 */
function uniqueAdapterInvocations(invocations) {
    const firstByFingerprint = new Map();
    for (const invocation of invocations) {
        const fingerprint = JSON.stringify(invocation);
        if (!firstByFingerprint.has(fingerprint)) {
            firstByFingerprint.set(fingerprint, invocation);
        }
    }
    return [...firstByFingerprint.values()];
}
|
|
1320
|
+
/**
 * Derives the final job record from a sandbox execution result.
 *
 * Precedence: a non-array object in `result.metadata.completedJob` is returned
 * as is; otherwise a "succeeded" result yields a success record based on
 * `fallbackJob`; anything else yields a failed job with the sandbox metadata
 * attached.
 *
 * @param {object} fallbackJob - Job to base the record on.
 * @param {string} startedAt - Start time used when the result has none.
 * @param {object} result - Sandbox execution result.
 * @returns {object} The completed job record.
 */
function completedJobFromSandboxResult(fallbackJob, startedAt, result) {
    const reportedJob = asRuntimeRecord(result.metadata).completedJob;
    const isPlainRecord = Boolean(reportedJob) &&
        typeof reportedJob === "object" &&
        !Array.isArray(reportedJob);
    if (isPlainRecord) {
        return reportedJob;
    }
    if (result.status !== "succeeded") {
        const failedJob = failWorkbenchRunJob(fallbackJob, result.startedAt || startedAt, result.error ?? `Sandbox execution ${result.status}.`, result.finishedAt);
        return attachSandboxMetadataToJob(failedJob, asRuntimeRecord(result.metadata).sandbox);
    }
    const finishedAt = result.finishedAt || new Date().toISOString();
    return {
        ...fallbackJob,
        status: "succeeded",
        attempt: Math.max(1, fallbackJob.attempt),
        startedAt: result.startedAt || startedAt,
        finishedAt,
        updatedAt: finishedAt,
        output: {
            ok: true,
            executionId: result.executionId,
        },
    };
}
|
|
1344
|
+
/**
 * Runs an improve-style adapter execution and turns its result into a job
 * record carrying the resulting subject patch and revised files.
 *
 * The job fails when the adapter reports an error or a non-zero exit code, or
 * when the resulting patch changes no files.
 *
 * @param {object} args - Run arguments (`job`, `spec`, ...).
 * @param {object} execution - Execution spec.
 * @param {string} startedAt - Start timestamp recorded on the job.
 * @param {*} capability - Forwarded to the hosted runner.
 * @param {*} eventPublisher - Forwarded to the hosted runner.
 * @returns {Promise<object>} Succeeded or failed job record.
 */
async function executeSubjectRevisionExecutionInSandbox(args, execution, startedAt, capability, eventPublisher) {
    const hosted = await runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher);
    const { workload, result } = hosted;
    const fail = (message, failedAt) => failWorkbenchRunJob(args.job, startedAt, message, failedAt, result);
    const exitedCleanly = (result.exitCode ?? 0) === 0;
    if (result.error || !exitedCleanly) {
        return fail(result.error ?? `Adapter ${execution.adapter.use} exited with status ${result.exitCode}.`, result.finishedAt);
    }
    const finishedAt = result.finishedAt ?? new Date().toISOString();
    const subjectPatch = createSubjectPatchFromResult(result, args.spec);
    if (subjectPatch.fileChanges.length === 0) {
        const adapterLabel = execution.adapter.use === "command"
            ? "Command improve adapter"
            : `Adapter ${execution.adapter.use}`;
        return fail(`${adapterLabel} completed without changing subject files covered by optimizer edits.`, finishedAt);
    }
    const revisedFiles = applyWorkbenchSubjectPatch({
        baseFiles: workload.subjectFiles,
        patch: subjectPatch,
        edits: requireOptimizerEdits(args.spec),
    });
    const usage = assignUsageRole("optimizer", result.usage);
    const output = {
        ok: true,
        executionId: execution.id,
        purpose: execution.purpose,
        subjectId: workload.subjectId,
        attemptIndex: workload.attemptIndex,
        baseId: workload.baseId,
        prompt: workload.prompt,
        subjectPatch,
        fileChanges: subjectPatch.fileChanges,
        files: revisedFiles,
        traces: traceFilePaths(result.files),
    };
    if (usage) {
        output.usage = usage;
    }
    if (result.summary !== undefined) {
        output.summary = result.summary;
    }
    if (result.feedback !== undefined) {
        output.feedback = result.feedback;
    }
    return {
        ...args.job,
        status: "succeeded",
        attempt: Math.max(1, args.job.attempt),
        startedAt,
        finishedAt,
        updatedAt: finishedAt,
        output,
    };
}
|
|
1385
|
+
/**
 * Runs the phases of an attempt execution and scores the outcome.
 *
 * The job fails when the phases report an error or non-zero exit code, or when
 * the engine result lacks a finite numeric `score`.
 *
 * @param {object} args - Run arguments (`job`, `spec`, file sets, manifests).
 * @param {object} execution - Execution spec.
 * @param {string} startedAt - Start timestamp recorded on the job.
 * @param {*} capability - Forwarded to the phase runner.
 * @param {*} eventPublisher - Forwarded to the phase runner.
 * @returns {Promise<object>} Succeeded or failed job record.
 */
async function executeAttemptExecutionInSandbox(args, execution, startedAt, capability, eventPublisher) {
    const workload = createWorkbenchRunWorkload({
        job: args.job,
        spec: args.spec,
        baseFiles: args.baseFiles,
        engineResolveFiles: args.engineResolveFiles,
        engineCases: args.engineCases,
        traceFiles: args.traceFiles,
    });
    const phases = attemptPhasesForExecution(execution, args.spec, args.adapterManifests);
    const workloadResult = await runHostedCommandExecutionPhases(args, workload, phases, startedAt, { capability, eventPublisher });
    const fail = (message) => failWorkbenchRunJob(args.job, startedAt, message, workloadResult.finishedAt, workloadResult);
    if (workloadResult.error || (workloadResult.exitCode ?? 0) !== 0) {
        return fail(workloadResult.error ?? `Attempt adapter execution exited with status ${workloadResult.exitCode}.`);
    }
    const engineResult = workloadResult.result;
    const hasFiniteScore = Boolean(engineResult) &&
        typeof engineResult.score === "number" &&
        Number.isFinite(engineResult.score);
    if (!hasFiniteScore) {
        return fail("Attempt engine must return a workbench-result result with a finite numeric score.");
    }
    const finishedAt = workloadResult.finishedAt ?? new Date().toISOString();
    const usage = mergeUsageSummaries([workloadResult.usage, engineResult.usage]);
    const usagePart = usage ? { usage } : {};
    const sample = evaluateSample({
        subjectId: workload.subjectId,
        files: workloadResult.files,
        engineResolveFiles: workload.engineResolveFiles,
        spec: workload.spec,
        attemptIndex: workload.attemptIndex,
        sampleIndex: workload.sampleIndex,
        caseId: workload.caseId,
        startedAt,
        finishedAt,
        durationMs: workloadResult.durationMs,
        workload: {
            ...workloadResult,
            ...usagePart,
            result: engineResult,
        },
    });
    // Fall back to the workload's own change list when the run declared none.
    const fileChanges = workloadResult.fileChanges.length > 0
        ? workloadResult.fileChanges
        : workload.changedPaths;
    return {
        ...args.job,
        status: "succeeded",
        attempt: Math.max(1, args.job.attempt),
        startedAt,
        finishedAt: sample.finishedAt,
        updatedAt: sample.finishedAt ?? startedAt,
        output: {
            ok: true,
            executionId: execution.id,
            purpose: execution.purpose,
            subjectId: workload.subjectId,
            attemptIndex: workload.attemptIndex,
            sampleIndex: workload.sampleIndex,
            caseId: workload.caseId,
            prompt: workload.prompt,
            result: engineResult,
            fileChanges,
            files: workloadResult.files,
            sample,
            ...usagePart,
            traces: traceFilePaths(workloadResult.files),
        },
    };
}
|
|
1456
|
+
/**
 * Builds the workload for a job and runs its single protocol phase.
 *
 * @param {object} args - Run arguments (`job`, `spec`, file sets, manifests).
 * @param {object} execution - Execution spec.
 * @param {string} startedAt - Start timestamp forwarded to the phase runner.
 * @param {*} capability - Forwarded to the phase runner.
 * @param {*} eventPublisher - Forwarded to the phase runner.
 * @returns {Promise<{workload: object, result: object}>}
 */
async function runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher) {
    const workload = createWorkbenchRunWorkload({
        job: args.job,
        spec: args.spec,
        baseFiles: args.baseFiles,
        engineResolveFiles: args.engineResolveFiles,
        engineCases: args.engineCases,
        traceFiles: args.traceFiles,
    });
    const protocolPhase = protocolPhaseForExecution(execution, args.adapterManifests);
    const runOptions = { capability, eventPublisher };
    const result = await runHostedCommandExecutionPhases(args, workload, [protocolPhase], startedAt, runOptions);
    return { workload, result };
}
|
|
1471
|
+
/**
 * Stages a workload into a workspace and runs its adapter phases in order.
 *
 * For every phase the previous operation result is reset, the adapter request
 * is written, the phase command is run through `sh -c`, and the operation
 * result is read and checked. Start, success and failure events are published
 * per phase. The first failing phase stops the sequence; its error is written
 * to `sandbox_error.log` in the output directory (best effort) and a failure
 * result is returned instead of throwing.
 *
 * @param {object} args - Run arguments; reads `environmentVersion`,
 *   `adapterManifests`, `adapterAuthEnv` and the workspace location fields.
 * @param {object} workload - Workload created for the job.
 * @param {Array<object>} phases - Phases to run, in order.
 * @param {string} startedAt - Start timestamp recorded on the result.
 * @param {{capability?: *, eventPublisher?: *}} [options] - Only
 *   `eventPublisher` is used here.
 * @returns {Promise<object>} Workload result, or a failure result when a
 *   phase failed.
 */
async function runHostedCommandExecutionPhases(args, workload, phases, startedAt, options = {}) {
    const [{ execFile }, fs, os, path, { promisify }] = await Promise.all([
        importNodeModule(nodeBuiltin("child_process")),
        importNodeModule(nodeBuiltin("fs/promises")),
        importNodeModule(nodeBuiltin("os")),
        importNodeModule(nodeBuiltin("path")),
        importNodeModule(nodeBuiltin("util")),
    ]);
    const execFileAsync = promisify(execFile);
    const resolvedRuntime = workload.engineCaseSpec
        ? resolveEngineCaseExecutionConfig({
            spec: workload.spec,
            engineCase: workload.engineCaseSpec,
        }).environment
        : workload.spec.environment;
    // A falsy override is treated exactly like "no override".
    const environmentVersion = environmentVersionForRuntime(resolvedRuntime, args.environmentVersion || undefined);
    const workspace = await createRuntimeWorkspaceRoot(args, fs, os, path, "workbench-execution-sandbox-");
    try {
        await stageWorkbenchRunWorkload(workspace.root, workload);
        let exitCode = 0;
        let runtimeError;
        const phaseResults = [];
        try {
            if (!environmentVersion) {
                throw new Error("environment is required for adapter command executions.");
            }
            await fs.writeFile(path.join(outputDir(workspace.root), "sandbox-environment.json"), `${JSON.stringify({
                imageRef: environmentVersion.imageRef,
                resources: environmentVersion.spec.resources,
                network: environmentVersion.spec.network,
            }, null, 2)}\n`);
            const phaseTimeoutMs = environmentVersionTimeoutMs(environmentVersion);
            const execution = readWorkbenchExecutionSpec(workload.job);
            for (const phase of phases) {
                await resetHostedWorkloadPhaseOutput(workspace.root, phase);
                if (phase.kind === "engine" && execution.purpose === "attempt") {
                    await stageAttemptScoringInputs(workspace.root, workload);
                }
                const adapterRequestPath = await writeWorkbenchAdapterRequest(workspace.root, workload, execution, phase, adapterAuthRequestForPhase(args, phase.adapter?.use ?? execution.adapter.use), args.adapterManifests);
                const phaseRole = phaseEventRole(phase);
                const rolePart = phaseRole ? { role: phaseRole } : {};
                await publishCommandPhaseEvent(options.eventPublisher, {
                    phase: phase.label,
                    status: "started",
                    ...rolePart,
                });
                try {
                    if (!phase.command) {
                        throw new Error(`Adapter phase ${phase.label} is missing a command.`);
                    }
                    const command = createHostedWorkloadShellCommand(workspace.root, phase.command, phase.label, phase.okExitCodes);
                    await execFileAsync("sh", ["-c", command], {
                        cwd: workspace.root,
                        env: createHostedWorkloadPhaseEnv(workspace.root, adapterRequestPath, args.adapterAuthEnv),
                        maxBuffer: 10 * 1024 * 1024,
                        timeout: phaseTimeoutMs,
                    });
                    const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), phase.operation);
                    assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${phase.adapter?.use ?? execution.adapter.use} ${phase.operation}`);
                    phaseResults.push(operationResult);
                    await publishCommandPhaseEvent(options.eventPublisher, {
                        phase: phase.label,
                        status: "succeeded",
                        ...rolePart,
                    });
                }
                catch (error) {
                    await publishCommandPhaseEvent(options.eventPublisher, {
                        phase: phase.label,
                        status: "failed",
                        exitCode: readExitCode(error),
                        error: error instanceof Error ? error.message : String(error),
                        ...rolePart,
                    });
                    // Rethrow so the outer handler stops the remaining phases.
                    throw error;
                }
            }
        }
        catch (error) {
            exitCode = readExitCode(error);
            if (exitCode === 0) {
                // A failure must never be reported with a success exit code.
                exitCode = 1;
            }
            runtimeError =
                error instanceof Error ? (error.stack ?? error.message) : String(error);
            await fs
                .writeFile(path.join(outputDir(workspace.root), "sandbox_error.log"), `${runtimeError}\n`)
                .catch(() => undefined);
        }
        if (exitCode !== 0) {
            return await readHostedRunFailureResult(workspace.root, workload, {
                exitCode,
                error: runtimeError ?? `Runtime command exited with status ${exitCode}.`,
                startedAt,
            });
        }
        return await readWorkbenchRunWorkloadResult(workspace.root, workload, {
            exitCode,
            startedAt,
            phaseResults,
        });
    }
    finally {
        await workspace.cleanup();
    }
}
|
|
1582
|
+
/**
 * Chooses the directory a workload is staged and executed in.
 *
 * Order of preference:
 *  1. `args.workspaceRoot` - created if needed, never removed by `cleanup`.
 *  2. A new temporary directory below `args.workdir` (created if needed).
 *  3. `/workspace`, when it exists as a directory - never removed.
 *  4. A new temporary directory below the OS temp directory.
 *
 * @param {object} args - Run arguments; reads `workspaceRoot` and `workdir`.
 * @param {object} fs - `fs/promises`-compatible module.
 * @param {object} os - `os`-compatible module.
 * @param {object} path - `path`-compatible module.
 * @param {string} prefix - Name prefix for temporary directories.
 * @returns {Promise<{root: string, cleanup: () => Promise<void>}>} `cleanup`
 *   removes temporary directories and ignores removal errors.
 */
async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
    const keepInPlace = (root) => ({ root, cleanup: async () => undefined });
    const disposableUnder = async (parent) => {
        const root = await fs.mkdtemp(path.join(parent, prefix));
        return {
            root,
            cleanup: async () => {
                await fs
                    .rm(root, { force: true, recursive: true })
                    .catch(() => undefined);
            },
        };
    };
    if (args.workspaceRoot) {
        await fs.mkdir(args.workspaceRoot, { recursive: true });
        return keepInPlace(args.workspaceRoot);
    }
    if (args.workdir) {
        await fs.mkdir(args.workdir, { recursive: true });
        return disposableUnder(args.workdir);
    }
    const sandboxRoot = "/workspace";
    const sandboxRootStat = await fs.stat(sandboxRoot).catch(() => null);
    return sandboxRootStat?.isDirectory()
        ? keepInPlace(sandboxRoot)
        : disposableUnder(os.tmpdir());
}
|
|
1617
|
+
/**
 * Maps a phase to the role reported in phase events.
 *
 * @param {{kind: string}} phase - Phase descriptor.
 * @returns {"optimizer"|"runner"|"engine"|undefined} The phase kind when it is
 *   one of the three known roles, otherwise `undefined`.
 */
function phaseEventRole(phase) {
    switch (phase.kind) {
        case "optimizer":
        case "runner":
        case "engine":
            return phase.kind;
        default:
            return undefined;
    }
}
|
|
1629
|
+
/**
 * Returns the usage of an adapter operation result, tagged with the role
 * implied by the operation name.
 *
 * @param {{operation: string, usage: *}} result - Adapter operation result.
 * @returns {*} `assignUsageRole(role, usage)` for "optimizer.improve",
 *   "subject.run" and "engine.run"; the untouched `usage` for anything else.
 */
function adapterOperationUsageSummary(result) {
    let role;
    switch (result.operation) {
        case "optimizer.improve":
            role = "optimizer";
            break;
        case "subject.run":
            role = "runner";
            break;
        case "engine.run":
            role = "engine";
            break;
        default:
            return result.usage;
    }
    return assignUsageRole(role, result.usage);
}
|
|
1641
|
+
/**
 * Maps an execution purpose to a role name.
 *
 * @param {string} purpose - Execution purpose.
 * @returns {"optimizer"|"runner"} "optimizer" for "improve", "runner" for
 *   every other value.
 */
function executionPurposeRole(purpose) {
    return purpose === "improve" ? "optimizer" : "runner";
}
|
|
1647
|
+
/**
 * Produces the subject patch for an improve result.
 *
 * An explicit `result.subjectPatch` is returned untouched. Otherwise the patch
 * is derived from `result.fileChanges`: paths are normalized, anything below
 * `.workbench/` or outside the optimizer edit paths is dropped, and the
 * matching entries of `result.files` are included with normalized paths.
 *
 * @param {object} result - Workload result (`subjectPatch`, `fileChanges`,
 *   `files`, `summary`, `feedback`).
 * @param {object} spec - Benchmark spec used to look up the optimizer edits.
 * @returns {object} Patch with `files`, `fileChanges`, and `summary` /
 *   `feedback` when present.
 */
function createSubjectPatchFromResult(result, spec) {
    const { subjectPatch } = result;
    if (subjectPatch) {
        return subjectPatch;
    }
    const isCoveredEdit = (filePath) => !filePath.startsWith(".workbench/") &&
        isSubjectEditPath(filePath, optimizerEdits(spec));
    const fileChanges = result.fileChanges
        .map(normalizeRelativePath)
        .filter(isCoveredEdit);
    const changedPaths = new Set(fileChanges);
    const files = result.files.flatMap((file) => {
        const normalizedPath = normalizeRelativePath(file.path);
        return changedPaths.has(normalizedPath)
            ? [{ ...file, path: normalizedPath }]
            : [];
    });
    const patch = { files, fileChanges };
    if (result.summary) {
        patch.summary = result.summary;
    }
    if (result.feedback !== undefined) {
        patch.feedback = result.feedback;
    }
    return patch;
}
|
|
1666
|
+
/**
 * Tells whether a file is covered by one of the editable paths.
 *
 * Both sides are normalized and trailing slashes are removed from the edit
 * path. A file is covered when it equals an edit path or lies below it.
 *
 * @param {string} filePath - File path to test.
 * @param {Array<string>} edits - Editable files or directories.
 * @returns {boolean}
 */
function isSubjectEditPath(filePath, edits) {
    const candidate = normalizeRelativePath(filePath);
    for (const entry of edits) {
        const editRoot = normalizeRelativePath(entry).replace(/\/+$/u, "");
        if (candidate === editRoot || candidate.startsWith(`${editRoot}/`)) {
            return true;
        }
    }
    return false;
}
|
|
1673
|
+
/**
 * Resolves the environment version for the environment declared in a spec.
 *
 * @param {{environment: object}} spec - Benchmark spec.
 * @returns {object} Result of `environmentVersionForRuntime` for
 *   `spec.environment`, without a base version.
 */
function environmentVersionForSpec(spec) {
    const { environment } = spec;
    return environmentVersionForRuntime(environment);
}
|
|
1676
|
+
/**
 * Builds the environment version record for a runtime environment.
 *
 * The version is looked up by `runtime.dockerfile` among the default
 * environment versions, falling back to `base`. Whichever is found supplies
 * `id`, `imageRef` and the starting `spec`; network and resources always come
 * from the runtime. When nothing is found, a "spec_environment" record is
 * returned whose image reference is the dockerfile with a `dockerfile://`
 * prefix (added only if missing).
 *
 * @param {object} runtime - Runtime environment; reads `dockerfile`.
 * @param {object} [base] - Version used when no default version matches.
 * @returns {{id: string, imageRef: string, spec: object}}
 */
function environmentVersionForRuntime(runtime, base) {
    const { dockerfile } = runtime;
    const known = findEnvironmentVersionForImage(dockerfile, DEFAULT_ENVIRONMENT_VERSIONS) ?? base;
    const runtimeOverrides = () => ({
        network: environmentNetwork(runtime),
        resources: definedEnvironmentResources(environmentResources(runtime)),
    });
    if (!known) {
        const imageRef = dockerfile.startsWith("dockerfile://")
            ? dockerfile
            : `dockerfile://${dockerfile}`;
        return {
            id: "spec_environment",
            imageRef,
            spec: { base: "custom", ...runtimeOverrides() },
        };
    }
    return {
        id: known.id,
        imageRef: known.imageRef,
        spec: { ...known.spec, ...runtimeOverrides() },
    };
}
|
|
1702
|
+
/**
 * Fills in defaults for missing environment resource limits.
 *
 * Defaults: 2 CPUs, 4 GB memory, 10 GB disk, 30 minutes timeout. A default is
 * applied only when the field is `null` or `undefined`; other fields of
 * `resources` are not copied.
 *
 * @param {object|null|undefined} resources - Declared resources.
 * @returns {{cpu: number, memoryGb: number, diskGb: number, timeoutMinutes: number}}
 */
function definedEnvironmentResources(resources) {
    const fallbacks = [
        ["cpu", 2],
        ["memoryGb", 4],
        ["diskGb", 10],
        ["timeoutMinutes", 30],
    ];
    return Object.fromEntries(fallbacks.map(([field, fallback]) => [field, resources?.[field] ?? fallback]));
}
|
|
1710
|
+
/**
 * Lays out the files a workload needs inside a workspace directory.
 *
 * The input, output, private and logs directories are removed first (removal
 * errors are ignored), then input and output are recreated. For "attempt"
 * workloads the subject files are written to both the subject directory and
 * the workspace root, and the subject-visible case files to the case
 * directory. For "improve" workloads the subject files are written to the
 * same two places and the trace files to the traces directory. Other purposes
 * get only the empty input and output directories.
 *
 * @param {string} root - Workspace root.
 * @param {object} workload - Workload to stage.
 * @returns {Promise<void>}
 */
export async function stageWorkbenchRunWorkload(root, workload) {
    const fs = await importNodeModule(nodeBuiltin("fs/promises"));
    const purpose = readWorkloadExecutionPurpose(workload);
    const removeQuietly = (directory) => fs
        .rm(directory, { recursive: true, force: true })
        .catch(() => undefined);
    const ensureDirectory = (directory) => fs.mkdir(directory, { recursive: true });
    await Promise.all([
        removeQuietly(inputDir(root)),
        removeQuietly(outputDir(root)),
        removeQuietly(runtimePrivateDir(root)),
        removeQuietly(runtimeLogsDir(root)),
    ]);
    await ensureDirectory(inputDir(root));
    await ensureDirectory(outputDir(root));
    switch (purpose) {
        case "attempt": {
            assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
            await ensureDirectory(subjectDir(root));
            await ensureDirectory(caseDir(root));
            await ensureDirectory(runtimeLogsAgentDir(root));
            await ensureDirectory(runtimeLogsVerifierDir(root));
            const engineCase = requireWorkloadEngineCase(workload, "Attempt staging");
            await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
            await writeSurfaceFiles(caseDir(root), engineCaseSubjectVisibleFiles(engineCase));
            await writeSurfaceFiles(root, workload.subjectFiles);
            break;
        }
        case "improve": {
            assertMutableWorkspaceFiles(workload.subjectFiles, "Subject files");
            await ensureDirectory(subjectDir(root));
            await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
            await writeSurfaceFiles(root, workload.subjectFiles);
            await ensureDirectory(tracesDir(root));
            await writeSurfaceFiles(tracesDir(root), workload.traceFiles);
            break;
        }
        default:
            break;
    }
}
|
|
1750
|
+
/**
 * Prepares the workspace for the scoring (engine) phase of an attempt.
 *
 * The engine-private directory and the verifier log directory are emptied
 * (removal errors are ignored) and recreated, then the engine-private files
 * of the workload's engine case are written.
 *
 * @param {string} root - Workspace root.
 * @param {object} workload - Workload; must carry an engine case.
 * @returns {Promise<void>}
 */
async function stageAttemptScoringInputs(root, workload) {
    const fs = await importNodeModule(nodeBuiltin("fs/promises"));
    const engineCase = requireWorkloadEngineCase(workload, "Attempt scoring");
    const removeQuietly = (directory) => fs
        .rm(directory, { recursive: true, force: true })
        .catch(() => undefined);
    await Promise.all([
        removeQuietly(runtimeEnginePrivateDir(root)),
        removeQuietly(runtimeLogsVerifierDir(root)),
    ]);
    await fs.mkdir(runtimeEnginePrivateDir(root), { recursive: true });
    await fs.mkdir(runtimeLogsVerifierDir(root), { recursive: true });
    const privateFiles = engineCaseEnginePrivateFiles(engineCase);
    await writeSurfaceFiles(runtimeEnginePrivateDir(root), privateFiles);
}
|
|
1765
|
+
/**
 * Collects whatever a failed run left behind into a failure result.
 *
 * @param {string} root - Workspace root.
 * @param {object} workload - Workload that was run.
 * @param {{exitCode: number, error: string, startedAt?: string}} options -
 *   Failure details; `startedAt` defaults to the current time.
 * @returns {Promise<object>} Result with the filtered output files and trace
 *   files sorted by path, every file path as `fileChanges`, the exit code,
 *   the error and the timestamps.
 */
async function readHostedRunFailureResult(root, workload, options) {
    const traces = await readRuntimeTraceFiles(root, workload);
    const outputs = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root)));
    const startedAt = options.startedAt ?? new Date().toISOString();
    const finishedAt = new Date().toISOString();
    const byPath = (left, right) => left.path.localeCompare(right.path);
    const files = outputs.concat(traces).sort(byPath);
    const { exitCode, error } = options;
    return {
        files,
        fileChanges: files.map(({ path }) => path),
        exitCode,
        error,
        startedAt,
        finishedAt,
    };
}
|
|
1780
|
+
/**
 * Assembles the result of a completed workload from the workspace and the
 * per-phase operation results.
 *
 * The primary result is the last phase result whose operation is
 * "optimizer.improve" (improve workloads) or "engine.run" (all others). Its
 * value becomes `subjectPatch` for improve workloads and `result` for attempt
 * workloads; metrics and cases are only reported for attempts. File changes
 * come from the subject patch, else from the payload's string entries, else
 * from every collected file.
 *
 * @param {string} root - Workspace root.
 * @param {object} workload - Workload that was run.
 * @param {object} [options] - `startedAt`, `exitCode`, `error`, `usage` and
 *   `phaseResults`.
 * @returns {Promise<object>} Workload result including timestamps and
 *   `durationMs`.
 */
async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
    const path = await importNodeModule(nodeBuiltin("path"));
    const traceFiles = await readRuntimeTraceFiles(root, workload);
    const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root)));
    const recordedExitCode = await readOptionalNumber(path.join(outputDir(root), "exit_code"));
    const startedAt = options.startedAt ?? new Date().toISOString();
    const finishedAt = new Date().toISOString();
    const purpose = readWorkloadExecutionPurpose(workload);
    const isImprove = purpose === "improve";
    const isAttempt = purpose === "attempt";
    const phaseResults = options.phaseResults ?? [];
    const primaryOperation = isImprove ? "optimizer.improve" : "engine.run";
    // Walk backwards so the most recent matching phase wins.
    let primaryResult;
    for (let index = phaseResults.length - 1; index >= 0; index -= 1) {
        if (phaseResults[index].operation === primaryOperation) {
            primaryResult = phaseResults[index];
            break;
        }
    }
    const payload = jsonRecord(primaryResult?.value);
    const usage = mergeUsageSummaries([
        options.usage,
        ...phaseResults.map(adapterOperationUsageSummary),
    ]);
    const metrics = normalizeRewardMetrics(payload.metrics);
    const cases = normalizeRewardCases(payload.cases);
    const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
    const subjectPatch = isImprove ? primaryResult?.value : undefined;
    const engineResult = isAttempt ? primaryResult?.value : undefined;
    let fileChanges = subjectPatch?.fileChanges;
    if (fileChanges === null || fileChanges === undefined) {
        fileChanges = Array.isArray(payload.fileChanges)
            ? payload.fileChanges.filter((entry) => typeof entry === "string")
            : files.map((file) => file.path);
    }
    const outcome = { files, fileChanges };
    if (subjectPatch) {
        outcome.subjectPatch = subjectPatch;
    }
    if (engineResult) {
        outcome.result = engineResult;
    }
    if (isAttempt && metrics) {
        outcome.metrics = metrics;
    }
    if (isAttempt && cases) {
        outcome.cases = cases;
    }
    if (typeof payload.summary === "string") {
        outcome.summary = payload.summary;
    }
    else if (primaryResult?.summary !== undefined) {
        outcome.summary = primaryResult.summary;
    }
    if (payload.feedback !== undefined) {
        outcome.feedback = payload.feedback;
    }
    else if (primaryResult?.feedback !== undefined) {
        outcome.feedback = primaryResult.feedback;
    }
    if (usage) {
        outcome.usage = usage;
    }
    outcome.exitCode = options.exitCode ?? recordedExitCode ?? 0;
    if (options.error) {
        outcome.error = options.error;
    }
    outcome.startedAt = startedAt;
    outcome.finishedAt = finishedAt;
    outcome.durationMs = Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt));
    return outcome;
}
|
|
1834
|
+
/**
 * Reads the trace files a job wrote below
 * `<output>/.workbench/traces/<job id>` and re-roots their paths.
 *
 * @param {string} root - Workspace root.
 * @param {object} workload - Workload; reads `job.id` and `job.runId`.
 * @returns {Promise<Array<object>>} The files, with `path` rewritten to
 *   `<trace phase directory>/<job id>/<original path>` and normalized.
 */
async function readRuntimeTraceFiles(root, workload) {
    const path = await importNodeModule(nodeBuiltin("path"));
    const jobId = workload.job.id;
    const sourceDirectory = path.join(outputDir(root), ".workbench", "traces", jobId);
    const purpose = readWorkloadExecutionPurpose(workload);
    const targetDirectory = workbenchTracePhaseDirectory({
        sequence: 1,
        runId: workload.job.runId,
        phase: purpose,
    });
    const found = await readSurfaceFiles(sourceDirectory);
    return found.map((file) => {
        const rerooted = normalizeRelativePath(`${targetDirectory}/${workload.job.id}/${file.path}`);
        return { ...file, path: rerooted };
    });
}
|
|
1848
|
+
/**
 * Drops output files whose path is a workbench-internal output path.
 *
 * @param {Array<{path: string}>} files - Files read from the output directory.
 * @returns {Array<{path: string}>} The remaining files, in their given order.
 */
function filterRuntimeOutputFiles(files) {
    const isPublicOutput = ({ path }) => !isWorkbenchInternalOutputPath(path);
    return files.filter(isPublicOutput);
}
|
|
1851
|
+
/**
 * Wraps a phase command in a shell script that captures its output.
 *
 * The script creates the output directory, runs `command` in a subshell with
 * stdout and stderr redirected to `<prefix>_stdout.log` / `<prefix>_stderr.log`,
 * records the exit status in `<prefix>_exit_code`, and exits 0 when the status
 * is one of `okExitCodes`. Otherwise it echoes the first 120 lines of stderr
 * and the first 40 lines of stdout to stderr and exits with the status.
 *
 * An empty `okExitCodes` list means no status short-circuits to success; the
 * script then ends with `exit "$status"` for every status. The success step is
 * left out in that case, because an empty list entry would join into `; ;`,
 * which the shell rejects as a syntax error.
 *
 * @param {string} root - Workspace root.
 * @param {string} command - Shell command to run (inserted verbatim).
 * @param {string} [prefix] - File name prefix for the captured output.
 * @param {Array<number>} [okExitCodes] - Exit codes treated as success.
 * @returns {string} The script, as a single `; `-joined command line.
 */
function createHostedWorkloadShellCommand(root, command, prefix = "", okExitCodes = [0]) {
    const outputPrefix = prefix ? `${prefix}_` : "";
    const okExpression = [...new Set(okExitCodes)]
        .sort((left, right) => left - right)
        .map((code) => `[ "$status" -eq ${code} ]`)
        .join(" || ");
    const output = quoteShellArg(outputDir(root));
    const stdout = quoteShellArg(`${outputDir(root)}/${outputPrefix}stdout.log`);
    const stderr = quoteShellArg(`${outputDir(root)}/${outputPrefix}stderr.log`);
    const exitCodeFile = quoteShellArg(`${outputDir(root)}/${outputPrefix}exit_code`);
    const steps = [
        `mkdir -p ${output}`,
        `(${command}) > ${stdout} 2> ${stderr}`,
        "status=$?",
        `printf '%s\\n' "$status" > ${exitCodeFile}`,
    ];
    if (okExpression) {
        steps.push(`{ ${okExpression}; } && exit 0`);
    }
    steps.push(`if [ -s ${stderr} ]; then sed -n '1,120p' ${stderr} >&2; fi`, `if [ -s ${stdout} ]; then sed -n '1,40p' ${stdout} >&2; fi`, 'exit "$status"');
    return steps.join("; ");
}
|
|
1871
|
+
/**
 * Deletes the adapter operation result file left in the output directory so
 * the next phase starts without one. A missing file and removal errors are
 * both ignored.
 *
 * @param {string} root - Workspace root.
 * @param {object} _phase - Unused.
 * @returns {Promise<void>}
 */
async function resetHostedWorkloadPhaseOutput(root, _phase) {
    const fs = await importNodeModule(nodeBuiltin("fs/promises"));
    const staleResultPath = workbenchAdapterOperationResultPath(outputDir(root));
    try {
        await fs.rm(staleResultPath, { force: true });
    }
    catch {
        // Best effort: a leftover file is handled by the phase itself.
    }
}
|
|
1877
|
+
/**
 * Serializes the adapter request for one phase to `<root>/.workbench/request.json`
 * (pretty-printed JSON followed by a newline) and returns that path.
 *
 * @param {string} root - Workspace root.
 * @param {object} workload - Workload being executed (spec, job, subject/case identifiers).
 * @param {object} execution - Execution record; supplies `id` and the default adapter.
 * @param {object} phase - Phase being run; `phase.adapter` overrides `execution.adapter`.
 * @param {unknown} auth - Optional top-level auth payload; omitted when `undefined`.
 * @param {unknown} manifests - Passed through to the adapter helper functions.
 * @returns {Promise<string>} Path of the written request file.
 */
async function writeWorkbenchAdapterRequest(root, workload, execution, phase, auth, manifests) {
    const modules = await Promise.all([
        importNodeModule(nodeBuiltin("fs/promises")),
        importNodeModule(nodeBuiltin("path")),
    ]);
    const fs = modules[0];
    const path = modules[1];
    const requestPath = path.join(root, ".workbench", "request.json");
    await fs.mkdir(path.dirname(requestPath), { recursive: true });

    const casePrompt = workload.engineCaseSpec?.prompt;
    const adapter = phase.adapter ?? execution.adapter;
    const subjectCommand = adapterProtocolCommandSpec(workload.spec.run, "subject.run", manifests).command;

    // Keys are added in the same order the request has always been serialized in.
    const request = {
        protocol: "workbench.adapter.v3",
        id: execution.id,
        jobId: workload.job.id,
        operation: phase.operation,
    };

    const invocation = {
        use: adapter.use,
        with: adapterConfigRecord(adapter, manifests),
    };
    if (adapter.auth !== undefined) {
        invocation.auth = adapter.auth;
    }
    request.invocation = invocation;

    if (auth !== undefined) {
        request.auth = auth;
    }

    const context = {
        benchmark: {
            name: workload.spec.benchmark.name,
            description: workload.spec.benchmark.description,
        },
        subject: {
            id: workload.subjectId,
            path: workload.spec.subject.files.path,
            run: { ...workload.spec.run, command: subjectCommand },
        },
    };
    if (workload.spec.optimizer) {
        context.optimizer = { edits: [...workload.spec.optimizer.edits] };
    }
    context.attempt = {
        attemptIndex: workload.attemptIndex,
        sampleIndex: workload.sampleIndex,
        caseId: workload.caseId,
    };
    const caseContext = { id: workload.caseId };
    if (casePrompt) {
        caseContext.prompt = casePrompt;
    }
    context.case = caseContext;
    request.context = context;

    const paths = {
        workspace: root,
        cwd: root,
        output: outputDir(root),
        result: workbenchAdapterOperationResultPath(outputDir(root)),
        subject: subjectDir(root),
    };
    if (workload.engineCaseSpec) {
        paths.case = caseDir(root);
    }
    paths.traces = tracesDir(root);
    // The engine-private directory is only advertised to engine phases.
    if (phase.kind === "engine") {
        paths.enginePrivate = runtimeEnginePrivateDir(root);
    }
    paths.logs = runtimeLogsDir(root);
    request.paths = paths;

    const serialized = JSON.stringify(request, null, 2);
    await fs.writeFile(requestPath, `${serialized}\n`);
    return requestPath;
}
|
|
1938
|
+
/**
 * Returns the optimizer's declared `edits`, or an empty array when the spec
 * has no optimizer or no edits.
 *
 * @param {object} spec - Spec with an optional `optimizer.edits` list.
 * @returns {Array} The declared edits (same array instance) or `[]`.
 */
function optimizerEdits(spec) {
    const declared = spec.optimizer?.edits;
    return declared ?? [];
}
|
|
1941
|
+
/**
 * Like `optimizerEdits`, but rejects a spec that declares no edits.
 *
 * @param {object} spec - Spec with an optional `optimizer.edits` list.
 * @returns {Array} The non-empty list of edits.
 * @throws {Error} When the list of edits is empty.
 */
function requireOptimizerEdits(spec) {
    const declared = optimizerEdits(spec);
    if (declared.length !== 0) {
        return declared;
    }
    throw new Error("Optimizer YAML must declare at least one entry in edits.");
}
|
|
1948
|
+
/**
 * Builds the environment for a hosted workload phase.
 *
 * Starts from the string-valued entries of `process.env`, drops every
 * inherited `WORKBENCH_*` variable, rewrites `PATH` so system and runtime bin
 * directories come first, sets the adapter request/output/result variables,
 * and finally lets `adapterEnv` override anything.
 *
 * @param {string} root - Workspace root.
 * @param {string} adapterRequestPath - Value for `WORKBENCH_ADAPTER_REQUEST`.
 * @param {Record<string, string>} [adapterEnv={}] - Overrides applied last.
 * @returns {Record<string, string>} The composed environment.
 */
function createHostedWorkloadPhaseEnv(root, adapterRequestPath, adapterEnv = {}) {
    const env = {};
    for (const [name, value] of Object.entries(process.env)) {
        // Inherited WORKBENCH_* values must not leak into the phase.
        if (typeof value !== "string" || name.startsWith("WORKBENCH_")) {
            continue;
        }
        env[name] = value;
    }
    const searchPath = [
        "/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin",
        "/workbench-runtime/node_modules/.bin",
        "/workbench-runtime/products/workbench/node_modules/.bin",
    ];
    if (process.env.PATH) {
        searchPath.push(process.env.PATH);
    }
    env.PATH = searchPath.join(":");
    const output = outputDir(root);
    env.WORKBENCH_ADAPTER_REQUEST = adapterRequestPath;
    env.WORKBENCH_OUTPUT = output;
    env.WORKBENCH_RESULT = workbenchAdapterOperationResultPath(outputDir(root));
    return Object.assign(env, adapterEnv);
}
|
|
1974
|
+
/**
 * Reads the execution purpose of the workload's job and accepts only
 * `"improve"` or `"attempt"`.
 *
 * @param {object} workload - Workload whose `job` carries the purpose.
 * @returns {"improve"|"attempt"} The supported purpose.
 * @throws {Error} When the job has any other (or no) purpose.
 */
function readWorkloadExecutionPurpose(workload) {
    const purpose = workbenchExecutionPurpose(workload.job);
    switch (purpose) {
        case "improve":
        case "attempt":
            return purpose;
        default:
            throw new Error(`Execution job ${workload.job.id} is missing a supported execution purpose.`);
    }
}
|
|
1981
|
+
/**
 * Returns the workload's engine case, failing loudly when it is absent.
 *
 * @param {object} workload - Workload expected to carry `engineCase`.
 * @param {string} label - Prefix used in the error message.
 * @returns {*} The truthy `workload.engineCase`.
 * @throws {Error} When `workload.engineCase` is falsy.
 */
function requireWorkloadEngineCase(workload, label) {
    const { engineCase } = workload;
    if (engineCase) {
        return engineCase;
    }
    throw new Error(`${label} workload is missing an engine case.`);
}
|
|
1987
|
+
/** @returns {string} `<root>/input/subject`. */
function subjectDir(root) {
    const base = inputDir(root);
    return `${base}/subject`;
}

/** @returns {string} `<root>/input/case`. */
function caseDir(root) {
    const base = inputDir(root);
    return `${base}/case`;
}

/** @returns {string} `<root>/input/traces`. */
function tracesDir(root) {
    const base = inputDir(root);
    return `${base}/traces`;
}

/** @returns {string} `<root>/input`. */
function inputDir(root) {
    return `${root}/input`;
}

/** @returns {string} `<root>/output`. */
function outputDir(root) {
    return `${root}/output`;
}

/** @returns {string} `<root>/private`. */
function runtimePrivateDir(root) {
    return `${root}/private`;
}

/** @returns {string} `<root>/private/engine`. */
function runtimeEnginePrivateDir(root) {
    const base = runtimePrivateDir(root);
    return `${base}/engine`;
}

/** @returns {string} `<root>/logs`. */
function runtimeLogsDir(root) {
    return `${root}/logs`;
}

/** @returns {string} `<root>/logs/agent`. */
function runtimeLogsAgentDir(root) {
    const base = runtimeLogsDir(root);
    return `${base}/agent`;
}

/** @returns {string} `<root>/logs/verifier`. */
function runtimeLogsVerifierDir(root) {
    const base = runtimeLogsDir(root);
    return `${base}/verifier`;
}
|
|
2017
|
+
/**
 * Rejects any file whose normalized path falls inside a runtime-reserved
 * workspace location.
 *
 * @param {Array<{path: string}>} files - Files about to be written.
 * @param {string} label - Subject of the error message.
 * @throws {Error} Listing every offending normalized path.
 */
function assertMutableWorkspaceFiles(files, label) {
    const offending = [];
    const normalizedPaths = files.map((file) => normalizeRelativePath(file.path));
    for (const normalized of normalizedPaths) {
        if (isRuntimeReservedWorkspacePath(normalized)) {
            offending.push(normalized);
        }
    }
    if (offending.length === 0) {
        return;
    }
    throw new Error(`${label} cannot target runtime-reserved workspace paths: ${offending.join(", ")}.`);
}
|
|
2025
|
+
/**
 * Tells whether a normalized relative path is, or lies beneath, one of the
 * top-level directories reserved for the runtime
 * (`.workbench`, `input`, `output`, `logs`, `private`).
 *
 * @param {string} normalizedPath - Relative path using `/` separators.
 * @returns {boolean} True when the path is reserved.
 */
function isRuntimeReservedWorkspacePath(normalizedPath) {
    const reservedRoots = [".workbench", "input", "output", "logs", "private"];
    return reservedRoots.some((reserved) => normalizedPath === reserved ||
        normalizedPath.startsWith(`${reserved}/`));
}
|
|
2037
|
+
/**
 * Writes each file under `root`, creating parent directories as needed.
 * Content is decoded as base64 when `file.encoding === "base64"`, otherwise
 * as UTF-8. Executable files get mode 0o755; a failing chmod is ignored.
 *
 * @param {string} root - Directory the relative paths are resolved against.
 * @param {Array<{path: string, content: string, encoding?: string, executable?: boolean}>} files
 * @returns {Promise<void>}
 */
async function writeSurfaceFiles(root, files) {
    const fs = await importNodeModule(nodeBuiltin("fs/promises"));
    const path = await importNodeModule(nodeBuiltin("path"));
    for (const file of files) {
        const destination = path.join(root, normalizeRelativePath(file.path));
        await fs.mkdir(path.dirname(destination), { recursive: true });
        const encoding = file.encoding === "base64" ? "base64" : "utf8";
        await fs.writeFile(destination, Buffer.from(file.content, encoding));
        if (!file.executable) {
            continue;
        }
        await fs.chmod(destination, 0o755).catch(() => undefined);
    }
}
|
|
2052
|
+
/**
 * Recursively snapshots every regular file below `root`.
 *
 * Each entry carries the normalized relative path, the content (UTF-8 text,
 * or base64 when the bytes are not valid UTF-8) and whether any execute bit
 * is set. Directories that cannot be listed are treated as empty; entries
 * that are neither directories nor regular files are skipped.
 *
 * @param {string} root - Directory to snapshot.
 * @returns {Promise<Array<{path: string, kind: string, encoding: string, content: string, executable: boolean}>>}
 *   Snapshot sorted by path using `localeCompare`.
 */
async function readSurfaceFiles(root) {
    const fs = await importNodeModule(nodeBuiltin("fs/promises"));
    const path = await importNodeModule(nodeBuiltin("path"));
    const utf8Decoder = new TextDecoder("utf-8", { fatal: true });
    const snapshot = [];

    const snapshotFile = async (absolutePath) => {
        const relativePath = normalizeRelativePath(path.relative(root, absolutePath).replace(/\\/gu, "/"));
        const bytes = await fs.readFile(absolutePath);
        const stats = await fs.stat(absolutePath);
        const { encoding, content } = encodeSurfaceSnapshotContent(bytes, utf8Decoder);
        return {
            path: relativePath,
            kind: encoding === "base64" ? "binary" : "text",
            encoding,
            content,
            executable: (stats.mode & 0o111) !== 0,
        };
    };

    const visit = async (directory) => {
        const listing = fs.readdir(directory, { withFileTypes: true });
        const entries = await listing.catch(() => []);
        for (const entry of entries) {
            const absolutePath = path.join(directory, entry.name);
            if (entry.isDirectory()) {
                await visit(absolutePath);
            }
            else if (entry.isFile()) {
                snapshot.push(await snapshotFile(absolutePath));
            }
        }
    };

    await visit(root);
    return snapshot.sort((left, right) => left.path.localeCompare(right.path));
}
|
|
2086
|
+
/**
 * Encodes file bytes for a snapshot: as text when the decoder accepts them,
 * otherwise as base64.
 *
 * @param {Buffer} body - Raw file bytes.
 * @param {TextDecoder} utf8Decoder - Decoder whose `decode` throws on invalid input.
 * @returns {{encoding: "utf8"|"base64", content: string}}
 */
function encodeSurfaceSnapshotContent(body, utf8Decoder) {
    let text;
    try {
        text = utf8Decoder.decode(body);
    }
    catch {
        // Not decodable as text: fall back to a lossless binary encoding.
        return { encoding: "base64", content: body.toString("base64") };
    }
    return { encoding: "utf8", content: text };
}
|
|
2100
|
+
/**
 * Keeps only the finite numeric entries of a metrics record.
 *
 * @param {unknown} value - Candidate metrics object.
 * @returns {Record<string, number>|undefined} The filtered record, or
 *   `undefined` when `value` is not a plain (non-array) object or no finite
 *   numeric entry remains.
 */
function normalizeRewardMetrics(value) {
    const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isRecord) {
        return undefined;
    }
    const finite = {};
    for (const [name, candidate] of Object.entries(value)) {
        if (typeof candidate !== "number" || !Number.isFinite(candidate)) {
            continue;
        }
        finite[name] = candidate;
    }
    return Object.keys(finite).length === 0 ? undefined : finite;
}
|
|
2112
|
+
/**
 * Normalizes a list of per-case reward records.
 *
 * Entries that are not plain objects or lack a non-empty string `id` are
 * dropped. Within a case, criteria are kept only when they have a non-empty
 * `criterion_id`, a finite `score` and a resolvable `pass` flag (explicit
 * boolean, otherwise `score >= 0.5`).
 *
 * @param {unknown} value - Candidate array of case records.
 * @returns {Array<object>|undefined} Normalized cases, or `undefined` when
 *   `value` is not an array or nothing survives normalization.
 */
function normalizeRewardCases(value) {
    if (!Array.isArray(value)) {
        return undefined;
    }
    const isPlainRecord = (candidate) => Boolean(candidate) && typeof candidate === "object" && !Array.isArray(candidate);
    const onlyStrings = (entries) => entries.filter((entry) => typeof entry === "string");

    // Returns the normalized criterion, or null when it must be discarded.
    const toCriterion = (criterion) => {
        if (!isPlainRecord(criterion)) {
            return null;
        }
        const criterionId = typeof criterion.criterion_id === "string" ? criterion.criterion_id : "";
        const label = typeof criterion.label === "string" ? criterion.label : criterionId;
        const score = typeof criterion.score === "number" && Number.isFinite(criterion.score)
            ? criterion.score
            : undefined;
        let pass;
        if (typeof criterion.pass === "boolean") {
            pass = criterion.pass;
        }
        else if (score !== undefined) {
            pass = score >= 0.5;
        }
        if (!criterionId || score === undefined || pass === undefined) {
            return null;
        }
        const errors = Array.isArray(criterion.errors) ? onlyStrings(criterion.errors) : [];
        const rationale = typeof criterion.rationale === "string" ? criterion.rationale.trim() : "";
        const normalized = { criterion_id: criterionId, label, score, pass };
        if (errors.length > 0) {
            normalized.errors = errors;
        }
        if (rationale) {
            normalized.rationale = rationale;
        }
        return normalized;
    };

    const cases = [];
    for (const entry of value) {
        if (!isPlainRecord(entry)) {
            continue;
        }
        const id = typeof entry.id === "string" ? entry.id : "";
        if (!id) {
            continue;
        }
        const metrics = normalizeRewardMetrics(entry.metrics) ?? {};
        const hasKnownStatus = entry.status === "completed" || entry.status === "error";
        const criteria = Array.isArray(entry.criteria)
            ? entry.criteria
                .map((criterion) => toCriterion(criterion))
                .filter((criterion) => criterion !== null)
            : undefined;

        // Optional keys are added in the order consumers have always seen them.
        const normalizedCase = { id };
        if (typeof entry.label === "string") {
            normalizedCase.label = entry.label;
        }
        if (typeof entry.split === "string") {
            normalizedCase.split = entry.split;
        }
        if (hasKnownStatus) {
            normalizedCase.status = entry.status;
        }
        if (typeof entry.durationMs === "number" && Number.isFinite(entry.durationMs)) {
            normalizedCase.durationMs = entry.durationMs;
        }
        normalizedCase.metrics = metrics;
        if (isPlainRecord(entry.source)) {
            normalizedCase.source = entry.source;
        }
        if (entry.feedback !== undefined) {
            normalizedCase.feedback = entry.feedback;
        }
        if (criteria && criteria.length > 0) {
            normalizedCase.criteria = criteria;
        }
        cases.push(normalizedCase);
    }
    return cases.length > 0 ? cases : undefined;
}
|
|
2199
|
+
/**
 * Reads a file and parses its trimmed content as a base-10 integer.
 *
 * @param {string} pathname - File to read.
 * @returns {Promise<number|undefined>} The parsed integer, or `undefined`
 *   when the file does not exist (`ENOENT`) or does not start with an integer.
 * @throws Any read error other than `ENOENT`.
 */
async function readOptionalNumber(pathname) {
    const fs = await importNodeModule(nodeBuiltin("fs/promises"));
    let raw;
    try {
        raw = await fs.readFile(pathname, "utf8");
    }
    catch (error) {
        if (error.code === "ENOENT") {
            return undefined;
        }
        throw error;
    }
    const parsed = Number.parseInt(String(raw).trim(), 10);
    if (!Number.isFinite(parsed)) {
        return undefined;
    }
    return parsed;
}
|
|
2212
|
+
/**
 * Timeout, in milliseconds, for a workload described by `spec`: the timeout
 * of the environment version resolved for that spec.
 *
 * @param {object} spec - Workload spec.
 * @returns {number} Timeout in milliseconds.
 */
export function workloadTimeoutMs(spec) {
    const version = environmentVersionForSpec(spec);
    return environmentVersionTimeoutMs(version);
}
|
|
2215
|
+
/**
 * Finds the first environment version whose image reference matches `image`
 * once both are normalized with `normalizeDockerImageRef`.
 *
 * @param {string} image - Image reference, with or without the `docker://` scheme.
 * @param {Array<{imageRef: string}>} versions - Candidate versions.
 * @returns {object|null} The matching version, or `null` when none matches.
 */
export function findEnvironmentVersionForImage(image, versions) {
    const wanted = normalizeDockerImageRef(image);
    const match = versions.find((candidate) => {
        const candidateRef = normalizeDockerImageRef(candidate.imageRef);
        return candidateRef === wanted;
    });
    return match ?? null;
}
|
|
2219
|
+
/**
 * Ensures an image reference carries the `docker://` scheme.
 *
 * @param {string} image - Image reference.
 * @returns {string} `image` unchanged when it already starts with
 *   `docker://`, otherwise `docker://` + `image`.
 */
export function normalizeDockerImageRef(image) {
    if (image.startsWith("docker://")) {
        return image;
    }
    return `docker://${image}`;
}
|
|
2222
|
+
/**
 * Converts an environment version's `spec.resources.timeoutMinutes` into
 * milliseconds.
 *
 * The value defaults to 30 minutes when the version or the setting is
 * missing, and is clamped to at least one minute. A value that is not a
 * number (NaN) also falls back to the default, so the result is never NaN.
 *
 * @param {{spec: {resources: {timeoutMinutes?: number}}}|null|undefined} version
 * @returns {number} Timeout in milliseconds.
 */
export function environmentVersionTimeoutMs(version) {
    const defaultTimeoutMinutes = 30;
    const configured = version?.spec.resources.timeoutMinutes ?? defaultTimeoutMinutes;
    const clamped = Math.max(1, configured);
    // Math.max propagates NaN; a NaN timeout would silently disable or break timers downstream.
    const timeoutMinutes = Number.isNaN(clamped) ? defaultTimeoutMinutes : clamped;
    return timeoutMinutes * 60 * 1000;
}
|
|
2226
|
+
/**
 * Derives a process exit code from a thrown value.
 *
 * @param {unknown} error - Value caught from a failed command.
 * @returns {number} `error.code` when it is a number; 124 when the value is
 *   an object with a `signal` property; otherwise 1.
 */
function readExitCode(error) {
    const isObject = Boolean(error) && typeof error === "object";
    if (!isObject) {
        return 1;
    }
    if ("code" in error && typeof error.code === "number") {
        return error.code;
    }
    return "signal" in error ? 124 : 1;
}
|
|
2238
|
+
/**
 * Produces a failed copy of `job`.
 *
 * @param {object} job - Job record to copy; not mutated.
 * @param {string} startedAt - Start timestamp to record.
 * @param {unknown} error - Failure; `Error#message` is used, anything else is stringified.
 * @param {string} [finishedAt] - Finish timestamp; defaults to the current time in ISO format.
 * @param {{files?: unknown, fileChanges?: unknown}} [result] - Partial result whose
 *   truthy `files` / `fileChanges` are carried into the failure output.
 * @returns {object} The job with `status: "failed"`, an attempt of at least 1,
 *   timestamps, the error message and an `{ ok: false, … }` output.
 */
function failWorkbenchRunJob(job, startedAt, error, finishedAt = new Date().toISOString(), result) {
    const message = error instanceof Error ? error.message : String(error);
    const output = { ok: false, error: message };
    if (result?.files) {
        output.files = result.files;
    }
    if (result?.fileChanges) {
        output.fileChanges = result.fileChanges;
    }
    const failure = {
        status: "failed",
        attempt: Math.max(1, job.attempt),
        startedAt,
        finishedAt,
        updatedAt: finishedAt,
        error: message,
        output,
    };
    return { ...job, ...failure };
}
|
|
2257
|
+
/**
 * Builds a completed evaluation sample record from a finished workload.
 *
 * @param {object} args
 * @param {object} args.workload - Workload outcome (`result.score`, optional
 *   `metrics`, `cases`, `summary`, `feedback`, `usage`).
 * @param {string} args.caseId - Case identifier; used in the sample id.
 * @param {number} args.sampleIndex - Zero-based sample index.
 * @param {string} args.subjectId - Subject identifier.
 * @param {string} args.startedAt - Start timestamp.
 * @param {string} args.finishedAt - Finish timestamp.
 * @param {number} [args.durationMs] - Explicit duration; otherwise derived
 *   from the two timestamps and floored at 0.
 * @returns {object} The sample record with `status: "completed"`.
 * @throws {Error} When the workload has no finite numeric `result.score`.
 */
function evaluateSample(args) {
    const { workload } = args;
    const durationMs = args.durationMs ??
        Math.max(0, Date.parse(args.finishedAt) - Date.parse(args.startedAt));
    const sampleScore = workload.result?.score;
    if (typeof sampleScore !== "number" || !Number.isFinite(sampleScore)) {
        throw new Error("Evaluation sample requires an engine result with a finite numeric score.");
    }
    const cases = workload.cases?.length ? workload.cases : undefined;
    // Work on a copy: back-filling `score` must not modify the caller's workload.metrics.
    const metrics = { ...workload.metrics };
    if (metrics.score === undefined) {
        metrics.score = sampleScore;
    }
    const feedback = {
        ...(workload.summary !== undefined ? { summary: workload.summary } : {}),
        ...(workload.feedback !== undefined ? { detail: workload.feedback } : {}),
    };
    const usage = completeUsageSummary(workload.usage);
    return {
        // Sample ids are one-based and zero-padded to three digits.
        id: `${args.caseId}__sample_${String(args.sampleIndex + 1).padStart(3, "0")}`,
        index: args.sampleIndex,
        subject: {
            id: args.subjectId,
            kind: "subject",
            label: args.subjectId,
        },
        status: "completed",
        startedAt: args.startedAt,
        finishedAt: args.finishedAt,
        durationMs,
        metrics,
        ...(usage ? { usage } : {}),
        ...(cases ? { cases } : {}),
        feedback,
    };
}
|
|
2298
|
+
/**
 * Validates and normalizes the output of a sample job.
 *
 * @param {unknown} value - Raw job output.
 * @param {Array<object>} [fallbackFiles=[]] - Files used (shallow-copied)
 *   when the output carries no valid snapshot files.
 * @returns {object|null} `{ subjectId, attemptIndex, sample, fileChanges, files, traces }`,
 *   or `null` when the output is not a successful record with a string
 *   `subjectId`, a valid sample and a finite numeric `attemptIndex`.
 */
function normalizeSampleJobOutput(value, fallbackFiles = []) {
    const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isRecord) {
        return null;
    }
    if (value.ok !== true || typeof value.subjectId !== "string") {
        return null;
    }
    const snapshotFiles = Array.isArray(value.files)
        ? value.files.filter(isSurfaceSnapshotFile)
        : [];
    if (!isEvaluationSampleRecord(value.sample)) {
        return null;
    }
    const { attemptIndex } = value;
    if (typeof attemptIndex !== "number" || !Number.isFinite(attemptIndex)) {
        return null;
    }
    const onlyStrings = (entries) => entries.filter((entry) => typeof entry === "string");
    const files = snapshotFiles.length > 0
        ? snapshotFiles
        : fallbackFiles.map((file) => ({ ...file }));
    return {
        subjectId: value.subjectId,
        attemptIndex,
        sample: value.sample,
        fileChanges: Array.isArray(value.fileChanges) ? onlyStrings(value.fileChanges) : [],
        files,
        // Explicit traces win; otherwise they are derived from the files.
        traces: Array.isArray(value.traces) ? onlyStrings(value.traces) : traceFilePaths(files),
    };
}
|
|
2333
|
+
/**
 * Pairs each job that produced a valid sample output with that output.
 *
 * @param {{jobs: Array<object>}} args
 * @returns {Array<{jobs: object[], output: object}>} One entry per job whose
 *   output normalizes successfully; other jobs are dropped.
 */
function normalizeEvaluationSampleOutputs(args) {
    const normalized = [];
    for (const job of args.jobs) {
        const output = normalizeSampleJobOutput(job.output);
        if (output) {
            normalized.push({ jobs: [job], output });
        }
    }
    return normalized;
}
|
|
2339
|
+
/**
 * Arithmetic mean of the finite numbers in `values`, rounded to six decimals.
 *
 * @param {Array<unknown>} values - Mixed values; non-numbers and non-finite numbers are ignored.
 * @returns {number|undefined} The rounded mean, or `undefined` when there is no finite number.
 */
function meanFinite(values) {
    let total = 0;
    let count = 0;
    for (const value of values) {
        if (typeof value === "number" && Number.isFinite(value)) {
            total += value;
            count += 1;
        }
    }
    if (count === 0) {
        return undefined;
    }
    return Number((total / count).toFixed(6));
}
|
|
2346
|
+
/**
 * Picks the earliest timestamp among the values `Date.parse` can read.
 *
 * @param {Array<unknown>} values - Candidate timestamps; unparseable ones are ignored.
 * @returns {*} The earliest value as given (first one wins on ties), or `null` when none parses.
 */
function minIsoTimestamp(values) {
    let found = false;
    let earliest;
    let earliestMs;
    for (const value of values) {
        const ms = Date.parse(value);
        if (!Number.isFinite(ms)) {
            continue;
        }
        if (!found || ms < earliestMs) {
            found = true;
            earliest = value;
            earliestMs = ms;
        }
    }
    return found ? earliest : null;
}
|
|
2352
|
+
/**
 * Picks the latest timestamp among the values `Date.parse` can read.
 *
 * @param {Array<unknown>} values - Candidate timestamps; unparseable ones are ignored.
 * @returns {*} The latest value as given (first one wins on ties), or `null` when none parses.
 */
function maxIsoTimestamp(values) {
    let found = false;
    let latest;
    let latestMs;
    for (const value of values) {
        const ms = Date.parse(value);
        if (!Number.isFinite(ms)) {
            continue;
        }
        if (!found || ms > latestMs) {
            found = true;
            latest = value;
            latestMs = ms;
        }
    }
    return found ? latest : null;
}
|
|
2358
|
+
/**
 * Attaches usage to a sample, preferring the usage recorded on the attempt
 * job's output and falling back to the sample's own usage.
 *
 * @param {object} sample - Sample record; returned unchanged when no usage is available.
 * @param {unknown} _jobs - Unused.
 * @param {{output: unknown}} attemptJob - Job whose output may carry `usage`.
 * @returns {object} The sample, or a copy with `usage` set.
 */
function withJobUsage(sample, _jobs, attemptJob) {
    const jobUsage = normalizeUsageSummary(jsonRecord(attemptJob.output).usage);
    const usage = jobUsage ?? completeUsageSummary(sample.usage);
    return usage ? { ...sample, usage } : sample;
}
|
|
2369
|
+
/**
 * Validates and normalizes the output of a subject-revision job.
 *
 * @param {unknown} value - Raw job output.
 * @returns {object|null} `{ subjectId, attemptIndex, baseId, prompt?, fileChanges, files, traces, usage? }`,
 *   or `null` when the output is not a successful record with a string
 *   `subjectId` and a finite numeric `attemptIndex`.
 */
function normalizeSubjectRevisionJobOutput(value) {
    const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isRecord) {
        return null;
    }
    if (value.ok !== true || typeof value.subjectId !== "string") {
        return null;
    }
    const files = Array.isArray(value.files)
        ? value.files.filter(isSurfaceSnapshotFile)
        : [];
    const { attemptIndex } = value;
    if (typeof attemptIndex !== "number" || !Number.isFinite(attemptIndex)) {
        return null;
    }
    const usage = normalizeUsageSummary(value.usage);
    const onlyStrings = (entries) => entries.filter((entry) => typeof entry === "string");
    const hasBaseId = typeof value.baseId === "string" && value.baseId.length > 0;

    const normalized = {
        subjectId: value.subjectId,
        attemptIndex,
        baseId: hasBaseId ? value.baseId : null,
    };
    if (typeof value.prompt === "string") {
        normalized.prompt = value.prompt;
    }
    normalized.fileChanges = Array.isArray(value.fileChanges) ? onlyStrings(value.fileChanges) : [];
    normalized.files = files;
    // Explicit traces win; otherwise they are derived from the files.
    normalized.traces = Array.isArray(value.traces) ? onlyStrings(value.traces) : traceFilePaths(files);
    if (usage) {
        normalized.usage = usage;
    }
    return normalized;
}
|
|
2402
|
+
/**
 * Builds error samples for every (case, sample) group of jobs that did not
 * complete.
 *
 * @param {Array<object>} jobs - Jobs to inspect.
 * @param {string} subjectId - Subject the samples belong to.
 * @param {number} attemptIndex - Forwarded to the per-group builder.
 * @param {Set<string>} completedSampleKeys - Group keys that already have a completed sample.
 * @returns {Array<object>} One error sample per remaining group, in first-seen order.
 */
function errorEvaluationSamplesFromJobs(jobs, subjectId, attemptIndex, completedSampleKeys) {
    const jobsByKey = new Map();
    for (const job of jobs) {
        const key = evaluationSampleGroupKeyFromJob(job);
        if (!key || completedSampleKeys.has(key)) {
            continue;
        }
        const bucket = jobsByKey.get(key);
        if (bucket) {
            bucket.push(job);
        }
        else {
            jobsByKey.set(key, [job]);
        }
    }
    const samples = [];
    for (const group of jobsByKey.values()) {
        const sample = errorEvaluationSampleFromJobGroup(group, subjectId, attemptIndex);
        if (sample !== null) {
            samples.push(sample);
        }
    }
    return samples;
}
|
|
2415
|
+
/**
 * Builds one `status: "error"` sample from the jobs of a (case, sample) group.
 *
 * The case id and sample index come from the first job's input; timestamps
 * span the whole group; the error text summarizes the jobs' errors or uses a
 * generic message.
 *
 * @param {Array<object>} jobs - Jobs of one group.
 * @param {string} subjectId - Subject the sample belongs to.
 * @param {number} attemptIndex - Not used by this function.
 * @returns {object|null} The error sample, or `null` when the group is empty
 *   or its first job lacks a sample index or case id.
 */
function errorEvaluationSampleFromJobGroup(jobs, subjectId, attemptIndex) {
    const representative = jobs[0];
    if (!representative) {
        return null;
    }
    const sampleIndex = readOptionalJobNumber(representative.input, "sampleIndex");
    const caseId = readJobString(representative.input, "caseId");
    if (sampleIndex === null || !caseId) {
        return null;
    }
    const startedAt = minIsoTimestamp(jobs.map((entry) => entry.startedAt ?? entry.createdAt));
    const finishedAt = maxIsoTimestamp(jobs.map((entry) => entry.finishedAt ?? entry.updatedAt ?? entry.startedAt));
    const error = summarizeEvaluationJobErrors(jobs) ?? "Evaluation job did not produce a valid sample.";

    const sample = {
        id: `${caseId}__sample_${String(sampleIndex + 1).padStart(3, "0")}`,
        index: sampleIndex,
        subject: {
            id: subjectId,
            kind: "subject",
            label: subjectId,
        },
        status: "error",
    };
    if (startedAt) {
        sample.startedAt = startedAt;
    }
    if (finishedAt) {
        sample.finishedAt = finishedAt;
    }
    if (startedAt && finishedAt) {
        sample.durationMs = Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt));
    }
    if (error) {
        sample.error = error;
    }
    const caseResult = { id: caseId, status: "error", metrics: {} };
    if (error) {
        caseResult.feedback = { summary: error };
    }
    sample.cases = [caseResult];
    // Trace paths are de-duplicated across the group's jobs and sorted.
    sample.feedback = {
        traces: [...new Set(jobs.flatMap(jobTracePaths))].sort(),
    };
    return sample;
}
|
|
2454
|
+
/**
 * Group key of a normalized sample output, based on its first case's id and
 * the sample index.
 *
 * @param {{sample: {index: number, cases?: Array<{id: string}>}}} output
 * @returns {string|null} The key, or `null` when the sample has no first case id.
 */
function evaluationSampleGroupKeyFromOutput(output) {
    const { sample } = output;
    const caseId = sample.cases?.[0]?.id;
    return caseId ? evaluationSampleGroupKey(caseId, sample.index) : null;
}
|
|
2461
|
+
/**
 * Group key of a job, based on the `caseId` and `sampleIndex` in its input.
 *
 * @param {{input: unknown}} job
 * @returns {string|null} The key, or `null` when either input field is missing.
 */
function evaluationSampleGroupKeyFromJob(job) {
    const sampleIndex = readOptionalJobNumber(job.input, "sampleIndex");
    const caseId = readJobString(job.input, "caseId");
    const hasBoth = sampleIndex !== null && Boolean(caseId);
    return hasBoth ? evaluationSampleGroupKey(caseId, sampleIndex) : null;
}
|
|
2469
|
+
/**
 * Composes the key that identifies one (case, sample index) pair: the case
 * id and the index joined by a NUL character.
 *
 * @param {string} caseId
 * @param {number} sampleIndex
 * @returns {string}
 */
function evaluationSampleGroupKey(caseId, sampleIndex) {
    const separator = "\0";
    return `${caseId}${separator}${sampleIndex}`;
}
|
|
2472
|
+
/**
 * Summarizes the errors of a set of jobs as a single message.
 *
 * @param {Array<{id: string, error?: string}>} jobs
 * @returns {string|null} `null` when no job has an error; `"<id>: <error>"`
 *   for a single failure; otherwise a counted, `; `-separated list.
 */
function summarizeEvaluationJobErrors(jobs) {
    const failures = [];
    for (const job of jobs) {
        if (job.error) {
            failures.push(`${job.id}: ${job.error}`);
        }
    }
    switch (failures.length) {
        case 0:
            return null;
        case 1:
            return failures[0];
        default:
            return `${failures.length} evaluation job errors: ${failures.join("; ")}`;
    }
}
|
|
2483
|
+
/**
 * Trace paths reported by a job's output.
 *
 * @param {{output: unknown}} job
 * @returns {string[]} The string entries of `output.traces` when it is an
 *   array; otherwise the trace paths derived from the output's valid snapshot files.
 */
function jobTracePaths(job) {
    const raw = job.output;
    const output = raw && typeof raw === "object" && !Array.isArray(raw) ? raw : {};
    const files = Array.isArray(output.files)
        ? output.files.filter(isSurfaceSnapshotFile)
        : [];
    if (Array.isArray(output.traces)) {
        return output.traces.filter((entry) => typeof entry === "string");
    }
    return traceFilePaths(files);
}
|
|
2494
|
+
/**
 * Comparator for sample outputs: by sample index, then by sample id
 * (`localeCompare`).
 *
 * @param {{sample: {index: number, id: string}}} left
 * @param {{sample: {index: number, id: string}}} right
 * @returns {number} Negative, zero or positive, as expected by `Array#sort`.
 */
function compareSampleOutputs(left, right) {
    const byIndex = left.sample.index - right.sample.index;
    return byIndex !== 0
        ? byIndex
        : left.sample.id.localeCompare(right.sample.id);
}
|
|
2501
|
+
/**
 * Aggregates raw samples into an evaluation record for one subject.
 *
 * Samples are first merged per index; the record then reports counts, the
 * overall time span, metric/duration/usage statistics and per-case stats.
 * Status is `"completed"` when every sample completed, `"error"` when every
 * sample errored, and `"partial"` otherwise (including when there are no samples).
 *
 * @param {string} subjectId - Subject being evaluated.
 * @param {Array<object>} rawSamples - Unmerged sample records.
 * @returns {object} The evaluation record.
 */
function createEvaluationRecord(subjectId, rawSamples) {
    const samples = mergeEvaluationSampleRecords(rawSamples);
    const startedAt = minTimestamp(samples.flatMap((sample) => (sample.startedAt ? [sample.startedAt] : [])));
    const finishedAt = maxTimestamp(samples.flatMap((sample) => (sample.finishedAt ? [sample.finishedAt] : [])));
    const durationValues = samples
        .filter((sample) => typeof sample.durationMs === "number")
        .map((sample) => sample.durationMs);
    const metrics = aggregateSampleMetrics(samples);
    const usage = usageStats(samples.flatMap((sample) => sample.usage ? [sample.usage] : []));
    const cases = createEvaluationCaseStats(samples);
    const countWithStatus = (status) => samples.filter((sample) => sample.status === status).length;
    const completedSampleCount = countWithStatus("completed");
    const errorSampleCount = countWithStatus("error");

    let status = "partial";
    if (samples.length > 0) {
        if (completedSampleCount === samples.length) {
            status = "completed";
        }
        else if (errorSampleCount === samples.length) {
            status = "error";
        }
    }

    const record = {
        subject: {
            id: subjectId,
            kind: "subject",
        },
        status,
        sampleCount: samples.length,
        completedSampleCount,
        errorSampleCount,
    };
    if (startedAt) {
        record.startedAt = startedAt;
    }
    if (finishedAt) {
        record.finishedAt = finishedAt;
    }
    if (metrics) {
        record.metrics = metrics;
    }
    if (durationValues.length > 0) {
        record.durationMs = metricStats(durationValues);
    }
    if (usage) {
        record.usage = usage;
    }
    if (cases) {
        record.cases = cases;
    }
    record.samples = samples;
    return record;
}
|
|
2536
|
+
/**
 * Computes statistics for every metric name that appears on any sample.
 *
 * @param {Array<{metrics?: Record<string, unknown>}>} samples
 * @returns {Record<string, unknown>|undefined} Metric name (sorted) → the
 *   `metricStats` of its finite values across samples, or `undefined` when no
 *   sample has any metric.
 */
function aggregateSampleMetrics(samples) {
    const metricNames = new Set(samples.flatMap((sample) => Object.keys(sample.metrics ?? {})));
    if (metricNames.size === 0) {
        return undefined;
    }
    const finiteValuesOf = (metric) => samples.flatMap((sample) => {
        const value = sample.metrics?.[metric];
        const usable = typeof value === "number" && Number.isFinite(value);
        return usable ? [value] : [];
    });
    const orderedNames = [...metricNames].sort();
    const metrics = Object.fromEntries(orderedNames.map((metric) => [metric, metricStats(finiteValuesOf(metric))]));
    return Object.keys(metrics).length > 0 ? metrics : undefined;
}
|
|
2552
|
+
/**
 * Groups samples by their `index`, merges each group into one sample and
 * returns the merged samples ordered by index, then id.
 *
 * @param {Array<object>} samples - Unmerged sample records.
 * @returns {Array<object>} One merged sample per distinct index.
 */
function mergeEvaluationSampleRecords(samples) {
    const samplesByIndex = new Map();
    for (const sample of samples) {
        const key = String(sample.index);
        const bucket = samplesByIndex.get(key);
        if (bucket) {
            bucket.push(sample);
        }
        else {
            samplesByIndex.set(key, [sample]);
        }
    }
    const merged = [...samplesByIndex.values()].map(mergeEvaluationSampleGroup);
    return merged.sort((left, right) => left.index - right.index ||
        left.id.localeCompare(right.id));
}
|
|
2563
|
+
/**
 * Merges the samples that share one index into a single sample.
 *
 * A single-sample group is returned after case-duration normalization only.
 * Otherwise the merged sample spans the group's timestamps, averages its
 * metrics, merges usage, concatenates cases and joins error messages with `; `.
 *
 * @param {Array<object>} group - Samples with the same `index`; expected to be non-empty.
 * @returns {object} The merged sample.
 */
function mergeEvaluationSampleGroup(group) {
    const first = group[0];
    if (group.length === 1) {
        return normalizeSingleCaseDurations(first);
    }
    const startedAt = minTimestamp(group.flatMap((sample) => (sample.startedAt ? [sample.startedAt] : [])));
    const finishedAt = maxTimestamp(group.flatMap((sample) => (sample.finishedAt ? [sample.finishedAt] : [])));
    let durationMs;
    if (startedAt && finishedAt) {
        durationMs = Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt));
    }
    const cases = group.flatMap((sample) => normalizeCaseDurations(sample));
    const metrics = aggregateSampleGroupMetrics(group);
    const usage = mergeUsageSummaries(group.map((sample) => sample.usage));
    const errors = group
        .filter((sample) => sample.error)
        .map((sample) => sample.error);

    const merged = {
        // The merged id no longer carries a case id, only the one-based sample number.
        id: `sample_${String(first.index + 1).padStart(3, "0")}`,
        index: first.index,
        subject: first.subject,
        status: mergeEvaluationSampleStatus(group),
    };
    if (startedAt) {
        merged.startedAt = startedAt;
    }
    if (finishedAt) {
        merged.finishedAt = finishedAt;
    }
    if (durationMs !== undefined) {
        merged.durationMs = durationMs;
    }
    if (metrics) {
        merged.metrics = metrics;
    }
    if (usage) {
        merged.usage = usage;
    }
    if (errors.length > 0) {
        merged.error = errors.join("; ");
    }
    if (cases.length > 0) {
        merged.cases = cases;
    }
    return merged;
}
|
|
2591
|
+
/**
 * Returns the sample with its case durations normalized; a sample without
 * `cases` is returned as is.
 *
 * @param {object} sample
 * @returns {object} The same sample, or a copy with normalized `cases`.
 */
function normalizeSingleCaseDurations(sample) {
    const original = sample.cases;
    if (!original) {
        return sample;
    }
    const cases = normalizeCaseDurations(sample);
    if (cases.length !== original.length) {
        return sample;
    }
    return { ...sample, cases };
}
|
|
2600
|
+
/**
 * Returns the sample's cases, letting a lone case inherit the sample's
 * duration when it has none of its own.
 *
 * @param {{cases?: Array<object>, durationMs?: number}} sample
 * @returns {Array<object>} A new array; cases are reused unless a duration is filled in.
 */
function normalizeCaseDurations(sample) {
    const cases = sample.cases ?? [];
    return cases.map((caseResult) => {
        const hasOwnDuration = typeof caseResult.durationMs === "number";
        // Only a single-case sample can attribute its whole duration to that case.
        const canInherit = sample.cases?.length === 1 && typeof sample.durationMs === "number";
        if (hasOwnDuration || !canInherit) {
            return caseResult;
        }
        return { ...caseResult, durationMs: sample.durationMs };
    });
}
|
|
2607
|
+
/**
 * Averages each metric over the samples of one group.
 *
 * @param {Array<{metrics?: Record<string, unknown>}>} group
 * @returns {Record<string, number>|undefined} Metric name (sorted) → mean of
 *   its finite values; metrics without any finite value are omitted.
 *   `undefined` when nothing remains.
 */
function aggregateSampleGroupMetrics(group) {
    const metricNames = new Set(group.flatMap((sample) => Object.keys(sample.metrics ?? {})));
    if (metricNames.size === 0) {
        return undefined;
    }
    const pairs = [];
    for (const metric of [...metricNames].sort()) {
        const mean = meanFinite(group.map((sample) => sample.metrics?.[metric]));
        if (mean !== undefined) {
            pairs.push([metric, mean]);
        }
    }
    const metrics = Object.fromEntries(pairs);
    return Object.keys(metrics).length > 0 ? metrics : undefined;
}
|
|
2618
|
+
/**
 * Collapses the statuses of a group of samples into a single status.
 *
 * Precedence: any "error" wins, then any "running", then "completed" when the
 * group is non-empty and every sample is completed; otherwise "planned".
 *
 * @param {object[]} group - Samples carrying a `status` field.
 * @returns {"error"|"running"|"completed"|"planned"} The merged status.
 */
function mergeEvaluationSampleStatus(group) {
    const hasStatus = (wanted) => group.some((sample) => sample.status === wanted);
    if (hasStatus("error")) {
        return "error";
    }
    if (hasStatus("running")) {
        return "running";
    }
    const anyUnfinished = group.some((sample) => sample.status !== "completed");
    return group.length > 0 && !anyUnfinished ? "completed" : "planned";
}
|
|
2630
|
+
/**
 * Picks the smallest entry of `values` using the `<` operator.
 *
 * @param {Array} values - Mutually comparable values.
 * @returns {*} The smallest value, or `null` for an empty array.
 */
function minTimestamp(values) {
    if (values.length === 0) {
        return null;
    }
    let earliest = values[0];
    for (let i = 1; i < values.length; i += 1) {
        const candidate = values[i];
        if (candidate < earliest) {
            earliest = candidate;
        }
    }
    return earliest;
}
|
|
2635
|
+
/**
 * Picks the largest entry of `values` using the `>` operator.
 *
 * @param {Array} values - Mutually comparable values.
 * @returns {*} The largest value, or `null` for an empty array.
 */
function maxTimestamp(values) {
    if (values.length === 0) {
        return null;
    }
    let latest = values[0];
    for (let i = 1; i < values.length; i += 1) {
        const candidate = values[i];
        if (candidate > latest) {
            latest = candidate;
        }
    }
    return latest;
}
|
|
2640
|
+
/**
 * Builds per-case statistics across all samples of an evaluation.
 *
 * Case results are grouped by `id`; groups are emitted sorted by id
 * (`localeCompare`). `label` and `split` are taken from the first result of
 * each group. Each metric seen on any result of the group gets a
 * `metricStats` summary over its finite numeric values, and `durationMs` is
 * summarised when at least one result has a numeric duration.
 *
 * @param {object[]} samples - Samples, each with an optional `cases` array.
 * @returns {object[]|undefined} One stats record per case id, or `undefined`
 *   when no sample contains any case.
 */
function createEvaluationCaseStats(samples) {
    const byCase = new Map();
    for (const sample of samples) {
        for (const caseResult of sample.cases ?? []) {
            // Append in place; copying the bucket on every insert was quadratic.
            const bucket = byCase.get(caseResult.id);
            if (bucket) {
                bucket.push(caseResult);
            }
            else {
                byCase.set(caseResult.id, [caseResult]);
            }
        }
    }
    if (byCase.size === 0) {
        return undefined;
    }
    return [...byCase.entries()]
        .sort((left, right) => left[0].localeCompare(right[0]))
        .map(([id, results]) => {
            const first = results[0];
            // A case result without `metrics` contributes no metric names
            // instead of throwing from Object.keys(undefined).
            const metricNames = new Set(results.flatMap((result) => Object.keys(result.metrics ?? {})));
            const durationValues = results.flatMap((result) => typeof result.durationMs === "number" ? [result.durationMs] : []);
            const status = aggregateCaseStatus(results);
            return {
                id,
                ...(first.label ? { label: first.label } : {}),
                ...(first.split ? { split: first.split } : {}),
                ...(status ? { status } : {}),
                sampleCount: results.length,
                metrics: Object.fromEntries([...metricNames].sort().map((metric) => [
                    metric,
                    metricStats(results.flatMap((result) => {
                        const value = result.metrics?.[metric];
                        return typeof value === "number" && Number.isFinite(value)
                            ? [value]
                            : [];
                    })),
                ])),
                ...(durationValues.length > 0
                    ? { durationMs: metricStats(durationValues) }
                    : {}),
            };
        });
}
|
|
2679
|
+
/**
 * Derives one status for a set of results belonging to the same case.
 *
 * @param {object[]} results - Case results with an optional `status`.
 * @returns {"error"|"completed"|undefined} "error" when any result errored,
 *   "completed" for any other non-empty list, `undefined` for an empty list.
 */
function aggregateCaseStatus(results) {
    const anyFailed = results.some((result) => result.status === "error");
    if (anyFailed) {
        return "error";
    }
    return results.length > 0 ? "completed" : undefined;
}
|
|
2688
|
+
/**
 * Extracts the mean of every metric of an evaluation, rounded to three
 * decimal places.
 *
 * @param {object} evaluation - Evaluation with an optional `metrics` map whose
 *   values expose a `mean` field.
 * @returns {object|undefined} Metric name -> rounded mean for metrics with a
 *   finite mean, or `undefined` when there is none.
 */
function evaluationMeanMetrics(evaluation) {
    const rounded = [];
    for (const [name, stats] of Object.entries(evaluation.metrics ?? {})) {
        if (Number.isFinite(stats.mean)) {
            rounded.push([name, Number(stats.mean.toFixed(3))]);
        }
    }
    if (rounded.length === 0) {
        return undefined;
    }
    return Object.fromEntries(rounded);
}
|
|
2694
|
+
/**
 * Chooses the subject to keep among the previous selection and new candidates.
 *
 * Starting from `args.previousSubject`, each candidate in `args.subjects`
 * replaces the current choice when there is no choice yet or when
 * `hasHigherScore(candidate, current)` is true.
 *
 * @param {{previousSubject?: object, subjects: Iterable<object>}} args
 * @returns {object|undefined} The selected subject (possibly the previous one).
 */
function selectSubject(args) {
    const { previousSubject, subjects } = args;
    return [...subjects].reduce(
        (best, candidate) => (!best || hasHigherScore(candidate, best) ? candidate : best),
        previousSubject,
    );
}
|
|
2703
|
+
/**
 * Tells whether `subject` should displace `incumbent` based on the "score"
 * metric read through `readMetric`.
 *
 * @param {object} subject - Challenger.
 * @param {object} incumbent - Currently selected subject.
 * @returns {boolean} `false` when the challenger has no score; `true` when
 *   only the incumbent lacks one; otherwise whether the challenger's score is
 *   strictly greater.
 */
function hasHigherScore(subject, incumbent) {
    const challengerScore = readMetric(subject, "score");
    const incumbentScore = readMetric(incumbent, "score");
    if (challengerScore == null) {
        return false;
    }
    return incumbentScore == null || challengerScore > incumbentScore;
}
|
|
2714
|
+
/**
 * Reads a finite numeric metric from a subject.
 *
 * @param {object} subject - Object with an optional `metrics` map.
 * @param {string} metric - Metric name to look up.
 * @returns {number|null} The metric value, or `null` when it is missing or
 *   not a finite number.
 */
function readMetric(subject, metric) {
    const candidate = subject.metrics?.[metric];
    // Number.isFinite never coerces, so non-numbers are rejected here too.
    if (Number.isFinite(candidate)) {
        return candidate;
    }
    return null;
}
|
|
2718
|
+
/**
 * Computes summary statistics for a list of numbers.
 *
 * Variance is the population variance (divided by `count`). An empty list
 * yields an all-zero record.
 *
 * @param {number[]} values - Numbers to summarise.
 * @returns {{count: number, mean: number, variance: number, stddev: number,
 *   min: number, max: number}} The statistics record.
 */
function metricStats(values) {
    const count = values.length;
    if (count === 0) {
        return {
            count: 0,
            mean: 0,
            variance: 0,
            stddev: 0,
            min: 0,
            max: 0,
        };
    }
    // Fold min/max pairwise instead of spreading the array into
    // Math.min/Math.max, which throws RangeError once the array exceeds the
    // engine's argument limit. Pairwise Math.min/Math.max keeps NaN propagation.
    let sum = 0;
    let min = Infinity;
    let max = -Infinity;
    for (const value of values) {
        sum += value;
        min = Math.min(min, value);
        max = Math.max(max, value);
    }
    const mean = sum / count;
    const variance = values.reduce((acc, value) => acc + (value - mean) ** 2, 0) / count;
    return {
        count,
        mean,
        variance,
        stddev: Math.sqrt(variance),
        min,
        max,
    };
}
|
|
2741
|
+
/**
 * Normalizes a file path to a forward-slash relative path and validates it.
 *
 * Backslashes become forward slashes and leading slashes are stripped before
 * validation.
 *
 * @param {string} filePath - Path to normalize.
 * @returns {string} The normalized relative path.
 * @throws {Error} When the result is empty or contains a NUL character, or
 *   when any segment is empty, "." or "..".
 */
function normalizeRelativePath(filePath) {
    const forwardSlashed = filePath.replace(/\\/gu, "/");
    const relative = forwardSlashed.replace(/^\/+/u, "");
    if (relative === "" || relative.includes("\0")) {
        throw new Error("File paths must be non-empty relative paths.");
    }
    for (const segment of relative.split("/")) {
        if (segment === "" || segment === "." || segment === "..") {
            throw new Error(`Unsafe relative file path: ${filePath}`);
        }
    }
    return relative;
}
|
|
2752
|
+
/**
 * Guesses a MIME type from a file path's extension (case-insensitive).
 *
 * `.gif` and `.xls` are mapped explicitly so that files which
 * `resolvePreviewKind` treats as image / spreadsheet are no longer reported
 * as "text/plain".
 *
 * @param {string} filePath - File path or name.
 * @returns {string} The MIME type, or "text/plain" for unknown extensions.
 */
function detectMimeType(filePath) {
    const lower = filePath.toLowerCase();
    const mimeBySuffix = [
        [".md", "text/markdown"],
        [".markdown", "text/markdown"],
        [".json", "application/json"],
        [".csv", "text/csv"],
        [".pdf", "application/pdf"],
        [".png", "image/png"],
        [".jpg", "image/jpeg"],
        [".jpeg", "image/jpeg"],
        [".gif", "image/gif"],
        [".xlsx", "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet"],
        [".xls", "application/vnd.ms-excel"],
    ];
    const match = mimeBySuffix.find(([suffix]) => lower.endsWith(suffix));
    return match ? match[1] : "text/plain";
}
|
|
2777
|
+
/**
 * Chooses the preview renderer kind for a file from its extension
 * (case-insensitive).
 *
 * @param {string} filePath - File path or name.
 * @returns {"markdown"|"table"|"spreadsheet"|"image"|"pdf"|"text"} The preview
 *   kind; "text" for any unrecognised extension.
 */
function resolvePreviewKind(filePath) {
    const lower = filePath.toLowerCase();
    const kindsBySuffix = [
        ["markdown", [".md", ".markdown"]],
        ["table", [".csv"]],
        ["spreadsheet", [".xlsx", ".xls"]],
        ["image", [".png", ".jpg", ".jpeg", ".gif"]],
        ["pdf", [".pdf"]],
    ];
    for (const [kind, suffixes] of kindsBySuffix) {
        if (suffixes.some((suffix) => lower.endsWith(suffix))) {
            return kind;
        }
    }
    return "text";
}
|
|
2799
|
+
/**
 * Reads a non-empty string property from a plain (non-array) object.
 *
 * @param {*} value - Candidate record.
 * @param {string} key - Property to read.
 * @returns {string|null} The string, or `null` when `value` is not a
 *   non-array object or the property is not a non-empty string.
 */
function readJobString(value, key) {
    const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isRecord) {
        return null;
    }
    const candidate = value[key];
    if (typeof candidate === "string" && candidate !== "") {
        return candidate;
    }
    return null;
}
|
|
2806
|
+
/**
 * Like `readJobString`, but a missing value is an error.
 *
 * @param {*} value - Candidate record.
 * @param {string} key - Property to read.
 * @param {string} label - Description of the record, used in the error message.
 * @returns {string} The string found under `key`.
 * @throws {Error} When `readJobString` yields no value.
 */
function readRequiredJobString(value, key, label) {
    const text = readJobString(value, key);
    if (text) {
        return text;
    }
    throw new Error(`${label} is missing ${key}.`);
}
|
|
2813
|
+
/**
 * Reads a finite numeric property from a plain (non-array) object.
 *
 * @param {*} value - Candidate record.
 * @param {string} key - Property to read.
 * @returns {number|null} The number, or `null` when `value` is not a
 *   non-array object or the property is not a finite number.
 */
function readOptionalJobNumber(value, key) {
    const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (!isRecord) {
        return null;
    }
    const candidate = value[key];
    // Number.isFinite never coerces, so only real finite numbers pass.
    return Number.isFinite(candidate) ? candidate : null;
}
|
|
2820
|
+
/**
 * Like `readOptionalJobNumber`, but a missing value is an error.
 *
 * @param {*} value - Candidate record.
 * @param {string} key - Property to read.
 * @param {string} label - Description of the record, used in the error message.
 * @returns {number} The number found under `key` (zero is a valid result).
 * @throws {Error} When `readOptionalJobNumber` yields `null`.
 */
function readRequiredJobNumber(value, key, label) {
    const parsed = readOptionalJobNumber(value, key);
    if (parsed !== null) {
        return parsed;
    }
    throw new Error(`${label} is missing ${key}.`);
}
|
|
2827
|
+
/**
 * Structural check for an evaluation sample record.
 *
 * Requires a non-array object with a string `id`, a numeric `index`, a
 * `subject` whose `typeof` is "object", a status accepted by
 * `isEvaluationSampleStatus`, and `cases` accepted by
 * `hasOperationalCaseStatuses`.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean} Whether `value` passes every check.
 */
function isEvaluationSampleRecord(value) {
    if (!value || typeof value !== "object" || Array.isArray(value)) {
        return false;
    }
    if (typeof value.id !== "string" || typeof value.index !== "number") {
        return false;
    }
    // NOTE: typeof null is "object", so a null subject passes this check.
    if (typeof value.subject !== "object") {
        return false;
    }
    return Boolean(isEvaluationSampleStatus(value.status) &&
        hasOperationalCaseStatuses(value.cases));
}
|
|
2838
|
+
/**
 * Checks whether a value is one of the sample status literals.
 *
 * @param {*} value - Candidate value.
 * @returns {boolean} `true` for "planned", "running", "completed" or "error".
 */
function isEvaluationSampleStatus(value) {
    switch (value) {
        case "planned":
        case "running":
        case "completed":
        case "error":
            return true;
        default:
            return false;
    }
}
|
|
2844
|
+
/**
 * Validates the `cases` field of a sample record.
 *
 * @param {*} value - The `cases` value to check.
 * @returns {boolean} `true` when `value` is `undefined`, or an array whose
 *   entries are all non-array objects with a `status` that is absent,
 *   "completed" or "error"; `false` otherwise.
 */
function hasOperationalCaseStatuses(value) {
    if (value === undefined) {
        return true;
    }
    if (!Array.isArray(value)) {
        return false;
    }
    const allowedStatuses = [undefined, "completed", "error"];
    const isAcceptable = (entry) => entry !== null &&
        typeof entry === "object" &&
        !Array.isArray(entry) &&
        allowedStatuses.includes(entry.status);
    return !value.some((entry) => !isAcceptable(entry));
}
|