@workbench-ai/workbench-core 0.0.49 → 0.0.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/candidate-patch.d.ts +8 -0
- package/dist/candidate-patch.d.ts.map +1 -0
- package/dist/{subject-patch.js → candidate-patch.js} +5 -5
- package/dist/execution-evidence.d.ts +5 -5
- package/dist/execution-evidence.d.ts.map +1 -1
- package/dist/execution-evidence.js +8 -8
- package/dist/execution-graph.d.ts +2 -2
- package/dist/execution-graph.d.ts.map +1 -1
- package/dist/execution-graph.js +13 -13
- package/dist/execution-jobs.d.ts +7 -6
- package/dist/execution-jobs.d.ts.map +1 -1
- package/dist/execution-jobs.js +32 -17
- package/dist/execution-outputs.d.ts +2 -2
- package/dist/execution-outputs.d.ts.map +1 -1
- package/dist/execution-outputs.js +25 -13
- package/dist/execution-runtime-types.d.ts +1 -1
- package/dist/execution-runtime-types.d.ts.map +1 -1
- package/dist/execution-traces.js +7 -7
- package/dist/execution-usage.js +9 -9
- package/dist/generic-spec.d.ts +46 -30
- package/dist/generic-spec.d.ts.map +1 -1
- package/dist/generic-spec.js +173 -80
- package/dist/index.d.ts +68 -39
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +805 -359
- package/dist/runtime-utils.d.ts +1 -1
- package/dist/runtime-utils.d.ts.map +1 -1
- package/dist/runtime-utils.js +3 -3
- package/dist/sandbox-backends/docker.js +5 -5
- package/dist/sandbox-inputs.js +3 -3
- package/dist/sandbox-plane.js +7 -7
- package/package.json +3 -3
- package/worker/sandbox-adapter-runner.cjs +2 -2
- package/dist/subject-patch.d.ts +0 -8
- package/dist/subject-patch.d.ts.map +0 -1
package/dist/index.js
CHANGED
|
@@ -4,19 +4,19 @@ import path from "node:path";
|
|
|
4
4
|
import { fileURLToPath } from "node:url";
|
|
5
5
|
import YAML from "yaml";
|
|
6
6
|
import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, parseWorkbenchAdapterManifest, readWorkbenchAdapterOperationResult, WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV, WORKBENCH_RUNTIME_CONTROL_URL_ENV, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
|
|
7
|
-
import { BENCHMARK_SPEC_FILE, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal,
|
|
7
|
+
import { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchCandidateManifestPath, } from "./generic-spec.js";
|
|
8
8
|
import { attachSandboxMetadataToJob, createWorkbenchSandboxFileStore, isSurfaceSnapshotFile, readWorkbenchExecutionSpec, } from "./sandbox-inputs.js";
|
|
9
9
|
import { asRuntimeRecord, importNodeModule, isJsonPayload, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
|
|
10
10
|
import { createWorkbenchExecutionCapability, createWorkbenchSandboxAllocation, collectExecutionCapabilityScopeIssues, collectSandboxAllocationScopeIssues, collectSandboxHandleScopeIssues, assertSandboxBackendSupportsNetworkPolicy, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
11
11
|
import { createSandboxBackendPlaneForProvider, } from "./sandbox-backends/index.js";
|
|
12
|
-
import {
|
|
12
|
+
import { applyWorkbenchCandidatePatch } from "./candidate-patch.js";
|
|
13
13
|
import { assignUsageRole, completeUsageSummary, mergeUsageSummaries, normalizeUsageSummary, usageStats, } from "./execution-usage.js";
|
|
14
14
|
import { traceFilePaths, workbenchTraceExecutionDirectory, } from "./trace-files.js";
|
|
15
15
|
import { engineCaseForCase, } from "./execution-jobs.js";
|
|
16
16
|
import { createWorkbenchExecutionEventPublisher, publishCommandStepEvent, } from "./execution-events.js";
|
|
17
17
|
import { readWorkbenchExecutionPurpose } from "./execution-evidence.js";
|
|
18
18
|
import { adapterAuthEnv, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
|
|
19
|
-
export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml,
|
|
19
|
+
export { BENCHMARK_SPEC_FILE, CANDIDATE_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchCandidateManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, runtimeNetwork, runtimeResources, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
|
|
20
20
|
export { composeRuntimeDockerfileWithAdapterInstallers, } from "./runtime-dockerfile.js";
|
|
21
21
|
export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
|
|
22
22
|
export { adapterAuthEnv, createWorkbenchAdapterAuthBundle, defaultWorkbenchAdapterAuthStoreRoot, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, parseWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
|
|
@@ -26,16 +26,127 @@ export { createWorkbenchProgressStdoutParser, publishWorkbenchProgressStdoutEnve
|
|
|
26
26
|
export { resolveSandboxTemplateImage, } from "./sandbox-backends/template-images.js";
|
|
27
27
|
export { readOutputTraceFiles, workbenchTraceExecutionDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
|
|
28
28
|
export { assertWorkbenchAdapterOperationSupport, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterOperationIssues, collectWorkbenchAdapterOperationRequirements, ensureWorkbenchAdapterOutputDir, WORKBENCH_ADAPTER_RESULT_FILE, normalizeWorkbenchAdapterOperationRequest, normalizeWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, writeWorkbenchAdapterOperationResult, } from "@workbench-ai/workbench-protocol";
|
|
29
|
-
export {
|
|
29
|
+
export { applyWorkbenchCandidatePatch, } from "./candidate-patch.js";
|
|
30
30
|
export { createWorkbenchSandboxFileStore, createSandboxAdapterRequest, executionResultFromCompletedSandboxJob, materializeWorkbenchSandboxInput, readWorkbenchExecutionSpec, sanitizeWorkbenchExecutionJobForSandbox, } from "./sandbox-inputs.js";
|
|
31
31
|
export { compileWorkbenchExecutionGraph, } from "./execution-graph.js";
|
|
32
|
-
export {
|
|
32
|
+
export { createBaselineCandidateExecution, createBaselineCandidateJob, createWorkbenchExecutionJob, expectedWorkbenchRunJobCount, engineCaseForCase, engineCaseIds, attemptJobCountForRunSpec, workbenchExecutionJobPurpose, MAX_WORKBENCH_RUN_BUDGET, planWorkbenchExecutionJobsForPurpose, validateWorkbenchRunEnvelope, workbenchExecutionJobId, } from "./execution-jobs.js";
|
|
33
33
|
export { addCapacity, capacityFits, runWorkbenchExecutionDag, subtractCapacity, workbenchJobDependencies, workbenchJobHostCost, workbenchJobResources, } from "./execution-scheduler.js";
|
|
34
34
|
export { assertWorkbenchExecutionIsolation, collectWorkbenchExecutionIsolationIssues, validateWorkbenchExecutionOutputPayloads, } from "./execution-outputs.js";
|
|
35
35
|
export { collectSandboxAllocationScopeIssues, collectExecutionCapabilityScopeIssues, collectSandboxHandleScopeIssues, createWorkbenchSandboxAllocation, createWorkbenchSandboxExecutionMetadata, createWorkbenchExecutionCapability, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
36
|
-
export {
|
|
36
|
+
export { buildCandidateCaseExecutionRefs, buildWorkbenchExecutionEvidence, isWorkbenchExecutionActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-evidence.js";
|
|
37
37
|
export { buildWorkbenchTraceSessionsFromFiles, combineWorkbenchTraceSessions, finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, readWorkbenchExecutionTraceFiles, traceSessionLabel, } from "./execution-traces.js";
|
|
38
38
|
export { DOCKER_SANDBOX_BACKEND, assertSandboxHostHealthForProvider, createDockerSandboxBackendDescriptor, createDockerSandboxPlane, resolveWorkbenchSandboxProviderName, sandboxProviderAdmissionForResources, sandboxProviderDefaultMaxConcurrentJobs, sandboxProviderLeaseScope, sandboxHostHealthExpectationForProvider, } from "./sandbox-backends/index.js";
|
|
39
|
+
/**
 * Returns a shallow copy of a runtime job without the fields that are left
 * out of an exchange: `leaseUntil`, `wakeupLeaseUntil`, `hostId`, `workerId`,
 * `claimTokenHash`, `trace` and `traceSessions`.
 *
 * The input job is not modified.
 *
 * @param {object} job Runtime job record.
 * @returns {object} New object holding every other own enumerable property.
 */
export function sanitizeWorkbenchRuntimeJobForExchange(job) {
    const {
        leaseUntil: _droppedLeaseUntil,
        wakeupLeaseUntil: _droppedWakeupLeaseUntil,
        hostId: _droppedHostId,
        workerId: _droppedWorkerId,
        claimTokenHash: _droppedClaimTokenHash,
        trace: _droppedTrace,
        traceSessions: _droppedTraceSessions,
        ...exchangeable
    } = job;
    // The rest object is already a fresh copy, so it can be returned as-is.
    return exchangeable;
}
|
|
43
|
+
/**
 * Returns a shallow copy of a runtime candidate without the fields that are
 * left out of an exchange: `ownerUserId`, `ownerUsername`, `visibility`,
 * `metrics`, `candidateRunId` and `candidateRunName`.
 *
 * The input candidate is not modified.
 *
 * @param {object} candidate Runtime candidate record.
 * @returns {object} New object holding every other own enumerable property.
 */
export function sanitizeWorkbenchRuntimeCandidateForExchange(candidate) {
    const {
        ownerUserId: _droppedOwnerUserId,
        ownerUsername: _droppedOwnerUsername,
        visibility: _droppedVisibility,
        metrics: _droppedMetrics,
        candidateRunId: _droppedCandidateRunId,
        candidateRunName: _droppedCandidateRunName,
        ...exchangeable
    } = candidate;
    // The rest object is already a fresh copy, so it can be returned as-is.
    return exchangeable;
}
|
|
47
|
+
/**
 * Computes a SHA-256 hex digest over a canonical view of a project's source
 * state.
 *
 * The canonical view holds the line-ending-normalized source YAML and
 * Dockerfiles, the path-sorted candidate / engine-resolve / adapter / runtime
 * file lists, the engine-resolve binding (engine plus resolver `use` and
 * `withFingerprint`), the resources with defaults filled in, and the network
 * value. Object keys are sorted recursively before serialization.
 *
 * @param {object} input Project source state to fingerprint.
 * @returns {string} Hex-encoded SHA-256 digest.
 */
export function workbenchProjectSourceFingerprint(input) {
    const sourceYaml = normalizeTextForProjectStateFingerprint(input.source);
    const candidateFiles = canonicalFilesForProjectStateFingerprint(input.candidateFiles);
    const engineResolveFiles = canonicalFilesForProjectStateFingerprint(input.engineResolveFiles);
    const binding = input.engineResolveBinding;
    const engineResolveBinding = {
        engine: binding.engine,
        resolver: {
            use: binding.resolver.use,
            withFingerprint: binding.resolver.withFingerprint,
        },
    };
    const adapterFiles = canonicalFilesForProjectStateFingerprint(input.adapterFiles);
    const runtimeFiles = canonicalFilesForProjectStateFingerprint(input.runtimeFiles);
    const dockerfile = normalizeTextForProjectStateFingerprint(input.dockerfile);
    const runtimeDockerfile = normalizeTextForProjectStateFingerprint(input.runtimeDockerfile);
    const resources = normalizeProjectStateResources(input.resources);
    const serialized = JSON.stringify(canonicalizeProjectState({
        sourceYaml,
        candidateFiles,
        engineResolveFiles,
        engineResolveBinding,
        adapterFiles,
        runtimeFiles,
        dockerfile,
        runtimeDockerfile,
        resources,
        network: input.network,
    }));
    return createHash("sha256").update(serialized).digest("hex");
}
|
|
68
|
+
/**
 * Computes a SHA-256 hex digest over a canonical view of a runtime bundle.
 *
 * Each collection is put in a stable order before hashing: candidates
 * (sanitized for exchange) by `id`, candidate file groups by `candidateId`,
 * evaluations and runs by `id`, jobs (reduced for fingerprinting) by `id`,
 * execution file groups by `jobId`, and events by the
 * `runId#jobId#at#id` key with `"_"` standing in for a missing run or job id.
 * Object keys are sorted recursively before serialization.
 *
 * @param {object} bundle Runtime bundle to fingerprint.
 * @returns {string} Hex-encoded SHA-256 digest.
 */
export function workbenchRuntimeBundleFingerprint(bundle) {
    // Canonicalizes a list of `{ <idField>, files }` groups and orders it by the id.
    const canonicalFileGroups = (groups, idField) => sortByStableKey(groups.map((group) => ({
        [idField]: group[idField],
        files: canonicalFilesForProjectStateFingerprint(group.files),
    })), (group) => group[idField]);
    const eventSortKey = (event) => [event.runId ?? "_", event.jobId ?? "_", event.at, event.id].join("#");
    const canonicalBundle = {
        schema: bundle.schema,
        activeId: bundle.activeId,
        candidates: sortByStableKey(bundle.candidates.map(sanitizeWorkbenchRuntimeCandidateForExchange), (candidate) => candidate.id),
        candidateFiles: canonicalFileGroups(bundle.candidateFiles, "candidateId"),
        evaluations: sortByStableKey(bundle.evaluations, (evaluation) => evaluation.id),
        runs: sortByStableKey(bundle.runs, (run) => run.id),
        jobs: sortByStableKey(bundle.jobs.map(runtimeJobForProjectStateFingerprint), (job) => job.id),
        executionFiles: canonicalFileGroups(bundle.executionFiles, "jobId"),
        events: sortByStableKey(bundle.events, eventSortKey),
    };
    const serialized = JSON.stringify(canonicalizeProjectState(canonicalBundle));
    return createHash("sha256").update(serialized).digest("hex");
}
|
|
88
|
+
/**
 * Reports whether two file lists are equal once canonicalized.
 *
 * Both lists are reduced to `{ path, encoding, executable, content }` entries
 * sorted by path, serialized to JSON, and compared as strings, so the input
 * order of the files does not matter.
 *
 * @param {Array<object>} left First file list.
 * @param {Array<object>} right Second file list.
 * @returns {boolean} True when both canonical serializations match.
 */
export function workbenchSurfaceFilesEqualForExchange(left, right) {
    const serializedLeft = JSON.stringify(canonicalFilesForProjectStateFingerprint(left));
    const serializedRight = JSON.stringify(canonicalFilesForProjectStateFingerprint(right));
    return serializedLeft === serializedRight;
}
|
|
92
|
+
/**
 * Summarizes the size of a runtime bundle.
 *
 * `candidateFiles` and `executionFiles` count the individual files across all
 * groups; every other count is the length of the matching bundle array.
 *
 * @param {object} bundle Runtime bundle.
 * @returns {object} Counts per collection plus the bundle's `activeId`.
 */
export function workbenchRuntimeBundleStats(bundle) {
    // Adds up the number of files held by every group in the list.
    const countGroupedFiles = (groups) => {
        let total = 0;
        for (const group of groups) {
            total += group.files.length;
        }
        return total;
    };
    const stats = {
        candidates: bundle.candidates.length,
        candidateFiles: countGroupedFiles(bundle.candidateFiles),
        evaluations: bundle.evaluations.length,
        runs: bundle.runs.length,
        jobs: bundle.jobs.length,
        executionFiles: countGroupedFiles(bundle.executionFiles),
        events: bundle.events.length,
        activeId: bundle.activeId,
    };
    return stats;
}
|
|
104
|
+
/**
 * Reduces a runtime job to the form used when fingerprinting a bundle.
 *
 * The job is first sanitized for exchange. When its `output` is a non-array
 * object, the `files` and `fileSet` entries are dropped from that output;
 * any other kind of output is left as it is.
 * NOTE(review): presumably the file payloads are omitted because the bundle
 * fingerprint hashes execution files separately — confirm.
 *
 * @param {object} job Runtime job record.
 * @returns {object} Sanitized job, with a trimmed `output` where applicable.
 */
function runtimeJobForProjectStateFingerprint(job) {
    const sanitized = sanitizeWorkbenchRuntimeJobForExchange(job);
    const jobOutput = sanitized.output;
    const outputIsRecord = typeof jobOutput === "object" && jobOutput !== null && !Array.isArray(jobOutput);
    if (outputIsRecord) {
        const { files: _omittedFiles, fileSet: _omittedFileSet, ...trimmedOutput } = jobOutput;
        return { ...sanitized, output: trimmedOutput };
    }
    return sanitized;
}
|
|
116
|
+
/**
 * Projects a file list onto the fields that take part in fingerprinting and
 * orders the result by path.
 *
 * Each entry keeps only `path`, `encoding`, `content` and `executable`, with
 * `executable` coerced to a boolean so a missing flag equals `false`.
 *
 * @param {Array<object>} files File records.
 * @returns {Array<object>} New array of reduced records sorted by `path`.
 */
function canonicalFilesForProjectStateFingerprint(files) {
    const reduced = files.map(({ path, encoding, executable, content }) => ({
        path,
        encoding,
        executable: executable ? true : false,
        content,
    }));
    return sortByStableKey(reduced, (entry) => entry.path);
}
|
|
124
|
+
/**
 * Normalizes line endings so that CRLF and lone CR both become LF.
 *
 * @param {string} value Text to normalize.
 * @returns {string} Text using `\n` as the only line terminator.
 */
function normalizeTextForProjectStateFingerprint(value) {
    // CRLF pairs go first so the lone-CR pass cannot turn one into two newlines.
    const withoutCrLf = value.replace(/\r\n/gu, "\n");
    const withoutCr = withoutCrLf.replace(/\r/gu, "\n");
    return withoutCr;
}
|
|
127
|
+
/**
 * Fills in missing resource settings from `DEFAULT_EXECUTION_RESOURCES`.
 *
 * Only `null` / `undefined` values fall back to the default; other values,
 * including `0`, are kept. Fields outside the four listed are not carried over.
 *
 * @param {object} resources Resource settings, possibly partial.
 * @returns {{cpu: *, memoryGb: *, diskGb: *, timeoutMinutes: *}} Complete settings.
 */
function normalizeProjectStateResources(resources) {
    const resourceFields = ["cpu", "memoryGb", "diskGb", "timeoutMinutes"];
    return Object.fromEntries(resourceFields.map((field) => [
        field,
        resources[field] ?? DEFAULT_EXECUTION_RESOURCES[field],
    ]));
}
|
|
135
|
+
/**
 * Returns a sorted copy of `items`, ordered by the key `keyFor` yields.
 *
 * Keys are compared with the `<` / `>` operators, which for strings is plain
 * UTF-16 code-unit order. This replaces `String.prototype.localeCompare`,
 * whose result depends on the host's locale and ICU data: the sorted output
 * feeds fingerprint hashes, so the order must be the same on every machine.
 * It also matches the code-unit order `Object.keys(...).sort()` produces when
 * object keys are canonicalized.
 *
 * The input array is not modified. `Array.prototype.sort` is stable, so items
 * with equal keys keep their input order.
 *
 * @param {Array<*>} items Items to order.
 * @param {function(*): string} keyFor Returns the sort key for an item.
 * @returns {Array<*>} New array in ascending key order.
 */
function sortByStableKey(items, keyFor) {
    return [...items].sort((left, right) => {
        const leftKey = keyFor(left);
        const rightKey = keyFor(right);
        if (leftKey < rightKey) {
            return -1;
        }
        if (leftKey > rightKey) {
            return 1;
        }
        return 0;
    });
}
|
|
138
|
+
/**
 * Recursively rebuilds a value so that every object lists its keys in sorted
 * order, giving a `JSON.stringify` result that does not depend on the order
 * in which properties were added.
 *
 * Arrays keep their element order (each element is canonicalized); primitives
 * and `null` are returned unchanged.
 *
 * @param {*} value Value to canonicalize.
 * @returns {*} Canonical copy for arrays and objects, the value itself otherwise.
 */
function canonicalizeProjectState(value) {
    if (value === null || typeof value !== "object") {
        return value;
    }
    if (Array.isArray(value)) {
        return value.map((element) => canonicalizeProjectState(element));
    }
    const sortedEntries = [];
    for (const key of Object.keys(value).sort()) {
        sortedEntries.push([key, canonicalizeProjectState(value[key])]);
    }
    return Object.fromEntries(sortedEntries);
}
|
|
39
150
|
export const DEFAULT_ENVIRONMENT_VERSIONS = [
|
|
40
151
|
{
|
|
41
152
|
id: "envv_python_3_12",
|
|
@@ -153,7 +264,7 @@ export const DEFAULT_ENVIRONMENTS = [
|
|
|
153
264
|
{
|
|
154
265
|
id: "env_node",
|
|
155
266
|
name: "Node",
|
|
156
|
-
description: "Node runtime for JavaScript and TypeScript
|
|
267
|
+
description: "Node runtime for JavaScript and TypeScript candidates.",
|
|
157
268
|
currentVersionId: "envv_node_22",
|
|
158
269
|
builtIn: true,
|
|
159
270
|
createdAt: "2026-04-23T00:00:00.000Z",
|
|
@@ -191,8 +302,7 @@ function splitAuthoredSourceYaml(sourceYaml) {
|
|
|
191
302
|
}
|
|
192
303
|
const entries = [
|
|
193
304
|
[BENCHMARK_SPEC_FILE, parsed.benchmark],
|
|
194
|
-
["
|
|
195
|
-
["optimizers/current.yaml", splitOptimizerSourceRecord(parsed.optimizer)],
|
|
305
|
+
["candidates/current/candidate.yaml", splitCandidateSourceRecord(parsed.candidate)],
|
|
196
306
|
];
|
|
197
307
|
return entries.flatMap(([filePath, value]) => {
|
|
198
308
|
if (!value || typeof value !== "object" || Array.isArray(value)) {
|
|
@@ -204,23 +314,20 @@ function splitAuthoredSourceYaml(sourceYaml) {
|
|
|
204
314
|
}];
|
|
205
315
|
});
|
|
206
316
|
}
|
|
207
|
-
function
|
|
317
|
+
/**
 * Prepares the candidate section of an authored source YAML for writing to
 * its own manifest file.
 *
 * Works on a clone of the record: removes `benchmark` and `path`, strips the
 * runtime selection fields, and rewrites adapter sources relative to
 * `candidates/current`. When the value cannot be cloned as a record it is
 * returned unchanged.
 *
 * @param {*} value Candidate section of the parsed source YAML.
 * @returns {*} Cleaned clone, or the original value when it is not a record.
 */
function splitCandidateSourceRecord(value) {
    const candidateRecord = cloneYamlRecord(value);
    if (candidateRecord) {
        for (const field of ["benchmark", "path"]) {
            delete candidateRecord[field];
        }
        stripCandidateRuntimeSelection(candidateRecord);
        rewriteAdapterSources(candidateRecord, "candidates/current");
        return candidateRecord;
    }
    return value;
}
|
|
217
|
-
function
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
return value;
|
|
221
|
-
}
|
|
222
|
-
rewriteAdapterSources(record, "optimizers");
|
|
223
|
-
return record;
|
|
328
|
+
/**
 * Removes the `selectedRunId` and `selectedRunName` fields from a candidate
 * record, modifying the record in place.
 *
 * @param {object} record Candidate record to clean.
 * @returns {void}
 */
function stripCandidateRuntimeSelection(record) {
    for (const selectionField of ["selectedRunId", "selectedRunName"]) {
        delete record[selectionField];
    }
}
|
|
225
332
|
function cloneYamlRecord(value) {
|
|
226
333
|
return value && typeof value === "object" && !Array.isArray(value)
|
|
@@ -242,11 +349,10 @@ function sourcePathRelativeTo(yamlDir, sourcePath) {
|
|
|
242
349
|
}
|
|
243
350
|
/**
 * Reports whether a path names an authored source YAML file: either the
 * benchmark spec file or a candidate manifest.
 *
 * @param {string} filePath Path to classify.
 * @returns {boolean} True for the benchmark spec file; otherwise the result
 *   of the candidate-manifest check.
 */
function isAuthoredSourceYamlPath(filePath) {
    if (filePath === BENCHMARK_SPEC_FILE) {
        return true;
    }
    return isWorkbenchCandidateManifestPath(filePath);
}
|
|
248
|
-
function
|
|
249
|
-
return spec.improve ? `adapter:${spec.improve.use}` : "
|
|
354
|
+
/**
 * Builds the short description of a spec's improve step.
 *
 * @param {object} spec Resolved spec; `spec.improve.use` names the adapter.
 * @returns {string} `adapter:<use>` when an improve step is configured,
 *   otherwise `"improve not configured"`.
 */
function formatImproveSummary(spec) {
    if (!spec.improve) {
        return "improve not configured";
    }
    return `adapter:${spec.improve.use}`;
}
|
|
251
357
|
function formatEngineRunSummary(spec) {
|
|
252
358
|
return `adapter:${spec.engineRun.use}`;
|
|
@@ -287,10 +393,10 @@ function protocolStepForExecution(execution, manifests) {
|
|
|
287
393
|
if (execution.purpose !== "improve") {
|
|
288
394
|
throw new Error(`Protocol execution step only supports improve executions, not ${execution.purpose}.`);
|
|
289
395
|
}
|
|
290
|
-
const operation = "
|
|
396
|
+
const operation = "candidate.improve";
|
|
291
397
|
const command = adapterProtocolCommandSpec(execution.adapter, operation, manifests);
|
|
292
398
|
return {
|
|
293
|
-
kind: "
|
|
399
|
+
kind: "improver",
|
|
294
400
|
label: execution.purpose,
|
|
295
401
|
operation,
|
|
296
402
|
executor: command.executor,
|
|
@@ -387,35 +493,32 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
387
493
|
const completed = args.jobs.filter((job) => job.status === "succeeded");
|
|
388
494
|
const failedJobCount = args.jobs.filter((job) => job.status === "failed").length;
|
|
389
495
|
const completedJobCount = args.jobs.filter((job) => job.status === "succeeded").length;
|
|
390
|
-
const
|
|
496
|
+
const candidateRevisions = completed
|
|
391
497
|
.filter((job) => workbenchExecutionPurpose(job) === "improve")
|
|
392
|
-
.map((job) =>
|
|
498
|
+
.map((job) => normalizeCandidateRevisionJobOutput(job.output))
|
|
393
499
|
.filter((output) => output !== null)
|
|
394
500
|
.sort((left, right) => left.attemptIndex - right.attemptIndex);
|
|
395
501
|
const evaluationJobs = args.jobs.filter((job) => workbenchExecutionPurpose(job) === "attempt");
|
|
396
|
-
const
|
|
502
|
+
const evaluationsByCandidate = new Map();
|
|
397
503
|
for (const job of evaluationJobs) {
|
|
398
|
-
const
|
|
399
|
-
readJobString(job.input, "
|
|
400
|
-
job.
|
|
401
|
-
if (
|
|
402
|
-
|
|
403
|
-
...(
|
|
504
|
+
const candidateId = readJobString(job.output, "candidateId") ??
|
|
505
|
+
readJobString(job.input, "candidateId") ??
|
|
506
|
+
job.candidateId;
|
|
507
|
+
if (candidateId) {
|
|
508
|
+
evaluationsByCandidate.set(candidateId, [
|
|
509
|
+
...(evaluationsByCandidate.get(candidateId) ?? []),
|
|
404
510
|
job,
|
|
405
511
|
]);
|
|
406
512
|
}
|
|
407
513
|
}
|
|
408
|
-
const
|
|
409
|
-
const
|
|
514
|
+
const candidates = [];
|
|
515
|
+
const candidateFiles = {};
|
|
410
516
|
const evaluations = [];
|
|
411
|
-
for (const
|
|
412
|
-
const
|
|
413
|
-
const
|
|
414
|
-
const succeededEvaluationJobs =
|
|
415
|
-
const outputs = normalizeEvaluationSampleOutputs(
|
|
416
|
-
jobs: succeededEvaluationJobs,
|
|
417
|
-
allJobs: completed,
|
|
418
|
-
})
|
|
517
|
+
for (const candidateRevision of candidateRevisions) {
|
|
518
|
+
const candidateId = candidateRevision.candidateId;
|
|
519
|
+
const candidateJobs = evaluationsByCandidate.get(candidateId) ?? [];
|
|
520
|
+
const succeededEvaluationJobs = candidateJobs.filter((job) => job.status === "succeeded");
|
|
521
|
+
const outputs = normalizeEvaluationSampleOutputs(succeededEvaluationJobs)
|
|
419
522
|
.sort((left, right) => compareSampleOutputs(left.output, right.output));
|
|
420
523
|
const outputJobIds = new Set(outputs.flatMap(({ jobs }) => jobs.map((job) => job.id)));
|
|
421
524
|
const completedSampleKeys = new Set(outputs
|
|
@@ -425,39 +528,38 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
425
528
|
])
|
|
426
529
|
.filter((key) => key !== null));
|
|
427
530
|
const errorSampleJobs = [
|
|
428
|
-
...
|
|
531
|
+
...candidateJobs.filter((job) => job.status === "failed"),
|
|
429
532
|
...succeededEvaluationJobs.filter((job) => !outputJobIds.has(job.id)),
|
|
430
533
|
];
|
|
431
|
-
const errorSamples = errorEvaluationSamplesFromJobs(errorSampleJobs,
|
|
534
|
+
const errorSamples = errorEvaluationSamplesFromJobs(errorSampleJobs, candidateId, candidateRevision.attemptIndex, completedSampleKeys);
|
|
432
535
|
const samples = [
|
|
433
536
|
...outputs.map(({ jobs, output }) => withJobUsage(output.sample, completed, jobs[0])),
|
|
434
537
|
...errorSamples,
|
|
435
538
|
].sort((left, right) => left.index - right.index || left.id.localeCompare(right.id));
|
|
436
|
-
const
|
|
437
|
-
const evalRecord = createEvaluationRecord(
|
|
539
|
+
const candidateName = normalizedCandidateDisplayName(args.spec.candidate.name);
|
|
540
|
+
const evalRecord = createEvaluationRecord(candidateId, candidateName, samples);
|
|
438
541
|
const usage = mergeUsageSummaries([
|
|
439
|
-
|
|
542
|
+
candidateRevision.usage,
|
|
440
543
|
...samples.map((sample) => sample.usage),
|
|
441
544
|
]);
|
|
442
|
-
const
|
|
443
|
-
const attemptIndex = subjectRevision.attemptIndex;
|
|
545
|
+
const attemptIndex = candidateRevision.attemptIndex;
|
|
444
546
|
const evaluationTraces = [
|
|
445
547
|
...outputs.flatMap(({ output }) => output.traces),
|
|
446
548
|
...errorSampleJobs.flatMap(jobTracePaths),
|
|
447
549
|
].sort();
|
|
448
|
-
const baseId =
|
|
449
|
-
?
|
|
550
|
+
const baseId = candidateRevision.baseId && candidateRevision.baseId !== candidateId
|
|
551
|
+
? candidateRevision.baseId
|
|
450
552
|
: null;
|
|
451
|
-
const sourceMeta =
|
|
553
|
+
const sourceMeta = candidateSourceMetadata(args.candidateSourceFiles);
|
|
452
554
|
const benchmarkMeta = benchmarkSourceMetadata(args.benchmarkSourceFiles);
|
|
453
555
|
const meta = {
|
|
454
556
|
attemptIndex,
|
|
455
557
|
sampleCount: evalRecord.sampleCount,
|
|
456
|
-
|
|
558
|
+
improver: formatImproveSummary(args.spec),
|
|
457
559
|
engineRun: formatEngineRunSummary(args.spec),
|
|
458
560
|
strategy: "greedy",
|
|
459
561
|
traces: {
|
|
460
|
-
improve:
|
|
562
|
+
improve: candidateRevision.traces,
|
|
461
563
|
evaluations: evaluationTraces,
|
|
462
564
|
},
|
|
463
565
|
};
|
|
@@ -467,52 +569,124 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
467
569
|
if (benchmarkMeta) {
|
|
468
570
|
meta.benchmark = benchmarkMeta;
|
|
469
571
|
}
|
|
470
|
-
const record = {
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
572
|
+
const record = preserveExistingCandidateIdentity({
|
|
573
|
+
candidate: {
|
|
574
|
+
id: candidateId,
|
|
575
|
+
...(candidateName ? { name: candidateName } : {}),
|
|
576
|
+
version: args.existingCandidateCount + candidates.length + 1,
|
|
577
|
+
ordinal: args.existingCandidateCount + candidates.length + 1,
|
|
578
|
+
benchmarkFingerprint: args.benchmarkFingerprint,
|
|
579
|
+
candidateFingerprint: args.candidateFingerprint ?? materializedCandidateFingerprint(args.spec, candidateRevision.files),
|
|
580
|
+
createdAt: args.startedAt,
|
|
581
|
+
...(baseId ? { baseId } : {}),
|
|
582
|
+
referenceIds: [],
|
|
583
|
+
status: evalRecord.completedSampleCount > 0 ? "evaluated" : "eval_error",
|
|
584
|
+
fileChanges: candidateRevision.fileChanges,
|
|
585
|
+
...(usage ? { usage } : {}),
|
|
586
|
+
eval: evalRecord,
|
|
587
|
+
...(candidateRevision.prompt ? { prompt: candidateRevision.prompt } : {}),
|
|
588
|
+
meta,
|
|
589
|
+
},
|
|
590
|
+
previousCandidate: args.previousCandidate ?? null,
|
|
591
|
+
});
|
|
592
|
+
candidates.push(record);
|
|
488
593
|
evaluations.push(createEvaluationScorecard({
|
|
489
594
|
runId: args.runId,
|
|
490
595
|
benchmarkFingerprint: args.benchmarkFingerprint,
|
|
491
596
|
createdAt: args.startedAt,
|
|
492
|
-
|
|
597
|
+
candidate: record,
|
|
598
|
+
candidateRunId: args.spec.candidate.selectedRunId,
|
|
599
|
+
candidateRunName: args.spec.candidate.selectedRunName,
|
|
493
600
|
evaluation: evalRecord,
|
|
601
|
+
...(args.selection
|
|
602
|
+
? {
|
|
603
|
+
selection: {
|
|
604
|
+
metric: args.selection.metric,
|
|
605
|
+
caseIds: args.selection.caseIds,
|
|
606
|
+
...(args.selection.label ? { label: args.selection.label } : {}),
|
|
607
|
+
},
|
|
608
|
+
}
|
|
609
|
+
: {}),
|
|
494
610
|
}));
|
|
495
|
-
|
|
496
|
-
|
|
611
|
+
candidateFiles[candidateId] = materializedCandidateFiles({
|
|
612
|
+
candidateRevisionFiles: candidateRevision.files,
|
|
497
613
|
});
|
|
498
614
|
}
|
|
499
|
-
const
|
|
500
|
-
|
|
501
|
-
|
|
615
|
+
const selectedCandidate = selectCandidate({
|
|
616
|
+
candidates,
|
|
617
|
+
previousCandidate: args.previousCandidate ?? null,
|
|
618
|
+
selection: args.selection,
|
|
502
619
|
});
|
|
503
620
|
return {
|
|
504
|
-
|
|
505
|
-
|
|
621
|
+
candidates,
|
|
622
|
+
candidateFiles,
|
|
506
623
|
evaluations,
|
|
507
|
-
|
|
508
|
-
|
|
624
|
+
activeCandidateId: selectedCandidate?.id ?? args.previousCandidate?.id ?? null,
|
|
625
|
+
selectedCandidate,
|
|
509
626
|
completedJobCount,
|
|
510
627
|
failedJobCount,
|
|
511
628
|
};
|
|
512
629
|
}
|
|
513
|
-
function
|
|
630
|
+
/**
 * Carries identity fields over from a previously stored candidate when the
 * freshly built candidate has the same id.
 *
 * When there is no previous candidate, or its id differs, the fresh candidate
 * is returned untouched. Otherwise the result is the fresh candidate with:
 *  - `version`, `ordinal` and `createdAt` taken from the previous candidate;
 *  - `name`, `baseId` and `prompt` falling back to the previous values when
 *    the fresh candidate has none;
 *  - `referenceIds` copied from the previous candidate when it has any;
 *  - `fileChanges` copied from the previous candidate when the fresh list is
 *    empty;
 *  - `meta` merged through `mergeExistingCandidateMeta`.
 *
 * @param {{candidate: object, previousCandidate: (object|null|undefined)}} args
 *   Fresh candidate record and the previously stored one, if any.
 * @returns {object} The fresh candidate, or a new record with the preserved fields.
 */
function preserveExistingCandidateIdentity(args) {
    const { candidate, previousCandidate: previous } = args;
    if (!previous || previous.id !== candidate.id) {
        return candidate;
    }
    const baseId = candidate.baseId ?? previous.baseId;
    const prompt = candidate.prompt ?? previous.prompt;
    const meta = mergeExistingCandidateMeta(previous.meta, candidate.meta);
    const name = candidate.name ?? previous.name;
    return {
        ...candidate,
        version: previous.version,
        // Keep the previous candidate's own ordinal. Overwriting it with the
        // version would renumber a record whose two values differ; the version
        // remains the fallback for records stored without an ordinal.
        ordinal: previous.ordinal ?? previous.version,
        createdAt: previous.createdAt,
        ...(name ? { name } : {}),
        ...(baseId ? { baseId } : {}),
        referenceIds: previous.referenceIds.length > 0
            ? [...previous.referenceIds]
            : candidate.referenceIds,
        fileChanges: candidate.fileChanges.length > 0
            ? candidate.fileChanges
            : [...previous.fileChanges],
        ...(prompt ? { prompt } : {}),
        ...(meta ? { meta } : {}),
    };
}
|
|
657
|
+
/**
 * Merges the metadata of a previously stored candidate with freshly built
 * metadata, letting the fresh values win.
 *
 * - If the previous metadata is not a record, the fresh metadata is returned
 *   as given; if the fresh metadata is not a record, the previous one is.
 * - Otherwise the two records are shallow-merged (fresh over previous).
 * - When both carry a `traces` record, those are shallow-merged as well, and
 *   the previous `improve` traces are kept whenever the fresh `improve` entry
 *   is not a non-empty array and the previous one is defined.
 *
 * @param {*} previousMeta Metadata stored on the previous candidate.
 * @param {*} candidateMeta Metadata built for the fresh candidate.
 * @returns {*} Merged metadata record, or one of the inputs unchanged.
 */
function mergeExistingCandidateMeta(previousMeta, candidateMeta) {
    const previousRecord = jsonRecord(previousMeta);
    const freshRecord = jsonRecord(candidateMeta);
    if (!previousRecord) {
        return candidateMeta;
    }
    if (!freshRecord) {
        return previousMeta;
    }
    const previousTraces = jsonRecord(previousRecord.traces);
    const freshTraces = jsonRecord(freshRecord.traces);
    const mergedMeta = { ...previousRecord, ...freshRecord };
    if (!previousTraces || !freshTraces) {
        return mergedMeta;
    }
    const freshHasImproveTraces = Array.isArray(freshTraces.improve) && freshTraces.improve.length > 0;
    const keepPreviousImprove = !freshHasImproveTraces && previousTraces.improve !== undefined;
    mergedMeta.traces = {
        ...previousTraces,
        ...freshTraces,
        // An empty or missing fresh improve list must not erase earlier traces.
        ...(keepPreviousImprove ? { improve: previousTraces.improve } : {}),
    };
    return mergedMeta;
}
|
|
687
|
+
function candidateSourceMetadata(files) {
|
|
514
688
|
const sourceFiles = (files ?? [])
|
|
515
|
-
.filter((file) => /^
|
|
689
|
+
.filter((file) => /^candidates\/[^/]+\/candidate\.ya?ml$/iu.test(file.path))
|
|
516
690
|
.sort((left, right) => left.path.localeCompare(right.path))
|
|
517
691
|
.map((file) => ({
|
|
518
692
|
path: file.path,
|
|
@@ -536,14 +710,13 @@ function benchmarkSourceMetadata(files) {
|
|
|
536
710
|
}));
|
|
537
711
|
return sourceFiles.length > 0 ? { files: sourceFiles } : null;
|
|
538
712
|
}
|
|
539
|
-
function
|
|
713
|
+
function materializedCandidateFingerprint(spec, files) {
|
|
540
714
|
const hash = createHash("sha256");
|
|
541
|
-
hash.update("workbench-
|
|
542
|
-
hash.update("materialized\
|
|
543
|
-
hash.update(JSON.stringify(spec.run));
|
|
715
|
+
hash.update("workbench-candidate-v1\0");
|
|
716
|
+
hash.update("materialized\0");
|
|
544
717
|
hash.update("prepare");
|
|
545
|
-
hash.update(JSON.stringify(spec.
|
|
546
|
-
for (const file of
|
|
718
|
+
hash.update(JSON.stringify(spec.candidate.prepare ?? null));
|
|
719
|
+
for (const file of filterCandidateSourceFiles(files).slice().sort((left, right) => left.path.localeCompare(right.path))) {
|
|
547
720
|
hash.update("\0file\0");
|
|
548
721
|
hash.update(file.path);
|
|
549
722
|
hash.update("\0");
|
|
@@ -555,22 +728,28 @@ function materializedSubjectFingerprint(spec, files) {
|
|
|
555
728
|
}
|
|
556
729
|
return hash.digest("hex");
|
|
557
730
|
}
|
|
558
|
-
function
|
|
731
|
+
/**
 * Builds the file list stored for a materialized candidate.
 *
 * The revision's files are passed through `filterCandidateSourceFiles`,
 * shallow-copied, de-duplicated by path (a later entry replaces an earlier
 * one with the same path) and returned sorted by path.
 *
 * @param {{candidateRevisionFiles: Array<object>}} args Files produced by the
 *   candidate revision.
 * @returns {Array<object>} New array of copied file records ordered by `path`.
 */
function materializedCandidateFiles(args) {
    const sourceFiles = filterCandidateSourceFiles(args.candidateRevisionFiles);
    // Map construction keeps the last entry seen for each path.
    const latestByPath = new Map(sourceFiles.map((file) => [file.path, { ...file }]));
    const uniqueFiles = Array.from(latestByPath.values());
    uniqueFiles.sort((left, right) => left.path.localeCompare(right.path));
    return uniqueFiles;
}
|
|
565
738
|
function createEvaluationScorecard(args) {
|
|
566
739
|
const evaluation = args.evaluation;
|
|
740
|
+
const selectionScore = args.selection
|
|
741
|
+
? readEvaluationSelectionStats(evaluation, args.selection.metric, args.selection.caseIds)
|
|
742
|
+
: null;
|
|
567
743
|
return {
|
|
568
|
-
id: evaluationScorecardId(args.runId, args.
|
|
744
|
+
id: evaluationScorecardId(args.runId, args.candidate.id),
|
|
569
745
|
runId: args.runId,
|
|
570
746
|
benchmarkFingerprint: args.benchmarkFingerprint,
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
...(args.
|
|
747
|
+
candidateFingerprint: args.candidate.candidateFingerprint,
|
|
748
|
+
candidateId: args.candidate.id,
|
|
749
|
+
...(args.candidate.name ? { candidateName: args.candidate.name } : {}),
|
|
750
|
+
candidateVersion: args.candidate.version,
|
|
751
|
+
...(args.candidateRunId ? { candidateRunId: args.candidateRunId } : {}),
|
|
752
|
+
...(args.candidateRunName ? { candidateRunName: args.candidateRunName } : {}),
|
|
574
753
|
createdAt: args.createdAt,
|
|
575
754
|
updatedAt: evaluation.finishedAt ?? args.createdAt,
|
|
576
755
|
status: evaluation.status,
|
|
@@ -578,16 +757,19 @@ function createEvaluationScorecard(args) {
|
|
|
578
757
|
completedSampleCount: evaluation.completedSampleCount,
|
|
579
758
|
errorSampleCount: evaluation.errorSampleCount,
|
|
580
759
|
...(evaluation.metrics ? { metrics: evaluation.metrics } : {}),
|
|
760
|
+
...(args.selection ? { selectionMetric: args.selection.metric } : {}),
|
|
761
|
+
...(args.selection ? { selectionLabel: args.selection.label ?? `${args.selection.metric} on selected cases` } : {}),
|
|
762
|
+
...(selectionScore ? { selectionScore } : {}),
|
|
581
763
|
...(evaluation.durationMs ? { durationMs: evaluation.durationMs } : {}),
|
|
582
764
|
...(evaluation.usage ? { usage: evaluation.usage } : {}),
|
|
583
765
|
...(evaluation.error ? { error: evaluation.error } : {}),
|
|
584
766
|
evaluation,
|
|
585
767
|
};
|
|
586
768
|
}
|
|
587
|
-
export function evaluationScorecardId(runId,
|
|
769
|
+
export function evaluationScorecardId(runId, candidateId) {
|
|
588
770
|
const runPart = runId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
|
|
589
|
-
const
|
|
590
|
-
return `eval_${runPart}_${
|
|
771
|
+
const candidatePart = candidateId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
|
|
772
|
+
return `eval_${runPart}_${candidatePart}`;
|
|
591
773
|
}
|
|
592
774
|
export function selectExecutionOutputFilesForInspection(args) {
|
|
593
775
|
return args.files.filter((file) => !isWorkbenchInternalOutputPath(file.path));
|
|
@@ -602,56 +784,145 @@ export function isWorkbenchInternalOutputPath(filePath) {
|
|
|
602
784
|
normalized === "exit_code" ||
|
|
603
785
|
/^[a-z_-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
|
|
604
786
|
}
|
|
605
|
-
export function
|
|
787
|
+
export function createOptimizerTraceInputFiles(args) {
|
|
606
788
|
const files = [];
|
|
607
|
-
const
|
|
789
|
+
const executions = [];
|
|
608
790
|
const jobs = args.jobs
|
|
609
|
-
.filter(
|
|
791
|
+
.filter(isOptimizerTraceInputJob)
|
|
610
792
|
.sort(compareTraceInputJobs);
|
|
611
|
-
|
|
793
|
+
jobs.forEach((job, index) => {
|
|
794
|
+
const sequence = String(index + 1).padStart(6, "0");
|
|
795
|
+
const executionPath = `executions/${sequence}`;
|
|
796
|
+
const operation = "engine.run";
|
|
612
797
|
const jobFiles = completedJobOutputFiles(job);
|
|
613
|
-
const
|
|
614
|
-
|
|
615
|
-
const
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
798
|
+
const requestFile = traceInputRequestFile(jobFiles, operation);
|
|
799
|
+
const resultFile = traceInputResultFile(jobFiles, operation);
|
|
800
|
+
const requestPath = `${executionPath}/request.json`;
|
|
801
|
+
const resultPath = `${executionPath}/result.json`;
|
|
802
|
+
const filesPath = `${executionPath}/files`;
|
|
803
|
+
files.push(textSurfaceFile(requestPath, requestFile?.content ?? `${JSON.stringify(traceInputRequestFallback(job, operation), null, 2)}\n`));
|
|
804
|
+
files.push(textSurfaceFile(resultPath, resultFile?.content ?? `${JSON.stringify(traceInputResultFallback(job, operation), null, 2)}\n`));
|
|
805
|
+
files.push(...jobFiles.map((file) => ({
|
|
806
|
+
...file,
|
|
807
|
+
path: normalizeRelativePath(`${filesPath}/${file.path}`),
|
|
808
|
+
})));
|
|
809
|
+
executions.push({
|
|
810
|
+
path: executionPath,
|
|
811
|
+
operation,
|
|
812
|
+
status: job.status,
|
|
813
|
+
candidateId: job.candidateId ?? readJobString(job.input, "candidateId") ?? null,
|
|
814
|
+
runId: job.runId,
|
|
815
|
+
jobId: job.id,
|
|
816
|
+
attemptIndex: readOptionalJobNumber(job.input, "attemptIndex") ?? null,
|
|
817
|
+
sampleIndex: readOptionalJobNumber(job.input, "sampleIndex") ?? null,
|
|
818
|
+
caseId: readJobString(job.input, "caseId") ?? null,
|
|
819
|
+
requestPath,
|
|
820
|
+
resultPath,
|
|
821
|
+
filesPath,
|
|
631
822
|
});
|
|
632
|
-
}
|
|
633
|
-
files.push(textSurfaceFile("
|
|
634
|
-
|
|
635
|
-
|
|
823
|
+
});
|
|
824
|
+
files.push(textSurfaceFile("index.json", `${JSON.stringify({
|
|
825
|
+
schema: "workbench.optimizer-traces.v1",
|
|
826
|
+
executions,
|
|
636
827
|
}, null, 2)}\n`));
|
|
637
828
|
return dedupeSurfaceFiles(files);
|
|
638
829
|
}
|
|
639
|
-
export function
|
|
640
|
-
|
|
641
|
-
|
|
830
|
+
export function workbenchImproveOptimizeSelector(spec) {
|
|
831
|
+
return cloneWorkbenchCaseSelector(spec.candidate.improve?.optimizeOn ?? { all: true });
|
|
832
|
+
}
|
|
833
|
+
export function workbenchImproveSelectionPolicy(spec) {
|
|
834
|
+
const optimizeOn = workbenchImproveOptimizeSelector(spec);
|
|
835
|
+
const selectBy = spec.candidate.improve?.selectBy;
|
|
836
|
+
return {
|
|
837
|
+
metric: selectBy?.metric ?? "score",
|
|
838
|
+
selector: cloneWorkbenchCaseSelector(selectBy?.cases ?? optimizeOn),
|
|
839
|
+
};
|
|
840
|
+
}
|
|
841
|
+
export function workbenchEngineCaseIdsForSelector(engineCases, selector) {
|
|
842
|
+
return engineCases
|
|
843
|
+
.filter((engineCase) => workbenchEngineCaseMatchesSelector(engineCase, selector))
|
|
844
|
+
.map((engineCase) => engineCase.id);
|
|
845
|
+
}
|
|
846
|
+
export function workbenchEngineCaseIdsForImproveEvaluation(args) {
|
|
847
|
+
const optimizeIds = new Set(workbenchEngineCaseIdsForSelector(args.engineCases, workbenchImproveOptimizeSelector(args.spec)));
|
|
848
|
+
const selectionIds = new Set(workbenchEngineCaseIdsForSelector(args.engineCases, workbenchImproveSelectionPolicy(args.spec).selector));
|
|
849
|
+
return args.engineCases
|
|
850
|
+
.map((engineCase) => engineCase.id)
|
|
851
|
+
.filter((caseId) => optimizeIds.has(caseId) || selectionIds.has(caseId));
|
|
852
|
+
}
|
|
853
|
+
export function filterOptimizerTraceJobsForCaseIds(jobs, caseIds) {
|
|
854
|
+
const allowed = new Set(caseIds);
|
|
855
|
+
if (allowed.size === 0) {
|
|
642
856
|
return [];
|
|
643
857
|
}
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
858
|
+
return jobs.filter((job) => {
|
|
859
|
+
if (workbenchExecutionPurpose(job) !== "attempt") {
|
|
860
|
+
return false;
|
|
861
|
+
}
|
|
862
|
+
const caseId = readJobString(job.input, "caseId");
|
|
863
|
+
return caseId !== null && allowed.has(caseId);
|
|
864
|
+
});
|
|
865
|
+
}
|
|
866
|
+
export function formatWorkbenchCaseSelector(selector) {
|
|
867
|
+
return workbenchCaseSelectorUsesAllCases(selector)
|
|
868
|
+
? "all cases"
|
|
869
|
+
: `split=${selector.split}`;
|
|
870
|
+
}
|
|
871
|
+
export function formatWorkbenchSelectionPolicy(policy) {
|
|
872
|
+
return `${policy.metric} on ${formatWorkbenchCaseSelector(policy.selector)}`;
|
|
873
|
+
}
|
|
874
|
+
export function workbenchCaseSelectorUsesAllCases(selector) {
|
|
875
|
+
return !selector.split;
|
|
876
|
+
}
|
|
877
|
+
function workbenchEngineCaseMatchesSelector(engineCase, selector) {
|
|
878
|
+
if (workbenchCaseSelectorUsesAllCases(selector)) {
|
|
879
|
+
return true;
|
|
880
|
+
}
|
|
881
|
+
return engineCase.case.split === selector.split;
|
|
882
|
+
}
|
|
883
|
+
function cloneWorkbenchCaseSelector(selector) {
|
|
884
|
+
return selector.split ? { split: selector.split } : { all: true };
|
|
885
|
+
}
|
|
886
|
+
export function evaluationMeanMetrics(evaluation) {
|
|
887
|
+
const entries = Object.entries(evaluation?.metrics ?? {})
|
|
888
|
+
.filter((entry) => Number.isFinite(entry[1].mean));
|
|
889
|
+
return entries.length > 0
|
|
890
|
+
? Object.fromEntries(entries.map(([key, stats]) => [key, stats.mean]))
|
|
891
|
+
: undefined;
|
|
892
|
+
}
|
|
893
|
+
export function candidateRecordWithoutDerivedFields(candidate) {
|
|
894
|
+
const { metrics: _metrics, candidateRunId: _candidateRunId, candidateRunName: _candidateRunName, ...record } = candidate;
|
|
895
|
+
return record;
|
|
896
|
+
}
|
|
897
|
+
export function candidateSummaryFromRecord(candidate) {
|
|
898
|
+
const { eval: _eval, prompt: _prompt, meta: _meta, ...summary } = candidateRecordWithoutDerivedFields(candidate);
|
|
899
|
+
return summary;
|
|
900
|
+
}
|
|
901
|
+
export function workbenchRunExecutionFingerprint(args) {
|
|
902
|
+
const hash = createHash("sha256");
|
|
903
|
+
hash.update("workbench-run-execution-v1\0");
|
|
904
|
+
hash.update(args.specVersionId ?? "");
|
|
905
|
+
hash.update("\0");
|
|
906
|
+
hash.update(args.environmentVersionId ?? "");
|
|
907
|
+
hash.update("\0");
|
|
908
|
+
hash.update(args.sourceYaml ?? "");
|
|
909
|
+
for (const file of (args.adapterFiles ?? []).slice().sort((left, right) => left.path.localeCompare(right.path))) {
|
|
910
|
+
hash.update("\0file\0");
|
|
911
|
+
hash.update(file.path);
|
|
912
|
+
hash.update("\0");
|
|
913
|
+
hash.update(file.kind);
|
|
914
|
+
hash.update("\0");
|
|
915
|
+
hash.update(file.encoding);
|
|
916
|
+
hash.update("\0");
|
|
917
|
+
hash.update(file.executable ? "1" : "0");
|
|
918
|
+
hash.update("\0");
|
|
919
|
+
hash.update(file.content);
|
|
920
|
+
}
|
|
921
|
+
return hash.digest("hex");
|
|
922
|
+
}
|
|
923
|
+
function isOptimizerTraceInputJob(job) {
|
|
924
|
+
return isTerminalExecutionJob(job) &&
|
|
925
|
+
workbenchExecutionPurpose(job) === "attempt";
|
|
655
926
|
}
|
|
656
927
|
function isTerminalExecutionJob(job) {
|
|
657
928
|
return job.kind === "execute" && (job.status === "succeeded" ||
|
|
@@ -662,20 +933,10 @@ function compareTraceInputJobs(left, right) {
|
|
|
662
933
|
const leftAttempt = readOptionalJobNumber(left.input, "attemptIndex") ?? -1;
|
|
663
934
|
const rightAttempt = readOptionalJobNumber(right.input, "attemptIndex") ?? -1;
|
|
664
935
|
return leftAttempt - rightAttempt ||
|
|
665
|
-
purposeSortKey(workbenchExecutionPurpose(left)) - purposeSortKey(workbenchExecutionPurpose(right)) ||
|
|
666
936
|
(readOptionalJobNumber(left.input, "sampleIndex") ?? -1) - (readOptionalJobNumber(right.input, "sampleIndex") ?? -1) ||
|
|
667
937
|
(readJobString(left.input, "caseId") ?? "").localeCompare(readJobString(right.input, "caseId") ?? "") ||
|
|
668
938
|
left.id.localeCompare(right.id);
|
|
669
939
|
}
|
|
670
|
-
function purposeSortKey(purpose) {
|
|
671
|
-
if (purpose === "improve") {
|
|
672
|
-
return 0;
|
|
673
|
-
}
|
|
674
|
-
if (purpose === "attempt") {
|
|
675
|
-
return 1;
|
|
676
|
-
}
|
|
677
|
-
return 3;
|
|
678
|
-
}
|
|
679
940
|
function completedJobOutputFiles(job) {
|
|
680
941
|
const output = jsonRecord(job.output);
|
|
681
942
|
if (!Array.isArray(output.files)) {
|
|
@@ -689,35 +950,70 @@ function completedJobOutputFiles(job) {
|
|
|
689
950
|
}
|
|
690
951
|
return files;
|
|
691
952
|
}
|
|
692
|
-
function
|
|
693
|
-
|
|
953
|
+
function traceInputRequestFile(files, operation) {
|
|
954
|
+
return files.find((file) => {
|
|
955
|
+
const normalized = normalizeRelativePath(file.path);
|
|
956
|
+
return normalized.startsWith(".workbench/traces/") &&
|
|
957
|
+
normalized.endsWith("/request.json") &&
|
|
958
|
+
file.encoding === "utf8" &&
|
|
959
|
+
traceJsonOperation(file) === operation;
|
|
960
|
+
}) ?? null;
|
|
961
|
+
}
|
|
962
|
+
function traceInputResultFile(files, operation) {
|
|
963
|
+
return files.find((file) => {
|
|
964
|
+
const normalized = normalizeRelativePath(file.path);
|
|
965
|
+
return normalized.startsWith(".workbench/traces/") &&
|
|
966
|
+
normalized.endsWith("/result.json") &&
|
|
967
|
+
file.encoding === "utf8" &&
|
|
968
|
+
traceJsonOperation(file) === operation;
|
|
969
|
+
}) ?? null;
|
|
970
|
+
}
|
|
971
|
+
function traceJsonOperation(file) {
|
|
972
|
+
try {
|
|
973
|
+
const parsed = JSON.parse(file.content);
|
|
974
|
+
return typeof parsed?.operation === "string" ? parsed.operation : null;
|
|
975
|
+
}
|
|
976
|
+
catch {
|
|
977
|
+
return null;
|
|
978
|
+
}
|
|
979
|
+
}
|
|
980
|
+
function traceInputRequestFallback(job, operation) {
|
|
981
|
+
const execution = jsonRecord(jsonRecord(job.input).execution);
|
|
694
982
|
return {
|
|
695
|
-
|
|
696
|
-
|
|
697
|
-
|
|
698
|
-
|
|
699
|
-
|
|
700
|
-
|
|
701
|
-
|
|
702
|
-
|
|
703
|
-
|
|
704
|
-
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
|
|
708
|
-
|
|
709
|
-
|
|
983
|
+
protocol: "workbench.adapter.v3",
|
|
984
|
+
id: typeof execution.id === "string" ? execution.id : job.id,
|
|
985
|
+
jobId: job.id,
|
|
986
|
+
operation,
|
|
987
|
+
invocation: jsonRecord(execution.adapter),
|
|
988
|
+
context: {
|
|
989
|
+
candidate: {
|
|
990
|
+
id: job.candidateId ?? readJobString(job.input, "candidateId") ?? null,
|
|
991
|
+
},
|
|
992
|
+
attempt: {
|
|
993
|
+
attemptIndex: readOptionalJobNumber(job.input, "attemptIndex") ?? null,
|
|
994
|
+
sampleIndex: readOptionalJobNumber(job.input, "sampleIndex") ?? null,
|
|
995
|
+
caseId: readJobString(job.input, "caseId") ?? null,
|
|
996
|
+
},
|
|
997
|
+
},
|
|
710
998
|
};
|
|
711
999
|
}
|
|
712
|
-
function
|
|
713
|
-
const
|
|
714
|
-
const
|
|
715
|
-
const
|
|
1000
|
+
function traceInputResultFallback(job, operation) {
|
|
1001
|
+
const output = jsonRecord(job.output);
|
|
1002
|
+
const ok = job.status === "succeeded" && output.ok !== false;
|
|
1003
|
+
const value = operation === "candidate.improve"
|
|
1004
|
+
? jsonRecord(output.candidatePatch)
|
|
1005
|
+
: operation === "engine.run"
|
|
1006
|
+
? jsonRecord(output.result)
|
|
1007
|
+
: {};
|
|
716
1008
|
return {
|
|
717
|
-
|
|
718
|
-
|
|
719
|
-
|
|
720
|
-
|
|
1009
|
+
protocol: "workbench.adapter-result.v1",
|
|
1010
|
+
operation,
|
|
1011
|
+
ok,
|
|
1012
|
+
...(Object.keys(value).length > 0 ? { value: value } : {}),
|
|
1013
|
+
...(typeof output.summary === "string" ? { summary: output.summary } : {}),
|
|
1014
|
+
...(output.feedback !== undefined ? { feedback: output.feedback } : {}),
|
|
1015
|
+
...(output.usage !== undefined ? { usage: output.usage } : {}),
|
|
1016
|
+
...(!ok ? { error: job.error ?? "Execution did not complete successfully." } : {}),
|
|
721
1017
|
};
|
|
722
1018
|
}
|
|
723
1019
|
function textSurfaceFile(path, content) {
|
|
@@ -744,7 +1040,7 @@ export function buildWorkbenchProjectSourceFiles(input) {
|
|
|
744
1040
|
...(input.specFiles
|
|
745
1041
|
? input.specFiles.map((file) => ({ ...file }))
|
|
746
1042
|
: [textSurfaceFile("benchmark.yaml", input.specSource ?? "")]),
|
|
747
|
-
...prefixProjectSourceFiles(input.
|
|
1043
|
+
...prefixProjectSourceFiles(input.candidateFiles, input.candidateFilesPath),
|
|
748
1044
|
...prefixProjectSourceFiles(input.engineResolveFiles, input.engineResolveFilesPath),
|
|
749
1045
|
...(input.adapterFiles ?? []).map((file) => ({ ...file })),
|
|
750
1046
|
...(input.dockerfiles ?? []).map((file) => ({ ...file })),
|
|
@@ -772,18 +1068,18 @@ function prefixProjectSourceFiles(files, rootPath) {
|
|
|
772
1068
|
};
|
|
773
1069
|
});
|
|
774
1070
|
}
|
|
775
|
-
export function
|
|
1071
|
+
export function isCandidateSourceFilePath(filePath) {
|
|
776
1072
|
const normalized = normalizeRelativePath(filePath);
|
|
777
1073
|
return (normalized !== ".workbench" &&
|
|
778
1074
|
!normalized.startsWith(".workbench/") &&
|
|
779
1075
|
normalized !== "workbench-result.json");
|
|
780
1076
|
}
|
|
781
|
-
export function
|
|
1077
|
+
export function filterCandidateSourceFiles(files) {
|
|
782
1078
|
return files
|
|
783
|
-
.filter((file) =>
|
|
1079
|
+
.filter((file) => isCandidateSourceFilePath(file.path))
|
|
784
1080
|
.map((file) => ({ ...file }));
|
|
785
1081
|
}
|
|
786
|
-
export function
|
|
1082
|
+
export function buildCandidateLineage(args) {
|
|
787
1083
|
const orderedSummaries = args.summaries.slice().sort((left, right) => {
|
|
788
1084
|
const createdAt = left.createdAt.localeCompare(right.createdAt);
|
|
789
1085
|
return createdAt !== 0 ? createdAt : left.id.localeCompare(right.id);
|
|
@@ -856,7 +1152,7 @@ function globPatternToRegExp(pattern) {
|
|
|
856
1152
|
function escapeRegExp(value) {
|
|
857
1153
|
return value.replace(/[\\^$.*+?()[\]{}|]/gu, "\\$&");
|
|
858
1154
|
}
|
|
859
|
-
export function
|
|
1155
|
+
export function summarizeCandidateFiles(files, changedPaths = files.map((file) => file.path)) {
|
|
860
1156
|
const changed = new Set(changedPaths);
|
|
861
1157
|
return [...files]
|
|
862
1158
|
.sort((left, right) => left.path.localeCompare(right.path))
|
|
@@ -875,7 +1171,7 @@ export function summarizeSubjectFiles(files, changedPaths = files.map((file) =>
|
|
|
875
1171
|
};
|
|
876
1172
|
});
|
|
877
1173
|
}
|
|
878
|
-
export function
|
|
1174
|
+
export function createCandidateFilePreview(args) {
|
|
879
1175
|
if (args.view === "diff") {
|
|
880
1176
|
throw new Error("Diff previews require explicit before and after file content.");
|
|
881
1177
|
}
|
|
@@ -901,14 +1197,14 @@ export function createSubjectFilePreview(args) {
|
|
|
901
1197
|
export function createCaseReview(args) {
|
|
902
1198
|
const preferredSampleIndex = uniqueExecutionSampleIndex(args.executions ?? []);
|
|
903
1199
|
const sampleMatchesCase = (sample) => (sample.cases ?? []).some((entry) => entry.id === args.caseId);
|
|
904
|
-
const samples = args.
|
|
1200
|
+
const samples = args.candidate.eval?.samples ?? [];
|
|
905
1201
|
const sampleResult = samples.find((sample) => typeof preferredSampleIndex === "number" &&
|
|
906
1202
|
sample.index === preferredSampleIndex &&
|
|
907
1203
|
sampleMatchesCase(sample)) ?? samples.find(sampleMatchesCase);
|
|
908
1204
|
const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId);
|
|
909
1205
|
if (!sampleResult && (args.executions?.length ?? 0) > 0) {
|
|
910
1206
|
return {
|
|
911
|
-
|
|
1207
|
+
candidateId: args.candidate.id,
|
|
912
1208
|
caseId: args.caseId,
|
|
913
1209
|
caseLabel: args.caseId,
|
|
914
1210
|
...(typeof preferredSampleIndex === "number"
|
|
@@ -920,13 +1216,13 @@ export function createCaseReview(args) {
|
|
|
920
1216
|
};
|
|
921
1217
|
}
|
|
922
1218
|
if (!sampleResult) {
|
|
923
|
-
throw new Error(`Case ${args.caseId} was not found on
|
|
1219
|
+
throw new Error(`Case ${args.caseId} was not found on candidate ${args.candidate.id}.`);
|
|
924
1220
|
}
|
|
925
1221
|
const durationMs = typeof caseResult?.durationMs === "number"
|
|
926
1222
|
? caseResult.durationMs
|
|
927
1223
|
: undefined;
|
|
928
1224
|
return {
|
|
929
|
-
|
|
1225
|
+
candidateId: args.candidate.id,
|
|
930
1226
|
caseId: caseResult?.id ?? args.caseId,
|
|
931
1227
|
caseLabel: caseResult?.label ?? args.caseId,
|
|
932
1228
|
sampleId: sampleResult.id,
|
|
@@ -965,37 +1261,45 @@ function parseAuthoredWorkbenchSourceSpec(source) {
|
|
|
965
1261
|
}
|
|
966
1262
|
const resolved = resolveWorkbenchResolvedSourceYamlInternal(source);
|
|
967
1263
|
return {
|
|
968
|
-
version:
|
|
1264
|
+
version: 4,
|
|
969
1265
|
benchmark: {
|
|
970
1266
|
name: resolved.benchmark.name,
|
|
971
1267
|
description: resolved.benchmark.description,
|
|
972
1268
|
engine: authoredAdapterSpecFromInvocation(resolved.engine),
|
|
973
1269
|
},
|
|
974
|
-
|
|
975
|
-
name: resolved.
|
|
976
|
-
description: resolved.
|
|
977
|
-
files: { path: resolved.
|
|
978
|
-
...(resolved.
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
...(resolved.optimizer.description ? { description: resolved.optimizer.description } : {}),
|
|
986
|
-
edits: [...resolved.optimizer.edits],
|
|
987
|
-
improve: improveSpecFromInvocation(resolved.improve),
|
|
1270
|
+
candidate: {
|
|
1271
|
+
name: resolved.candidate.name,
|
|
1272
|
+
description: resolved.candidate.description,
|
|
1273
|
+
files: { path: resolved.candidate.files.path },
|
|
1274
|
+
...(resolved.candidate.prepare ? { prepare: { ...resolved.candidate.prepare } } : {}),
|
|
1275
|
+
defaultRun: resolved.candidate.defaultRun,
|
|
1276
|
+
runs: Object.fromEntries(Object.entries(resolved.candidate.runs).map(([runId, run]) => [
|
|
1277
|
+
runId,
|
|
1278
|
+
{
|
|
1279
|
+
name: run.name,
|
|
1280
|
+
...authoredAdapterSpecFromInvocation(run),
|
|
988
1281
|
},
|
|
989
|
-
|
|
990
|
-
|
|
1282
|
+
])),
|
|
1283
|
+
...(resolved.candidate.improve
|
|
1284
|
+
? {
|
|
1285
|
+
improve: {
|
|
1286
|
+
edits: [...resolved.candidate.improve.edits],
|
|
1287
|
+
...(resolved.candidate.improve.optimizeOn
|
|
1288
|
+
? { optimizeOn: resolved.candidate.improve.optimizeOn }
|
|
1289
|
+
: {}),
|
|
1290
|
+
...(resolved.candidate.improve.selectBy
|
|
1291
|
+
? { selectBy: resolved.candidate.improve.selectBy }
|
|
1292
|
+
: {}),
|
|
1293
|
+
...improveSpecFromInvocation(resolved.improve),
|
|
1294
|
+
},
|
|
1295
|
+
}
|
|
1296
|
+
: {}),
|
|
1297
|
+
},
|
|
991
1298
|
};
|
|
992
1299
|
}
|
|
993
1300
|
function improveSpecFromInvocation(invocation) {
|
|
994
1301
|
return authoredAdapterSpecFromInvocation(invocation);
|
|
995
1302
|
}
|
|
996
|
-
function runSpecFromInvocation(invocation) {
|
|
997
|
-
return authoredAdapterSpecFromInvocation(invocation);
|
|
998
|
-
}
|
|
999
1303
|
function authoredAdapterSpecFromInvocation(invocation) {
|
|
1000
1304
|
const config = jsonRecord(invocation.with);
|
|
1001
1305
|
return {
|
|
@@ -1048,9 +1352,9 @@ export function createWorkbenchRunWorkload(args) {
|
|
|
1048
1352
|
if (!purpose) {
|
|
1049
1353
|
throw new Error(`Unsupported runtime job kind: ${args.job.kind}`);
|
|
1050
1354
|
}
|
|
1051
|
-
const
|
|
1052
|
-
if (!
|
|
1053
|
-
throw new Error(`${purpose} execution job is missing
|
|
1355
|
+
const candidateId = readJobString(args.job.input, "candidateId") ?? args.job.candidateId;
|
|
1356
|
+
if (!candidateId) {
|
|
1357
|
+
throw new Error(`${purpose} execution job is missing candidateId.`);
|
|
1054
1358
|
}
|
|
1055
1359
|
const attemptIndex = readRequiredJobNumber(args.job.input, "attemptIndex", `${purpose} execution job`);
|
|
1056
1360
|
const sampleIndex = purpose === "improve"
|
|
@@ -1066,7 +1370,7 @@ export function createWorkbenchRunWorkload(args) {
|
|
|
1066
1370
|
? engineCaseFilesForRuntimeInput({ spec: args.spec, engineCase })
|
|
1067
1371
|
: [];
|
|
1068
1372
|
const engineCaseSpec = engineCase?.case;
|
|
1069
|
-
const initial =
|
|
1373
|
+
const initial = createInitialCandidateFiles({
|
|
1070
1374
|
baseFiles: args.baseFiles,
|
|
1071
1375
|
spec: args.spec,
|
|
1072
1376
|
attemptIndex,
|
|
@@ -1074,10 +1378,10 @@ export function createWorkbenchRunWorkload(args) {
|
|
|
1074
1378
|
return {
|
|
1075
1379
|
job: args.job,
|
|
1076
1380
|
spec: args.spec,
|
|
1077
|
-
|
|
1381
|
+
candidateId,
|
|
1078
1382
|
attemptIndex,
|
|
1079
1383
|
sampleIndex,
|
|
1080
|
-
|
|
1384
|
+
candidateFiles: initial.files,
|
|
1081
1385
|
caseId,
|
|
1082
1386
|
engineResolveFiles: selectedEngineResolveFiles,
|
|
1083
1387
|
traceFiles: (args.traceFiles ?? []).map((file) => ({ ...file })),
|
|
@@ -1088,22 +1392,22 @@ export function createWorkbenchRunWorkload(args) {
|
|
|
1088
1392
|
baseId: readJobString(args.job.input, "baseId"),
|
|
1089
1393
|
};
|
|
1090
1394
|
}
|
|
1091
|
-
function
|
|
1092
|
-
const editablePaths =
|
|
1395
|
+
function createInitialCandidateFiles(args) {
|
|
1396
|
+
const editablePaths = improveEdits(args.spec).map(normalizeRelativePath);
|
|
1093
1397
|
const editPath = editablePaths[0];
|
|
1094
|
-
const
|
|
1398
|
+
const candidatePaths = editPath ? [editPath] : [];
|
|
1095
1399
|
const files = args.baseFiles.length > 0
|
|
1096
1400
|
? args.baseFiles.map((file) => ({ ...file }))
|
|
1097
1401
|
: editPath
|
|
1098
1402
|
? normalizeSurfaceFiles([{ path: editPath, content: "" }])
|
|
1099
1403
|
: [];
|
|
1100
1404
|
const prompt = [
|
|
1101
|
-
`Run the
|
|
1102
|
-
`Attempt ${args.attemptIndex + 1} uses ${
|
|
1405
|
+
`Run the candidate workload for benchmark: ${args.spec.benchmark.description}`,
|
|
1406
|
+
`Attempt ${args.attemptIndex + 1} uses ${formatImproveSummary(args.spec)}; the improve adapter may edit the candidate before Workbench scores it.`,
|
|
1103
1407
|
].join("\n");
|
|
1104
1408
|
const byPath = new Map(files.map((file) => [file.path, file]));
|
|
1105
1409
|
if (editPath &&
|
|
1106
|
-
![...byPath.keys()].some((filePath) =>
|
|
1410
|
+
![...byPath.keys()].some((filePath) => candidatePaths.includes(filePath))) {
|
|
1107
1411
|
byPath.set(editPath, {
|
|
1108
1412
|
path: editPath,
|
|
1109
1413
|
kind: "text",
|
|
@@ -1167,7 +1471,7 @@ export function workbenchExecutionExecutorForRuntimeInput(args) {
|
|
|
1167
1471
|
}
|
|
1168
1472
|
function adapterOperationForExecutionPurpose(purpose) {
|
|
1169
1473
|
if (purpose === "improve") {
|
|
1170
|
-
return "
|
|
1474
|
+
return "candidate.improve";
|
|
1171
1475
|
}
|
|
1172
1476
|
if (purpose === "attempt") {
|
|
1173
1477
|
return "engine.run";
|
|
@@ -1281,8 +1585,8 @@ function normalizeRuntimeControlInputs(value) {
|
|
|
1281
1585
|
}
|
|
1282
1586
|
const record = value;
|
|
1283
1587
|
const inputs = {};
|
|
1284
|
-
if (hasOwn(record, "
|
|
1285
|
-
inputs.
|
|
1588
|
+
if (hasOwn(record, "candidate")) {
|
|
1589
|
+
inputs.candidate = normalizeRuntimeControlFiles(record.candidate, "inputs.candidate");
|
|
1286
1590
|
}
|
|
1287
1591
|
if (hasOwn(record, "case")) {
|
|
1288
1592
|
inputs.case = normalizeRuntimeControlFiles(record.case, "inputs.case");
|
|
@@ -1326,8 +1630,8 @@ function normalizeRuntimeControlOperation(value, label) {
|
|
|
1326
1630
|
const operation = record.operation;
|
|
1327
1631
|
if (operation !== "engine.resolve" &&
|
|
1328
1632
|
operation !== "engine.run" &&
|
|
1329
|
-
operation !== "
|
|
1330
|
-
operation !== "
|
|
1633
|
+
operation !== "candidate.run" &&
|
|
1634
|
+
operation !== "candidate.improve") {
|
|
1331
1635
|
throw new Error(`Workbench runtime-control ${label}.operation is invalid.`);
|
|
1332
1636
|
}
|
|
1333
1637
|
const invocation = record.invocation;
|
|
@@ -1415,7 +1719,7 @@ export async function executeAdapterInCurrentRuntime(args, execution, startedAt,
|
|
|
1415
1719
|
};
|
|
1416
1720
|
try {
|
|
1417
1721
|
if (execution.purpose === "improve") {
|
|
1418
|
-
return await
|
|
1722
|
+
return await executeCandidateRevisionExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
|
|
1419
1723
|
}
|
|
1420
1724
|
if (execution.purpose === "attempt") {
|
|
1421
1725
|
return await executeAttemptExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
|
|
@@ -1589,22 +1893,22 @@ function completedJobFromSandboxResult(fallbackJob, startedAt, result) {
|
|
|
1589
1893
|
}
|
|
1590
1894
|
return attachSandboxMetadataToJob(failWorkbenchRunJob(fallbackJob, result.startedAt || startedAt, result.error ?? `Sandbox execution ${result.status}.`, result.finishedAt), asRuntimeRecord(result.metadata).sandbox);
|
|
1591
1895
|
}
|
|
1592
|
-
async function
|
|
1896
|
+
async function executeCandidateRevisionExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
|
|
1593
1897
|
const { workload, result } = await runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher);
|
|
1594
1898
|
if (result.error || (result.exitCode ?? 0) !== 0) {
|
|
1595
1899
|
return failWorkbenchRunJob(args.job, startedAt, result.error ?? `Adapter ${execution.adapter.use} exited with status ${result.exitCode}.`, result.finishedAt, result);
|
|
1596
1900
|
}
|
|
1597
1901
|
const finishedAt = result.finishedAt ?? new Date().toISOString();
|
|
1598
|
-
const
|
|
1599
|
-
if (
|
|
1600
|
-
return failWorkbenchRunJob(args.job, startedAt, `${execution.adapter.use === "command" ? "Command improve adapter" : `Adapter ${execution.adapter.use}`} completed without changing
|
|
1601
|
-
}
|
|
1602
|
-
const
|
|
1603
|
-
baseFiles: workload.
|
|
1604
|
-
patch:
|
|
1605
|
-
edits:
|
|
1902
|
+
const candidatePatch = createCandidatePatchFromResult(result, args.spec);
|
|
1903
|
+
if (candidatePatch.fileChanges.length === 0) {
|
|
1904
|
+
return failWorkbenchRunJob(args.job, startedAt, `${execution.adapter.use === "command" ? "Command improve adapter" : `Adapter ${execution.adapter.use}`} completed without changing candidate files covered by improve edits.`, finishedAt, result);
|
|
1905
|
+
}
|
|
1906
|
+
const candidateRevisionFiles = applyWorkbenchCandidatePatch({
|
|
1907
|
+
baseFiles: workload.candidateFiles,
|
|
1908
|
+
patch: candidatePatch,
|
|
1909
|
+
edits: requireImproveEdits(args.spec),
|
|
1606
1910
|
});
|
|
1607
|
-
const usage = assignUsageRole("
|
|
1911
|
+
const usage = assignUsageRole("improver", result.usage);
|
|
1608
1912
|
return {
|
|
1609
1913
|
...args.job,
|
|
1610
1914
|
status: "succeeded",
|
|
@@ -1616,13 +1920,13 @@ async function executeSubjectRevisionExecutionInCurrentRuntime(args, execution,
|
|
|
1616
1920
|
ok: true,
|
|
1617
1921
|
executionId: execution.id,
|
|
1618
1922
|
purpose: execution.purpose,
|
|
1619
|
-
|
|
1923
|
+
candidateId: workload.candidateId,
|
|
1620
1924
|
attemptIndex: workload.attemptIndex,
|
|
1621
1925
|
baseId: workload.baseId,
|
|
1622
1926
|
prompt: workload.prompt,
|
|
1623
|
-
|
|
1624
|
-
fileChanges:
|
|
1625
|
-
files:
|
|
1927
|
+
candidatePatch,
|
|
1928
|
+
fileChanges: candidatePatch.fileChanges,
|
|
1929
|
+
files: candidateRevisionFiles,
|
|
1626
1930
|
traces: traceFilePaths(result.files),
|
|
1627
1931
|
...(usage ? { usage } : {}),
|
|
1628
1932
|
...(result.summary !== undefined ? { summary: result.summary } : {}),
|
|
@@ -1655,13 +1959,14 @@ async function executeAttemptExecutionInCurrentRuntime(args, execution, startedA
|
|
|
1655
1959
|
const finishedAt = workloadResult.finishedAt ?? new Date().toISOString();
|
|
1656
1960
|
const usage = attemptUsageSummary(workloadResult.usage, engineResult.usage);
|
|
1657
1961
|
const sample = evaluateSample({
|
|
1658
|
-
|
|
1962
|
+
candidateId: workload.candidateId,
|
|
1659
1963
|
files: workloadResult.files,
|
|
1660
1964
|
engineResolveFiles: workload.engineResolveFiles,
|
|
1661
1965
|
spec: workload.spec,
|
|
1662
1966
|
attemptIndex: workload.attemptIndex,
|
|
1663
1967
|
sampleIndex: workload.sampleIndex,
|
|
1664
1968
|
caseId: workload.caseId,
|
|
1969
|
+
split: workload.engineCaseSpec?.split,
|
|
1665
1970
|
startedAt,
|
|
1666
1971
|
finishedAt,
|
|
1667
1972
|
durationMs: workloadResult.durationMs,
|
|
@@ -1682,7 +1987,7 @@ async function executeAttemptExecutionInCurrentRuntime(args, execution, startedA
|
|
|
1682
1987
|
ok: true,
|
|
1683
1988
|
executionId: execution.id,
|
|
1684
1989
|
purpose: execution.purpose,
|
|
1685
|
-
|
|
1990
|
+
candidateId: workload.candidateId,
|
|
1686
1991
|
attemptIndex: workload.attemptIndex,
|
|
1687
1992
|
sampleIndex: workload.sampleIndex,
|
|
1688
1993
|
caseId: workload.caseId,
|
|
@@ -1725,7 +2030,7 @@ export async function executeRuntimeControlOperationSequenceInCurrentRuntime(arg
|
|
|
1725
2030
|
? { adapterAuthEnv: adapterAuth.env }
|
|
1726
2031
|
: {}),
|
|
1727
2032
|
}, workload, args.runtimeControlOperation.operations.map((operation, index) => runtimeControlStepForOperation(operation, index, args.adapterManifests)), startedAt, {
|
|
1728
|
-
|
|
2033
|
+
runCandidatePrepare: args.runtimeControlOperation.prepare ?? false,
|
|
1729
2034
|
workspaceFiles: args.runtimeControlOperation.inputs?.workspace ?? [],
|
|
1730
2035
|
outputFiles: args.runtimeControlOperation.inputs?.output ?? [],
|
|
1731
2036
|
collectWorkspace: args.runtimeControlOperation.collectWorkspace ?? false,
|
|
@@ -1823,7 +2128,7 @@ function createRuntimeControlSandboxInput(args, request) {
|
|
|
1823
2128
|
const parentInput = asRuntimeRecord(args.job.input);
|
|
1824
2129
|
const publicFiles = runtimeControlInputFiles(request.inputs, "case", parentWorkload.engineCase ? engineCasePublicFiles(parentWorkload.engineCase) : []);
|
|
1825
2130
|
const privateFiles = runtimeControlInputFiles(request.inputs, "enginePrivate", parentWorkload.engineCase ? engineCasePrivateFiles(parentWorkload.engineCase) : []);
|
|
1826
|
-
const
|
|
2131
|
+
const candidateFiles = runtimeControlInputFiles(request.inputs, "candidate", parentWorkload.candidateFiles);
|
|
1827
2132
|
const traceFiles = runtimeControlInputFiles(request.inputs, "traces", parentWorkload.traceFiles);
|
|
1828
2133
|
const adapter = request.operations[request.operations.length - 1]?.invocation;
|
|
1829
2134
|
const childExecution = {
|
|
@@ -1866,7 +2171,7 @@ function createRuntimeControlSandboxInput(args, request) {
|
|
|
1866
2171
|
const childArgs = {
|
|
1867
2172
|
...args,
|
|
1868
2173
|
job: childJob,
|
|
1869
|
-
baseFiles:
|
|
2174
|
+
baseFiles: candidateFiles,
|
|
1870
2175
|
engineResolveFiles: [...publicFiles, ...privateFiles],
|
|
1871
2176
|
engineCases: [engineCase],
|
|
1872
2177
|
traceFiles,
|
|
@@ -1890,10 +2195,10 @@ function runtimeControlStepForOperation(operation, index, manifests = []) {
|
|
|
1890
2195
|
...(operation.invocation.auth !== undefined ? { auth: operation.invocation.auth } : {}),
|
|
1891
2196
|
}, operation.operation, manifests).command;
|
|
1892
2197
|
return {
|
|
1893
|
-
kind: operation.operation === "
|
|
1894
|
-
? "
|
|
1895
|
-
: operation.operation === "
|
|
1896
|
-
? "
|
|
2198
|
+
kind: operation.operation === "candidate.run"
|
|
2199
|
+
? "candidate"
|
|
2200
|
+
: operation.operation === "candidate.improve"
|
|
2201
|
+
? "improver"
|
|
1897
2202
|
: "engine",
|
|
1898
2203
|
label: operation.label ?? `${operation.operation.replace(".", "_")}_${index + 1}`,
|
|
1899
2204
|
operation: operation.operation,
|
|
@@ -1960,8 +2265,8 @@ function isWorkbenchAdapterOperationResult(value) {
|
|
|
1960
2265
|
return record.protocol === "workbench.adapter-result.v1" &&
|
|
1961
2266
|
(record.operation === "engine.resolve" ||
|
|
1962
2267
|
record.operation === "engine.run" ||
|
|
1963
|
-
record.operation === "
|
|
1964
|
-
record.operation === "
|
|
2268
|
+
record.operation === "candidate.run" ||
|
|
2269
|
+
record.operation === "candidate.improve");
|
|
1965
2270
|
}
|
|
1966
2271
|
function cloneSurfaceFiles(files) {
|
|
1967
2272
|
return files.map((file) => ({ ...file, path: normalizeRelativePath(file.path) }));
|
|
@@ -2040,9 +2345,11 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
|
|
|
2040
2345
|
const stepTimeoutMs = environmentVersion
|
|
2041
2346
|
? environmentVersionTimeoutMs(environmentVersion)
|
|
2042
2347
|
: 5 * 60 * 1000;
|
|
2043
|
-
const
|
|
2044
|
-
|
|
2045
|
-
|
|
2348
|
+
const shouldRunCandidatePrepare = options.runCandidatePrepare ??
|
|
2349
|
+
(readWorkloadExecutionPurpose(workload) === "attempt" &&
|
|
2350
|
+
steps.some((step) => step.executor === "sandbox"));
|
|
2351
|
+
if (shouldRunCandidatePrepare) {
|
|
2352
|
+
await runCandidatePrepareCommand({
|
|
2046
2353
|
root: workspace.root,
|
|
2047
2354
|
workload,
|
|
2048
2355
|
execution,
|
|
@@ -2081,6 +2388,9 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
|
|
|
2081
2388
|
});
|
|
2082
2389
|
const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), step.operation);
|
|
2083
2390
|
assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${step.adapter?.use ?? execution.adapter.use} ${step.operation}`);
|
|
2391
|
+
await writeSurfaceFiles(outputDir(workspace.root), [
|
|
2392
|
+
textSurfaceFile(`.workbench/traces/${workload.job.id}/${step.label}/result.json`, `${JSON.stringify(operationResult, null, 2)}\n`),
|
|
2393
|
+
]);
|
|
2084
2394
|
operationResults.push(operationResult);
|
|
2085
2395
|
await publishCommandStepEvent(options.eventPublisher, {
|
|
2086
2396
|
step: step.label,
|
|
@@ -2132,19 +2442,19 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
|
|
|
2132
2442
|
await workspace.cleanup();
|
|
2133
2443
|
}
|
|
2134
2444
|
}
|
|
2135
|
-
async function
|
|
2136
|
-
const command = args.workload.spec.
|
|
2445
|
+
async function runCandidatePrepareCommand(args) {
|
|
2446
|
+
const command = args.workload.spec.candidate.prepare?.command;
|
|
2137
2447
|
if (!command) {
|
|
2138
2448
|
return;
|
|
2139
2449
|
}
|
|
2140
|
-
const role = args.execution.purpose === "improve" ? "
|
|
2450
|
+
const role = args.execution.purpose === "improve" ? "improver" : "runner";
|
|
2141
2451
|
await publishCommandStepEvent(args.eventPublisher, {
|
|
2142
|
-
step: "
|
|
2452
|
+
step: "candidate_prepare",
|
|
2143
2453
|
status: "started",
|
|
2144
2454
|
role,
|
|
2145
2455
|
});
|
|
2146
2456
|
try {
|
|
2147
|
-
const shellCommand = createHostedWorkloadShellCommand(args.root, command, "
|
|
2457
|
+
const shellCommand = createHostedWorkloadShellCommand(args.root, command, "candidate_prepare");
|
|
2148
2458
|
await args.execFileAsync("sh", ["-c", shellCommand], {
|
|
2149
2459
|
cwd: args.root,
|
|
2150
2460
|
env: createHostedWorkloadPrepareEnv(args.root),
|
|
@@ -2152,20 +2462,20 @@ async function runSubjectPrepareCommand(args) {
|
|
|
2152
2462
|
timeout: args.timeoutMs,
|
|
2153
2463
|
});
|
|
2154
2464
|
await publishCommandStepEvent(args.eventPublisher, {
|
|
2155
|
-
step: "
|
|
2465
|
+
step: "candidate_prepare",
|
|
2156
2466
|
status: "succeeded",
|
|
2157
2467
|
role,
|
|
2158
2468
|
});
|
|
2159
2469
|
}
|
|
2160
2470
|
catch (error) {
|
|
2161
2471
|
await publishCommandStepEvent(args.eventPublisher, {
|
|
2162
|
-
step: "
|
|
2472
|
+
step: "candidate_prepare",
|
|
2163
2473
|
status: "failed",
|
|
2164
2474
|
exitCode: readExitCode(error),
|
|
2165
2475
|
error: error instanceof Error ? error.message : String(error),
|
|
2166
2476
|
role,
|
|
2167
2477
|
});
|
|
2168
|
-
throw new Error(`
|
|
2478
|
+
throw new Error(`Candidate prepare command failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
2169
2479
|
}
|
|
2170
2480
|
}
|
|
2171
2481
|
async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
|
|
@@ -2204,10 +2514,10 @@ async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
|
|
|
2204
2514
|
};
|
|
2205
2515
|
}
|
|
2206
2516
|
function stepEventRole(step) {
|
|
2207
|
-
if (step.kind === "
|
|
2208
|
-
return "
|
|
2517
|
+
if (step.kind === "improver") {
|
|
2518
|
+
return "improver";
|
|
2209
2519
|
}
|
|
2210
|
-
if (step.kind === "
|
|
2520
|
+
if (step.kind === "candidate") {
|
|
2211
2521
|
return "runner";
|
|
2212
2522
|
}
|
|
2213
2523
|
if (step.kind === "engine") {
|
|
@@ -2219,10 +2529,10 @@ function adapterOperationUsageSummary(result) {
|
|
|
2219
2529
|
if (hasExplicitUsageRole(result.usage)) {
|
|
2220
2530
|
return completeUsageSummary(result.usage);
|
|
2221
2531
|
}
|
|
2222
|
-
if (result.operation === "
|
|
2223
|
-
return assignUsageRole("
|
|
2532
|
+
if (result.operation === "candidate.improve") {
|
|
2533
|
+
return assignUsageRole("improver", result.usage);
|
|
2224
2534
|
}
|
|
2225
|
-
if (result.operation === "
|
|
2535
|
+
if (result.operation === "candidate.run") {
|
|
2226
2536
|
return assignUsageRole("runner", result.usage);
|
|
2227
2537
|
}
|
|
2228
2538
|
if (result.operation === "engine.run") {
|
|
@@ -2239,16 +2549,16 @@ function attemptUsageSummary(workloadUsage, resultUsage) {
|
|
|
2239
2549
|
}
|
|
2240
2550
|
function hasExplicitUsageRole(usage) {
|
|
2241
2551
|
const normalized = completeUsageSummary(usage);
|
|
2242
|
-
return Boolean(normalized?.
|
|
2552
|
+
return Boolean(normalized?.improver || normalized?.runner || normalized?.engine);
|
|
2243
2553
|
}
|
|
2244
|
-
function
|
|
2245
|
-
if (result.
|
|
2246
|
-
return result.
|
|
2554
|
+
function createCandidatePatchFromResult(result, spec) {
|
|
2555
|
+
if (result.candidatePatch) {
|
|
2556
|
+
return result.candidatePatch;
|
|
2247
2557
|
}
|
|
2248
2558
|
const changedEditPaths = result.fileChanges
|
|
2249
2559
|
.map(normalizeRelativePath)
|
|
2250
2560
|
.filter((filePath) => !filePath.startsWith(".workbench/") &&
|
|
2251
|
-
|
|
2561
|
+
isCandidateEditPath(filePath, improveEdits(spec)));
|
|
2252
2562
|
const changedSet = new Set(changedEditPaths);
|
|
2253
2563
|
const files = result.files
|
|
2254
2564
|
.filter((file) => changedSet.has(normalizeRelativePath(file.path)))
|
|
@@ -2260,7 +2570,7 @@ function createSubjectPatchFromResult(result, spec) {
|
|
|
2260
2570
|
...(result.feedback !== undefined ? { feedback: result.feedback } : {}),
|
|
2261
2571
|
};
|
|
2262
2572
|
}
|
|
2263
|
-
function
|
|
2573
|
+
function isCandidateEditPath(filePath, edits) {
|
|
2264
2574
|
const normalized = normalizeRelativePath(filePath);
|
|
2265
2575
|
return edits.some((entry) => {
|
|
2266
2576
|
const editPath = normalizeRelativePath(entry).replace(/\/+$/u, "");
|
|
@@ -2320,21 +2630,33 @@ export async function stageWorkbenchRunWorkload(root, workload) {
|
|
|
2320
2630
|
]);
|
|
2321
2631
|
await fs.mkdir(inputDir(root), { recursive: true });
|
|
2322
2632
|
await fs.mkdir(outputDir(root), { recursive: true });
|
|
2633
|
+
await clearMutableWorkspaceFiles(root);
|
|
2323
2634
|
if (purpose === "attempt") {
|
|
2324
|
-
await fs.mkdir(
|
|
2635
|
+
await fs.mkdir(candidateDir(root), { recursive: true });
|
|
2325
2636
|
await fs.mkdir(caseDir(root), { recursive: true });
|
|
2326
2637
|
const engineCase = requireWorkloadEngineCase(workload, "Attempt staging");
|
|
2327
|
-
await writeSurfaceFiles(
|
|
2638
|
+
await writeSurfaceFiles(candidateDir(root), workload.candidateFiles);
|
|
2328
2639
|
await writeSurfaceFiles(caseDir(root), engineCasePublicFiles(engineCase));
|
|
2329
2640
|
return;
|
|
2330
2641
|
}
|
|
2331
2642
|
if (purpose === "improve") {
|
|
2332
|
-
await
|
|
2333
|
-
await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
|
|
2643
|
+
await writeSurfaceFiles(root, workload.candidateFiles.filter((file) => isMutableWorkspaceSnapshotPath(file.path)));
|
|
2334
2644
|
await fs.mkdir(tracesDir(root), { recursive: true });
|
|
2335
2645
|
await writeSurfaceFiles(tracesDir(root), workload.traceFiles);
|
|
2336
2646
|
}
|
|
2337
2647
|
}
|
|
2648
|
+
async function clearMutableWorkspaceFiles(root) {
|
|
2649
|
+
const fs = await importNodeModule(nodeBuiltin("fs/promises"));
|
|
2650
|
+
const path = await importNodeModule(nodeBuiltin("path"));
|
|
2651
|
+
const entries = await fs.readdir(root, { withFileTypes: true }).catch(() => []);
|
|
2652
|
+
await Promise.all(entries.map(async (entry) => {
|
|
2653
|
+
const relativePath = normalizeRelativePath(entry.name);
|
|
2654
|
+
if (!isMutableWorkspaceSnapshotPath(relativePath)) {
|
|
2655
|
+
return;
|
|
2656
|
+
}
|
|
2657
|
+
await fs.rm(path.join(root, entry.name), { recursive: true, force: true });
|
|
2658
|
+
}));
|
|
2659
|
+
}
|
|
2338
2660
|
async function stageWorkbenchEnginePrivateFiles(root, workload) {
|
|
2339
2661
|
if (readWorkloadExecutionPurpose(workload) !== "attempt") {
|
|
2340
2662
|
return;
|
|
@@ -2417,7 +2739,7 @@ function adapterFilePathWithinRoot(filePath, sourceRoot) {
|
|
|
2417
2739
|
}
|
|
2418
2740
|
async function readHostedRunFailureResult(root, workload, options) {
|
|
2419
2741
|
const traceFiles = await readRuntimeTraceFiles(root, workload);
|
|
2420
|
-
const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root)));
|
|
2742
|
+
const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root), { ignorePath: isWorkbenchInternalOutputPath }));
|
|
2421
2743
|
const startedAt = options.startedAt ?? new Date().toISOString();
|
|
2422
2744
|
const finishedAt = new Date().toISOString();
|
|
2423
2745
|
const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
|
|
@@ -2433,13 +2755,13 @@ async function readHostedRunFailureResult(root, workload, options) {
|
|
|
2433
2755
|
async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
|
|
2434
2756
|
const path = await importNodeModule(nodeBuiltin("path"));
|
|
2435
2757
|
const traceFiles = await readRuntimeTraceFiles(root, workload);
|
|
2436
|
-
const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root)));
|
|
2758
|
+
const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root), { ignorePath: isWorkbenchInternalOutputPath }));
|
|
2437
2759
|
const outputExitCode = await readOptionalNumber(path.join(outputDir(root), "exit_code"));
|
|
2438
2760
|
const startedAt = options.startedAt ?? new Date().toISOString();
|
|
2439
2761
|
const finishedAt = new Date().toISOString();
|
|
2440
2762
|
const purpose = readWorkloadExecutionPurpose(workload);
|
|
2441
2763
|
const primaryOperation = purpose === "improve"
|
|
2442
|
-
? "
|
|
2764
|
+
? "candidate.improve"
|
|
2443
2765
|
: "engine.run";
|
|
2444
2766
|
const primaryResult = [...(options.operationResults ?? [])]
|
|
2445
2767
|
.reverse()
|
|
@@ -2453,9 +2775,9 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
|
|
|
2453
2775
|
const cases = normalizeResultCases(resultPayload.cases);
|
|
2454
2776
|
const includeResultScoring = purpose === "attempt";
|
|
2455
2777
|
const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
|
|
2456
|
-
const
|
|
2778
|
+
const candidatePatch = purpose === "improve" ? primaryResult?.value : undefined;
|
|
2457
2779
|
const engineResult = purpose === "attempt" ? primaryResult?.value : undefined;
|
|
2458
|
-
const declaredChanges =
|
|
2780
|
+
const declaredChanges = candidatePatch?.fileChanges ??
|
|
2459
2781
|
(Array.isArray(resultPayload.fileChanges)
|
|
2460
2782
|
? resultPayload.fileChanges.filter((entry) => typeof entry === "string")
|
|
2461
2783
|
: files.map((file) => file.path));
|
|
@@ -2463,7 +2785,7 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
|
|
|
2463
2785
|
files,
|
|
2464
2786
|
fileChanges: declaredChanges,
|
|
2465
2787
|
...(options.operationResults ? { operationResults: [...options.operationResults] } : {}),
|
|
2466
|
-
...(
|
|
2788
|
+
...(candidatePatch ? { candidatePatch } : {}),
|
|
2467
2789
|
...(engineResult ? { result: engineResult } : {}),
|
|
2468
2790
|
...(includeResultScoring && metrics ? { metrics } : {}),
|
|
2469
2791
|
...(includeResultScoring && cases ? { cases } : {}),
|
|
@@ -2536,9 +2858,10 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
|
|
|
2536
2858
|
const requestPath = path.join(root, ".workbench", "request.json");
|
|
2537
2859
|
await fs.mkdir(path.dirname(requestPath), { recursive: true });
|
|
2538
2860
|
const casePrompt = workload.engineCaseSpec?.prompt;
|
|
2861
|
+
const caseSplit = workload.engineCaseSpec?.split;
|
|
2539
2862
|
const adapter = step.adapter ?? execution.adapter;
|
|
2540
|
-
const
|
|
2541
|
-
|
|
2863
|
+
const candidateCommand = adapterProtocolCommandSpec(workload.spec.run, "candidate.run", manifests).command;
|
|
2864
|
+
const payload = {
|
|
2542
2865
|
protocol: "workbench.adapter.v3",
|
|
2543
2866
|
id: execution.id,
|
|
2544
2867
|
jobId: workload.job.id,
|
|
@@ -2554,17 +2877,17 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
|
|
|
2554
2877
|
name: workload.spec.benchmark.name,
|
|
2555
2878
|
description: workload.spec.benchmark.description,
|
|
2556
2879
|
},
|
|
2557
|
-
|
|
2558
|
-
id: workload.
|
|
2559
|
-
path: workload.spec.
|
|
2560
|
-
...(workload.spec.
|
|
2880
|
+
candidate: {
|
|
2881
|
+
id: workload.candidateId,
|
|
2882
|
+
path: workload.spec.candidate.files.path,
|
|
2883
|
+
...(workload.spec.candidate.prepare ? { prepare: { ...workload.spec.candidate.prepare } } : {}),
|
|
2561
2884
|
run: {
|
|
2562
2885
|
...workload.spec.run,
|
|
2563
|
-
command:
|
|
2886
|
+
command: candidateCommand,
|
|
2564
2887
|
},
|
|
2565
2888
|
},
|
|
2566
|
-
...(workload.spec.
|
|
2567
|
-
? {
|
|
2889
|
+
...(workload.spec.candidate.improve
|
|
2890
|
+
? { improve: { edits: [...workload.spec.candidate.improve.edits] } }
|
|
2568
2891
|
: {}),
|
|
2569
2892
|
attempt: {
|
|
2570
2893
|
attemptIndex: workload.attemptIndex,
|
|
@@ -2574,27 +2897,48 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
|
|
|
2574
2897
|
case: {
|
|
2575
2898
|
id: workload.caseId,
|
|
2576
2899
|
...(casePrompt ? { prompt: casePrompt } : {}),
|
|
2900
|
+
...(caseSplit ? { split: caseSplit } : {}),
|
|
2577
2901
|
},
|
|
2578
2902
|
},
|
|
2579
2903
|
paths: {
|
|
2580
2904
|
workspace: root,
|
|
2581
2905
|
output: outputDir(root),
|
|
2582
2906
|
result: workbenchAdapterOperationResultPath(outputDir(root)),
|
|
2583
|
-
|
|
2907
|
+
...(readWorkloadExecutionPurpose(workload) === "attempt" ? { candidate: candidateDir(root) } : {}),
|
|
2584
2908
|
...(workload.engineCaseSpec ? { case: caseDir(root) } : {}),
|
|
2585
2909
|
traces: tracesDir(root),
|
|
2586
2910
|
...(step.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
|
|
2587
2911
|
},
|
|
2588
|
-
}
|
|
2912
|
+
};
|
|
2913
|
+
await fs.writeFile(requestPath, `${JSON.stringify(payload, null, 2)}\n`);
|
|
2914
|
+
await writeSurfaceFiles(outputDir(root), [
|
|
2915
|
+
textSurfaceFile(`.workbench/traces/${workload.job.id}/${step.label}/request.json`, `${JSON.stringify(sanitizeAdapterRequestTracePayload(payload), null, 2)}\n`),
|
|
2916
|
+
]);
|
|
2589
2917
|
return requestPath;
|
|
2590
2918
|
}
|
|
2591
|
-
function
|
|
2592
|
-
|
|
2919
|
+
function sanitizeAdapterRequestTracePayload(value) {
|
|
2920
|
+
if (Array.isArray(value)) {
|
|
2921
|
+
return value.map((entry) => sanitizeAdapterRequestTracePayload(entry));
|
|
2922
|
+
}
|
|
2923
|
+
if (!value || typeof value !== "object") {
|
|
2924
|
+
return (value ?? null);
|
|
2925
|
+
}
|
|
2926
|
+
const sanitized = {};
|
|
2927
|
+
for (const [key, entry] of Object.entries(value)) {
|
|
2928
|
+
if (key === "auth" || key === "enginePrivate") {
|
|
2929
|
+
continue;
|
|
2930
|
+
}
|
|
2931
|
+
sanitized[key] = sanitizeAdapterRequestTracePayload(entry);
|
|
2932
|
+
}
|
|
2933
|
+
return sanitized;
|
|
2934
|
+
}
|
|
2935
|
+
function improveEdits(spec) {
|
|
2936
|
+
return spec.candidate.improve?.edits ?? [];
|
|
2593
2937
|
}
|
|
2594
|
-
function
|
|
2595
|
-
const edits =
|
|
2938
|
+
function requireImproveEdits(spec) {
|
|
2939
|
+
const edits = improveEdits(spec);
|
|
2596
2940
|
if (edits.length === 0) {
|
|
2597
|
-
throw new Error("
|
|
2941
|
+
throw new Error("Candidate improve configuration must declare at least one entry in edits.");
|
|
2598
2942
|
}
|
|
2599
2943
|
return edits;
|
|
2600
2944
|
}
|
|
@@ -2691,8 +3035,8 @@ function requireWorkloadEngineCase(workload, label) {
|
|
|
2691
3035
|
}
|
|
2692
3036
|
return workload.engineCase;
|
|
2693
3037
|
}
|
|
2694
|
-
function
|
|
2695
|
-
return `${inputDir(root)}/
|
|
3038
|
+
function candidateDir(root) {
|
|
3039
|
+
return `${inputDir(root)}/candidate`;
|
|
2696
3040
|
}
|
|
2697
3041
|
function caseDir(root) {
|
|
2698
3042
|
return `${inputDir(root)}/case`;
|
|
@@ -2727,7 +3071,7 @@ async function writeSurfaceFiles(root, files) {
|
|
|
2727
3071
|
}
|
|
2728
3072
|
}
|
|
2729
3073
|
}
|
|
2730
|
-
async function readSurfaceFiles(root) {
|
|
3074
|
+
async function readSurfaceFiles(root, options = {}) {
|
|
2731
3075
|
const fs = await importNodeModule(nodeBuiltin("fs/promises"));
|
|
2732
3076
|
const path = await importNodeModule(nodeBuiltin("path"));
|
|
2733
3077
|
const utf8Decoder = new TextDecoder("utf-8", { fatal: true });
|
|
@@ -2738,6 +3082,10 @@ async function readSurfaceFiles(root) {
|
|
|
2738
3082
|
.catch(() => []);
|
|
2739
3083
|
for (const entry of entries) {
|
|
2740
3084
|
const absolutePath = path.join(directory, entry.name);
|
|
3085
|
+
const relativePath = normalizeRelativePath(path.relative(root, absolutePath).replace(/\\/gu, "/"));
|
|
3086
|
+
if (options.ignorePath?.(relativePath)) {
|
|
3087
|
+
continue;
|
|
3088
|
+
}
|
|
2741
3089
|
if (entry.isDirectory()) {
|
|
2742
3090
|
await walk(absolutePath);
|
|
2743
3091
|
continue;
|
|
@@ -2745,9 +3093,18 @@ async function readSurfaceFiles(root) {
|
|
|
2745
3093
|
if (!entry.isFile()) {
|
|
2746
3094
|
continue;
|
|
2747
3095
|
}
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
|
|
3096
|
+
let body;
|
|
3097
|
+
let stats;
|
|
3098
|
+
try {
|
|
3099
|
+
body = await fs.readFile(absolutePath);
|
|
3100
|
+
stats = await fs.stat(absolutePath);
|
|
3101
|
+
}
|
|
3102
|
+
catch (error) {
|
|
3103
|
+
if (isVanishedWalkEntry(error)) {
|
|
3104
|
+
continue;
|
|
3105
|
+
}
|
|
3106
|
+
throw error;
|
|
3107
|
+
}
|
|
2751
3108
|
const content = encodeSurfaceSnapshotContent(body, utf8Decoder);
|
|
2752
3109
|
files.push({
|
|
2753
3110
|
path: relativePath,
|
|
@@ -2761,6 +3118,10 @@ async function readSurfaceFiles(root) {
|
|
|
2761
3118
|
await walk(root);
|
|
2762
3119
|
return files.sort((left, right) => left.path.localeCompare(right.path));
|
|
2763
3120
|
}
|
|
3121
|
+
function isVanishedWalkEntry(error) {
|
|
3122
|
+
const code = error?.code;
|
|
3123
|
+
return code === "ENOENT" || code === "ENOTDIR";
|
|
3124
|
+
}
|
|
2764
3125
|
function encodeSurfaceSnapshotContent(body, utf8Decoder) {
|
|
2765
3126
|
try {
|
|
2766
3127
|
return {
|
|
@@ -2943,7 +3304,14 @@ function evaluateSample(args) {
|
|
|
2943
3304
|
if (metrics.score === undefined) {
|
|
2944
3305
|
metrics.score = sampleScore;
|
|
2945
3306
|
}
|
|
2946
|
-
const cases =
|
|
3307
|
+
const cases = runtimeTimedCaseResults({
|
|
3308
|
+
caseId: args.caseId,
|
|
3309
|
+
split: args.split,
|
|
3310
|
+
status: "completed",
|
|
3311
|
+
durationMs,
|
|
3312
|
+
metrics,
|
|
3313
|
+
cases: args.workload.cases,
|
|
3314
|
+
});
|
|
2947
3315
|
const feedback = {
|
|
2948
3316
|
...(args.workload.summary !== undefined
|
|
2949
3317
|
? { summary: args.workload.summary }
|
|
@@ -2956,10 +3324,10 @@ function evaluateSample(args) {
|
|
|
2956
3324
|
return {
|
|
2957
3325
|
id: `${args.caseId}__sample_${String(args.sampleIndex + 1).padStart(3, "0")}`,
|
|
2958
3326
|
index: args.sampleIndex,
|
|
2959
|
-
|
|
2960
|
-
id: args.
|
|
2961
|
-
kind: "
|
|
2962
|
-
label: args.
|
|
3327
|
+
candidate: {
|
|
3328
|
+
id: args.candidateId,
|
|
3329
|
+
kind: "candidate",
|
|
3330
|
+
label: args.candidateId,
|
|
2963
3331
|
},
|
|
2964
3332
|
status: "completed",
|
|
2965
3333
|
startedAt: args.startedAt,
|
|
@@ -2967,7 +3335,7 @@ function evaluateSample(args) {
|
|
|
2967
3335
|
durationMs,
|
|
2968
3336
|
metrics,
|
|
2969
3337
|
...(usage ? { usage } : {}),
|
|
2970
|
-
|
|
3338
|
+
cases,
|
|
2971
3339
|
feedback,
|
|
2972
3340
|
};
|
|
2973
3341
|
}
|
|
@@ -2976,7 +3344,7 @@ function normalizeSampleJobOutput(value) {
|
|
|
2976
3344
|
return null;
|
|
2977
3345
|
}
|
|
2978
3346
|
const record = value;
|
|
2979
|
-
if (record.ok !== true || typeof record.
|
|
3347
|
+
if (record.ok !== true || typeof record.candidateId !== "string") {
|
|
2980
3348
|
return null;
|
|
2981
3349
|
}
|
|
2982
3350
|
const files = Array.isArray(record.files)
|
|
@@ -2991,7 +3359,7 @@ function normalizeSampleJobOutput(value) {
|
|
|
2991
3359
|
return null;
|
|
2992
3360
|
}
|
|
2993
3361
|
return {
|
|
2994
|
-
|
|
3362
|
+
candidateId: record.candidateId,
|
|
2995
3363
|
attemptIndex: record.attemptIndex,
|
|
2996
3364
|
sample,
|
|
2997
3365
|
fileChanges: Array.isArray(record.fileChanges)
|
|
@@ -3003,12 +3371,72 @@ function normalizeSampleJobOutput(value) {
|
|
|
3003
3371
|
: traceFilePaths(files),
|
|
3004
3372
|
};
|
|
3005
3373
|
}
|
|
3006
|
-
function normalizeEvaluationSampleOutputs(
|
|
3007
|
-
return
|
|
3374
|
+
function normalizeEvaluationSampleOutputs(jobs) {
|
|
3375
|
+
return jobs.flatMap((job) => {
|
|
3008
3376
|
const output = normalizeSampleJobOutput(job.output);
|
|
3009
|
-
|
|
3377
|
+
if (!output) {
|
|
3378
|
+
return [];
|
|
3379
|
+
}
|
|
3380
|
+
const caseId = readJobString(job.input, "caseId") ?? output.sample.cases?.[0]?.id ?? null;
|
|
3381
|
+
const durationMs = runtimeJobDurationMs(job) ?? output.sample.durationMs;
|
|
3382
|
+
const sample = caseId && typeof durationMs === "number" && Number.isFinite(durationMs)
|
|
3383
|
+
? {
|
|
3384
|
+
...output.sample,
|
|
3385
|
+
cases: runtimeTimedCaseResults({
|
|
3386
|
+
caseId,
|
|
3387
|
+
split: readJobEngineCaseSplit(job),
|
|
3388
|
+
status: output.sample.status === "error" ? "error" : "completed",
|
|
3389
|
+
durationMs,
|
|
3390
|
+
metrics: output.sample.metrics ?? {},
|
|
3391
|
+
cases: output.sample.cases,
|
|
3392
|
+
}),
|
|
3393
|
+
}
|
|
3394
|
+
: output.sample;
|
|
3395
|
+
return [{
|
|
3396
|
+
jobs: [job],
|
|
3397
|
+
output: {
|
|
3398
|
+
...output,
|
|
3399
|
+
sample,
|
|
3400
|
+
},
|
|
3401
|
+
}];
|
|
3010
3402
|
});
|
|
3011
3403
|
}
|
|
3404
|
+
function runtimeTimedCaseResults(args) {
|
|
3405
|
+
const cases = args.cases?.length
|
|
3406
|
+
? args.cases
|
|
3407
|
+
: [{
|
|
3408
|
+
id: args.caseId,
|
|
3409
|
+
status: args.status,
|
|
3410
|
+
metrics: args.metrics,
|
|
3411
|
+
}];
|
|
3412
|
+
return cases.map((entry) => ({
|
|
3413
|
+
...entry,
|
|
3414
|
+
...(!entry.split && args.split && entry.id === args.caseId ? { split: args.split } : {}),
|
|
3415
|
+
status: entry.status ?? args.status,
|
|
3416
|
+
metrics: entry.metrics ?? args.metrics,
|
|
3417
|
+
durationMs: args.durationMs,
|
|
3418
|
+
}));
|
|
3419
|
+
}
|
|
3420
|
+
function readJobEngineCaseSplit(job) {
|
|
3421
|
+
const input = jsonRecord(job.input);
|
|
3422
|
+
const execution = jsonRecord(input.execution);
|
|
3423
|
+
const metadata = jsonRecord(execution.metadata);
|
|
3424
|
+
const engineCase = jsonRecord(metadata.engineCase);
|
|
3425
|
+
const split = engineCase.split;
|
|
3426
|
+
return typeof split === "string" && split.trim().length > 0
|
|
3427
|
+
? split.trim()
|
|
3428
|
+
: undefined;
|
|
3429
|
+
}
|
|
3430
|
+
function runtimeJobDurationMs(job) {
|
|
3431
|
+
if (typeof job.startedAt !== "string" || typeof job.finishedAt !== "string") {
|
|
3432
|
+
return undefined;
|
|
3433
|
+
}
|
|
3434
|
+
const startedMs = Date.parse(job.startedAt);
|
|
3435
|
+
const finishedMs = Date.parse(job.finishedAt);
|
|
3436
|
+
return Number.isFinite(startedMs) && Number.isFinite(finishedMs)
|
|
3437
|
+
? Math.max(0, finishedMs - startedMs)
|
|
3438
|
+
: undefined;
|
|
3439
|
+
}
|
|
3012
3440
|
function meanFinite(values) {
|
|
3013
3441
|
const finite = values.filter((value) => typeof value === "number" && Number.isFinite(value));
|
|
3014
3442
|
if (finite.length === 0) {
|
|
@@ -3039,12 +3467,12 @@ function withJobUsage(sample, _jobs, attemptJob) {
|
|
|
3039
3467
|
usage,
|
|
3040
3468
|
};
|
|
3041
3469
|
}
|
|
3042
|
-
function
|
|
3470
|
+
function normalizeCandidateRevisionJobOutput(value) {
|
|
3043
3471
|
if (!value || typeof value !== "object" || Array.isArray(value)) {
|
|
3044
3472
|
return null;
|
|
3045
3473
|
}
|
|
3046
3474
|
const record = value;
|
|
3047
|
-
if (record.ok !== true || typeof record.
|
|
3475
|
+
if (record.ok !== true || typeof record.candidateId !== "string") {
|
|
3048
3476
|
return null;
|
|
3049
3477
|
}
|
|
3050
3478
|
const files = Array.isArray(record.files)
|
|
@@ -3056,7 +3484,7 @@ function normalizeSubjectRevisionJobOutput(value) {
|
|
|
3056
3484
|
}
|
|
3057
3485
|
const usage = normalizeUsageSummary(record.usage);
|
|
3058
3486
|
return {
|
|
3059
|
-
|
|
3487
|
+
candidateId: record.candidateId,
|
|
3060
3488
|
attemptIndex: record.attemptIndex,
|
|
3061
3489
|
baseId: typeof record.baseId === "string" && record.baseId.length > 0
|
|
3062
3490
|
? record.baseId
|
|
@@ -3072,7 +3500,7 @@ function normalizeSubjectRevisionJobOutput(value) {
|
|
|
3072
3500
|
...(usage ? { usage } : {}),
|
|
3073
3501
|
};
|
|
3074
3502
|
}
|
|
3075
|
-
function errorEvaluationSamplesFromJobs(jobs,
|
|
3503
|
+
function errorEvaluationSamplesFromJobs(jobs, candidateId, attemptIndex, completedSampleKeys) {
|
|
3076
3504
|
const groups = new Map();
|
|
3077
3505
|
for (const job of jobs) {
|
|
3078
3506
|
const key = evaluationSampleGroupKeyFromJob(job);
|
|
@@ -3082,40 +3510,44 @@ function errorEvaluationSamplesFromJobs(jobs, subjectId, attemptIndex, completed
|
|
|
3082
3510
|
groups.set(key, [...(groups.get(key) ?? []), job]);
|
|
3083
3511
|
}
|
|
3084
3512
|
return [...groups.values()]
|
|
3085
|
-
.map((group) => errorEvaluationSampleFromJobGroup(group,
|
|
3513
|
+
.map((group) => errorEvaluationSampleFromJobGroup(group, candidateId, attemptIndex))
|
|
3086
3514
|
.filter((sample) => sample !== null);
|
|
3087
3515
|
}
|
|
3088
|
-
function errorEvaluationSampleFromJobGroup(jobs,
|
|
3516
|
+
function errorEvaluationSampleFromJobGroup(jobs, candidateId, attemptIndex) {
|
|
3089
3517
|
const job = jobs[0];
|
|
3090
3518
|
if (!job) {
|
|
3091
3519
|
return null;
|
|
3092
3520
|
}
|
|
3093
3521
|
const sampleIndex = readOptionalJobNumber(job.input, "sampleIndex");
|
|
3094
3522
|
const caseId = readJobString(job.input, "caseId");
|
|
3523
|
+
const split = readJobEngineCaseSplit(job);
|
|
3095
3524
|
if (sampleIndex === null || !caseId) {
|
|
3096
3525
|
return null;
|
|
3097
3526
|
}
|
|
3098
3527
|
const startedAt = minIsoTimestamp(jobs.map((entry) => entry.startedAt ?? entry.createdAt));
|
|
3099
3528
|
const finishedAt = maxIsoTimestamp(jobs.map((entry) => entry.finishedAt ?? entry.updatedAt ?? entry.startedAt));
|
|
3529
|
+
const durationMs = startedAt && finishedAt
|
|
3530
|
+
? Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt))
|
|
3531
|
+
: undefined;
|
|
3100
3532
|
const error = summarizeEvaluationJobErrors(jobs) ?? "Evaluation job did not produce a valid sample.";
|
|
3101
3533
|
return {
|
|
3102
3534
|
id: `${caseId}__sample_${String(sampleIndex + 1).padStart(3, "0")}`,
|
|
3103
3535
|
index: sampleIndex,
|
|
3104
|
-
|
|
3105
|
-
id:
|
|
3106
|
-
kind: "
|
|
3107
|
-
label:
|
|
3536
|
+
candidate: {
|
|
3537
|
+
id: candidateId,
|
|
3538
|
+
kind: "candidate",
|
|
3539
|
+
label: candidateId,
|
|
3108
3540
|
},
|
|
3109
3541
|
status: "error",
|
|
3110
3542
|
...(startedAt ? { startedAt } : {}),
|
|
3111
3543
|
...(finishedAt ? { finishedAt } : {}),
|
|
3112
|
-
...(
|
|
3113
|
-
? { durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)) }
|
|
3114
|
-
: {}),
|
|
3544
|
+
...(durationMs !== undefined ? { durationMs } : {}),
|
|
3115
3545
|
...(error ? { error } : {}),
|
|
3116
3546
|
cases: [{
|
|
3117
3547
|
id: caseId,
|
|
3548
|
+
...(split ? { split } : {}),
|
|
3118
3549
|
status: "error",
|
|
3550
|
+
...(durationMs !== undefined ? { durationMs } : {}),
|
|
3119
3551
|
metrics: {},
|
|
3120
3552
|
...(error ? { feedback: { summary: error } } : {}),
|
|
3121
3553
|
}],
|
|
@@ -3171,13 +3603,13 @@ function compareSampleOutputs(left, right) {
|
|
|
3171
3603
|
}
|
|
3172
3604
|
return left.sample.id.localeCompare(right.sample.id);
|
|
3173
3605
|
}
|
|
3174
|
-
function createEvaluationRecord(
|
|
3175
|
-
const samples = mergeEvaluationSampleRecords(rawSamples).map((sample) =>
|
|
3606
|
+
function createEvaluationRecord(candidateId, candidateName, rawSamples) {
|
|
3607
|
+
const samples = mergeEvaluationSampleRecords(rawSamples).map((sample) => candidateName
|
|
3176
3608
|
? {
|
|
3177
3609
|
...sample,
|
|
3178
|
-
|
|
3179
|
-
...sample.
|
|
3180
|
-
label:
|
|
3610
|
+
candidate: {
|
|
3611
|
+
...sample.candidate,
|
|
3612
|
+
label: candidateName,
|
|
3181
3613
|
},
|
|
3182
3614
|
}
|
|
3183
3615
|
: sample);
|
|
@@ -3191,10 +3623,10 @@ function createEvaluationRecord(subjectId, subjectName, rawSamples) {
|
|
|
3191
3623
|
const errorSampleCount = samples.filter((sample) => sample.status === "error")
|
|
3192
3624
|
.length;
|
|
3193
3625
|
return {
|
|
3194
|
-
|
|
3195
|
-
id:
|
|
3196
|
-
kind: "
|
|
3197
|
-
...(
|
|
3626
|
+
candidate: {
|
|
3627
|
+
id: candidateId,
|
|
3628
|
+
kind: "candidate",
|
|
3629
|
+
...(candidateName ? { label: candidateName } : {}),
|
|
3198
3630
|
},
|
|
3199
3631
|
status: samples.length > 0 && completedSampleCount === samples.length
|
|
3200
3632
|
? "completed"
|
|
@@ -3215,7 +3647,7 @@ function createEvaluationRecord(subjectId, subjectName, rawSamples) {
|
|
|
3215
3647
|
samples,
|
|
3216
3648
|
};
|
|
3217
3649
|
}
|
|
3218
|
-
function
|
|
3650
|
+
function normalizedCandidateDisplayName(value) {
|
|
3219
3651
|
const normalized = value?.trim();
|
|
3220
3652
|
return normalized ? normalized : null;
|
|
3221
3653
|
}
|
|
@@ -3263,7 +3695,7 @@ function mergeEvaluationSampleGroup(group) {
|
|
|
3263
3695
|
return {
|
|
3264
3696
|
id: `sample_${String(first.index + 1).padStart(3, "0")}`,
|
|
3265
3697
|
index: first.index,
|
|
3266
|
-
|
|
3698
|
+
candidate: first.candidate,
|
|
3267
3699
|
status: mergeEvaluationSampleStatus(group),
|
|
3268
3700
|
...(startedAt ? { startedAt } : {}),
|
|
3269
3701
|
...(finishedAt ? { finishedAt } : {}),
|
|
@@ -3355,35 +3787,49 @@ function aggregateCaseStatus(results) {
|
|
|
3355
3787
|
}
|
|
3356
3788
|
return undefined;
|
|
3357
3789
|
}
|
|
3358
|
-
function
|
|
3359
|
-
|
|
3360
|
-
|
|
3361
|
-
|
|
3362
|
-
|
|
3363
|
-
}
|
|
3364
|
-
function selectSubject(args) {
|
|
3365
|
-
let selected = args.previousSubject;
|
|
3366
|
-
for (const subject of args.subjects) {
|
|
3367
|
-
if (!selected || hasHigherScore(subject, selected)) {
|
|
3368
|
-
selected = subject;
|
|
3790
|
+
function selectCandidate(args) {
|
|
3791
|
+
let selected = args.previousCandidate;
|
|
3792
|
+
for (const candidate of args.candidates) {
|
|
3793
|
+
if (!selected || hasHigherEvaluationMetric(candidate, selected, args.selection)) {
|
|
3794
|
+
selected = candidate;
|
|
3369
3795
|
}
|
|
3370
3796
|
}
|
|
3371
3797
|
return selected;
|
|
3372
3798
|
}
|
|
3373
|
-
function
|
|
3374
|
-
const
|
|
3375
|
-
const
|
|
3376
|
-
|
|
3799
|
+
function hasHigherEvaluationMetric(candidate, incumbent, selection) {
|
|
3800
|
+
const metric = selection?.metric ?? "score";
|
|
3801
|
+
const candidateValue = readEvaluationSelectionMean(candidate.eval, metric, selection?.caseIds);
|
|
3802
|
+
const incumbentValue = readEvaluationSelectionMean(incumbent.eval, metric, selection?.caseIds);
|
|
3803
|
+
if (candidateValue == null) {
|
|
3377
3804
|
return false;
|
|
3378
3805
|
}
|
|
3379
3806
|
if (incumbentValue == null) {
|
|
3380
3807
|
return true;
|
|
3381
3808
|
}
|
|
3382
|
-
return
|
|
3809
|
+
return candidateValue > incumbentValue;
|
|
3810
|
+
}
|
|
3811
|
+
function readEvaluationSelectionMean(evaluation, metric, caseIds) {
|
|
3812
|
+
const stats = readEvaluationSelectionStats(evaluation, metric, caseIds);
|
|
3813
|
+
return stats ? stats.mean : null;
|
|
3383
3814
|
}
|
|
3384
|
-
function
|
|
3385
|
-
|
|
3386
|
-
|
|
3815
|
+
function readEvaluationSelectionStats(evaluation, metric, caseIds) {
|
|
3816
|
+
if (!caseIds) {
|
|
3817
|
+
const direct = evaluation?.metrics?.[metric];
|
|
3818
|
+
return direct && Number.isFinite(direct.mean) ? direct : null;
|
|
3819
|
+
}
|
|
3820
|
+
if (caseIds.length === 0) {
|
|
3821
|
+
return null;
|
|
3822
|
+
}
|
|
3823
|
+
const allowed = new Set(caseIds);
|
|
3824
|
+
const values = (evaluation?.samples ?? [])
|
|
3825
|
+
.flatMap((sample) => sample.cases ?? [])
|
|
3826
|
+
.flatMap((caseResult) => {
|
|
3827
|
+
const metricValue = caseResult.metrics[metric];
|
|
3828
|
+
return allowed.has(caseResult.id) && typeof metricValue === "number" && Number.isFinite(metricValue)
|
|
3829
|
+
? [metricValue]
|
|
3830
|
+
: [];
|
|
3831
|
+
});
|
|
3832
|
+
return values.length > 0 ? metricStats(values) : null;
|
|
3387
3833
|
}
|
|
3388
3834
|
function metricStats(values) {
|
|
3389
3835
|
const count = values.length;
|
|
@@ -3501,7 +3947,7 @@ function isEvaluationSampleRecord(value) {
|
|
|
3501
3947
|
!Array.isArray(value) &&
|
|
3502
3948
|
typeof record.id === "string" &&
|
|
3503
3949
|
typeof record.index === "number" &&
|
|
3504
|
-
typeof record.
|
|
3950
|
+
typeof record.candidate === "object" &&
|
|
3505
3951
|
isEvaluationSampleStatus(record.status) &&
|
|
3506
3952
|
hasOperationalCaseStatuses(record.cases));
|
|
3507
3953
|
}
|