@workbench-ai/workbench-core 0.0.49 → 0.0.50
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/candidate-patch.d.ts +8 -0
- package/dist/candidate-patch.d.ts.map +1 -0
- package/dist/{subject-patch.js → candidate-patch.js} +5 -5
- package/dist/execution-evidence.d.ts +5 -5
- package/dist/execution-evidence.d.ts.map +1 -1
- package/dist/execution-evidence.js +8 -8
- package/dist/execution-graph.d.ts +2 -2
- package/dist/execution-graph.d.ts.map +1 -1
- package/dist/execution-graph.js +13 -13
- package/dist/execution-jobs.d.ts +7 -6
- package/dist/execution-jobs.d.ts.map +1 -1
- package/dist/execution-jobs.js +32 -17
- package/dist/execution-outputs.d.ts +2 -2
- package/dist/execution-outputs.d.ts.map +1 -1
- package/dist/execution-outputs.js +25 -13
- package/dist/execution-runtime-types.d.ts +1 -1
- package/dist/execution-runtime-types.d.ts.map +1 -1
- package/dist/execution-traces.js +7 -7
- package/dist/execution-usage.js +9 -9
- package/dist/generic-spec.d.ts +34 -30
- package/dist/generic-spec.d.ts.map +1 -1
- package/dist/generic-spec.js +120 -80
- package/dist/index.d.ts +41 -38
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +575 -353
- package/dist/runtime-utils.d.ts +1 -1
- package/dist/runtime-utils.d.ts.map +1 -1
- package/dist/runtime-utils.js +3 -3
- package/dist/sandbox-backends/docker.js +5 -5
- package/dist/sandbox-inputs.js +3 -3
- package/dist/sandbox-plane.js +7 -7
- package/package.json +3 -3
- package/worker/sandbox-adapter-runner.cjs +2 -2
- package/dist/subject-patch.d.ts +0 -8
- package/dist/subject-patch.d.ts.map +0 -1
package/dist/index.js
CHANGED
|
@@ -4,19 +4,19 @@ import path from "node:path";
|
|
|
4
4
|
import { fileURLToPath } from "node:url";
|
|
5
5
|
import YAML from "yaml";
|
|
6
6
|
import { adapterCommandName, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, parseWorkbenchAdapterManifest, readWorkbenchAdapterOperationResult, WORKBENCH_RUNTIME_CONTROL_TOKEN_ENV, WORKBENCH_RUNTIME_CONTROL_URL_ENV, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
|
|
7
|
-
import { BENCHMARK_SPEC_FILE, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal,
|
|
7
|
+
import { BENCHMARK_SPEC_FILE, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml as resolveWorkbenchResolvedSourceYamlInternal, validateWorkbenchResolvedSourceYaml as validateWorkbenchResolvedSourceYamlInternal, isWorkbenchCandidateManifestPath, } from "./generic-spec.js";
|
|
8
8
|
import { attachSandboxMetadataToJob, createWorkbenchSandboxFileStore, isSurfaceSnapshotFile, readWorkbenchExecutionSpec, } from "./sandbox-inputs.js";
|
|
9
9
|
import { asRuntimeRecord, importNodeModule, isJsonPayload, jsonRecord, nodeBuiltin, quoteShellArg, resolveWorkbenchWorkerId, } from "./runtime-utils.js";
|
|
10
10
|
import { createWorkbenchExecutionCapability, createWorkbenchSandboxAllocation, collectExecutionCapabilityScopeIssues, collectSandboxAllocationScopeIssues, collectSandboxHandleScopeIssues, assertSandboxBackendSupportsNetworkPolicy, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
11
11
|
import { createSandboxBackendPlaneForProvider, } from "./sandbox-backends/index.js";
|
|
12
|
-
import {
|
|
12
|
+
import { applyWorkbenchCandidatePatch } from "./candidate-patch.js";
|
|
13
13
|
import { assignUsageRole, completeUsageSummary, mergeUsageSummaries, normalizeUsageSummary, usageStats, } from "./execution-usage.js";
|
|
14
14
|
import { traceFilePaths, workbenchTraceExecutionDirectory, } from "./trace-files.js";
|
|
15
15
|
import { engineCaseForCase, } from "./execution-jobs.js";
|
|
16
16
|
import { createWorkbenchExecutionEventPublisher, publishCommandStepEvent, } from "./execution-events.js";
|
|
17
17
|
import { readWorkbenchExecutionPurpose } from "./execution-evidence.js";
|
|
18
18
|
import { adapterAuthEnv, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
|
|
19
|
-
export { BENCHMARK_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml,
|
|
19
|
+
export { BENCHMARK_SPEC_FILE, CANDIDATE_SPEC_FILE, DEFAULT_EXECUTION_RESOURCES, engineCasePrivateFiles, engineCaseFilesForRuntimeInput, engineCasePublicFiles, engineResolveInvocationForSpec, engineResolveBindingForSpec, engineResolveBindingForSourceYaml, isWorkbenchCandidateManifestPath, parseWorkbenchSourceFiles, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, resolveWorkbenchSourceFiles, runtimeNetwork, runtimeResources, serializeWorkbenchResolvedSourceYaml, validateWorkbenchResolvedSourceYaml, } from "./generic-spec.js";
|
|
20
20
|
export { composeRuntimeDockerfileWithAdapterInstallers, } from "./runtime-dockerfile.js";
|
|
21
21
|
export { adapterCommandName, cloneWorkbenchAdapterManifest, collectWorkbenchAdapterAuthRequirements, collectWorkbenchAdapterInvocations, parseWorkbenchAdapterManifest, workbenchAdapterManifestRequiresAuth, workbenchAdapterManifestSupportsOperation, workbenchAdapterOperationCommand, workbenchAdapterOperationExecutor, withDefaultWorkbenchAdapterAuth, withDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
|
|
22
22
|
export { adapterAuthEnv, createWorkbenchAdapterAuthBundle, defaultWorkbenchAdapterAuthStoreRoot, localWorkbenchAdapterAuthStore, normalizeWorkbenchAdapterAuthTarget, parseWorkbenchAdapterAuthTarget, sanitizeWorkbenchAdapterAuthBundle, } from "./adapter-auth.js";
|
|
@@ -26,14 +26,14 @@ export { createWorkbenchProgressStdoutParser, publishWorkbenchProgressStdoutEnve
|
|
|
26
26
|
export { resolveSandboxTemplateImage, } from "./sandbox-backends/template-images.js";
|
|
27
27
|
export { readOutputTraceFiles, workbenchTraceExecutionDirectory, workbenchTraceRunDirectory, workbenchTraceRunDirectoryName, } from "./trace-files.js";
|
|
28
28
|
export { assertWorkbenchAdapterOperationSupport, assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterOperationIssues, collectWorkbenchAdapterOperationRequirements, ensureWorkbenchAdapterOutputDir, WORKBENCH_ADAPTER_RESULT_FILE, normalizeWorkbenchAdapterOperationRequest, normalizeWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, writeWorkbenchAdapterOperationResult, } from "@workbench-ai/workbench-protocol";
|
|
29
|
-
export {
|
|
29
|
+
export { applyWorkbenchCandidatePatch, } from "./candidate-patch.js";
|
|
30
30
|
export { createWorkbenchSandboxFileStore, createSandboxAdapterRequest, executionResultFromCompletedSandboxJob, materializeWorkbenchSandboxInput, readWorkbenchExecutionSpec, sanitizeWorkbenchExecutionJobForSandbox, } from "./sandbox-inputs.js";
|
|
31
31
|
export { compileWorkbenchExecutionGraph, } from "./execution-graph.js";
|
|
32
|
-
export {
|
|
32
|
+
export { createBaselineCandidateExecution, createBaselineCandidateJob, createWorkbenchExecutionJob, expectedWorkbenchRunJobCount, engineCaseForCase, engineCaseIds, attemptJobCountForRunSpec, workbenchExecutionJobPurpose, MAX_WORKBENCH_RUN_BUDGET, planWorkbenchExecutionJobsForPurpose, validateWorkbenchRunEnvelope, workbenchExecutionJobId, } from "./execution-jobs.js";
|
|
33
33
|
export { addCapacity, capacityFits, runWorkbenchExecutionDag, subtractCapacity, workbenchJobDependencies, workbenchJobHostCost, workbenchJobResources, } from "./execution-scheduler.js";
|
|
34
34
|
export { assertWorkbenchExecutionIsolation, collectWorkbenchExecutionIsolationIssues, validateWorkbenchExecutionOutputPayloads, } from "./execution-outputs.js";
|
|
35
35
|
export { collectSandboxAllocationScopeIssues, collectExecutionCapabilityScopeIssues, collectSandboxHandleScopeIssues, createWorkbenchSandboxAllocation, createWorkbenchSandboxExecutionMetadata, createWorkbenchExecutionCapability, executeValidatedSandboxExecution, } from "./sandbox-plane.js";
|
|
36
|
-
export {
|
|
36
|
+
export { buildCandidateCaseExecutionRefs, buildWorkbenchExecutionEvidence, isWorkbenchExecutionActive, readWorkbenchExecutionId, readWorkbenchExecutionMetadataNumber, readWorkbenchExecutionMetadataString, readWorkbenchExecutionPurpose, resolveWorkbenchJobGroupStatus, } from "./execution-evidence.js";
|
|
37
37
|
export { buildWorkbenchTraceSessionsFromFiles, combineWorkbenchTraceSessions, finalizeWorkbenchExecutionTraceForJob, mergeWorkbenchExecutionTracesByJob, readWorkbenchExecutionTraceFiles, traceSessionLabel, } from "./execution-traces.js";
|
|
38
38
|
export { DOCKER_SANDBOX_BACKEND, assertSandboxHostHealthForProvider, createDockerSandboxBackendDescriptor, createDockerSandboxPlane, resolveWorkbenchSandboxProviderName, sandboxProviderAdmissionForResources, sandboxProviderDefaultMaxConcurrentJobs, sandboxProviderLeaseScope, sandboxHostHealthExpectationForProvider, } from "./sandbox-backends/index.js";
|
|
39
39
|
export const DEFAULT_ENVIRONMENT_VERSIONS = [
|
|
@@ -153,7 +153,7 @@ export const DEFAULT_ENVIRONMENTS = [
|
|
|
153
153
|
{
|
|
154
154
|
id: "env_node",
|
|
155
155
|
name: "Node",
|
|
156
|
-
description: "Node runtime for JavaScript and TypeScript
|
|
156
|
+
description: "Node runtime for JavaScript and TypeScript candidates.",
|
|
157
157
|
currentVersionId: "envv_node_22",
|
|
158
158
|
builtIn: true,
|
|
159
159
|
createdAt: "2026-04-23T00:00:00.000Z",
|
|
@@ -191,8 +191,7 @@ function splitAuthoredSourceYaml(sourceYaml) {
|
|
|
191
191
|
}
|
|
192
192
|
const entries = [
|
|
193
193
|
[BENCHMARK_SPEC_FILE, parsed.benchmark],
|
|
194
|
-
["
|
|
195
|
-
["optimizers/current.yaml", splitOptimizerSourceRecord(parsed.optimizer)],
|
|
194
|
+
["candidates/current/candidate.yaml", splitCandidateSourceRecord(parsed.candidate)],
|
|
196
195
|
];
|
|
197
196
|
return entries.flatMap(([filePath, value]) => {
|
|
198
197
|
if (!value || typeof value !== "object" || Array.isArray(value)) {
|
|
@@ -204,23 +203,20 @@ function splitAuthoredSourceYaml(sourceYaml) {
|
|
|
204
203
|
}];
|
|
205
204
|
});
|
|
206
205
|
}
|
|
207
|
-
function
|
|
206
|
+
function splitCandidateSourceRecord(value) {
|
|
208
207
|
const record = cloneYamlRecord(value);
|
|
209
208
|
if (!record) {
|
|
210
209
|
return value;
|
|
211
210
|
}
|
|
212
211
|
delete record.benchmark;
|
|
213
212
|
delete record.path;
|
|
214
|
-
|
|
213
|
+
stripCandidateRuntimeSelection(record);
|
|
214
|
+
rewriteAdapterSources(record, "candidates/current");
|
|
215
215
|
return record;
|
|
216
216
|
}
|
|
217
|
-
function
|
|
218
|
-
|
|
219
|
-
|
|
220
|
-
return value;
|
|
221
|
-
}
|
|
222
|
-
rewriteAdapterSources(record, "optimizers");
|
|
223
|
-
return record;
|
|
217
|
+
function stripCandidateRuntimeSelection(record) {
|
|
218
|
+
delete record.selectedRunId;
|
|
219
|
+
delete record.selectedRunName;
|
|
224
220
|
}
|
|
225
221
|
function cloneYamlRecord(value) {
|
|
226
222
|
return value && typeof value === "object" && !Array.isArray(value)
|
|
@@ -242,11 +238,10 @@ function sourcePathRelativeTo(yamlDir, sourcePath) {
|
|
|
242
238
|
}
|
|
243
239
|
function isAuthoredSourceYamlPath(filePath) {
|
|
244
240
|
return filePath === BENCHMARK_SPEC_FILE ||
|
|
245
|
-
|
|
246
|
-
/^optimizers\/[^/]+\.ya?ml$/iu.test(filePath);
|
|
241
|
+
isWorkbenchCandidateManifestPath(filePath);
|
|
247
242
|
}
|
|
248
|
-
function
|
|
249
|
-
return spec.improve ? `adapter:${spec.improve.use}` : "
|
|
243
|
+
function formatImproveSummary(spec) {
|
|
244
|
+
return spec.improve ? `adapter:${spec.improve.use}` : "improve not configured";
|
|
250
245
|
}
|
|
251
246
|
function formatEngineRunSummary(spec) {
|
|
252
247
|
return `adapter:${spec.engineRun.use}`;
|
|
@@ -287,10 +282,10 @@ function protocolStepForExecution(execution, manifests) {
|
|
|
287
282
|
if (execution.purpose !== "improve") {
|
|
288
283
|
throw new Error(`Protocol execution step only supports improve executions, not ${execution.purpose}.`);
|
|
289
284
|
}
|
|
290
|
-
const operation = "
|
|
285
|
+
const operation = "candidate.improve";
|
|
291
286
|
const command = adapterProtocolCommandSpec(execution.adapter, operation, manifests);
|
|
292
287
|
return {
|
|
293
|
-
kind: "
|
|
288
|
+
kind: "improver",
|
|
294
289
|
label: execution.purpose,
|
|
295
290
|
operation,
|
|
296
291
|
executor: command.executor,
|
|
@@ -387,31 +382,31 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
387
382
|
const completed = args.jobs.filter((job) => job.status === "succeeded");
|
|
388
383
|
const failedJobCount = args.jobs.filter((job) => job.status === "failed").length;
|
|
389
384
|
const completedJobCount = args.jobs.filter((job) => job.status === "succeeded").length;
|
|
390
|
-
const
|
|
385
|
+
const candidateRevisions = completed
|
|
391
386
|
.filter((job) => workbenchExecutionPurpose(job) === "improve")
|
|
392
|
-
.map((job) =>
|
|
387
|
+
.map((job) => normalizeCandidateRevisionJobOutput(job.output))
|
|
393
388
|
.filter((output) => output !== null)
|
|
394
389
|
.sort((left, right) => left.attemptIndex - right.attemptIndex);
|
|
395
390
|
const evaluationJobs = args.jobs.filter((job) => workbenchExecutionPurpose(job) === "attempt");
|
|
396
|
-
const
|
|
391
|
+
const evaluationsByCandidate = new Map();
|
|
397
392
|
for (const job of evaluationJobs) {
|
|
398
|
-
const
|
|
399
|
-
readJobString(job.input, "
|
|
400
|
-
job.
|
|
401
|
-
if (
|
|
402
|
-
|
|
403
|
-
...(
|
|
393
|
+
const candidateId = readJobString(job.output, "candidateId") ??
|
|
394
|
+
readJobString(job.input, "candidateId") ??
|
|
395
|
+
job.candidateId;
|
|
396
|
+
if (candidateId) {
|
|
397
|
+
evaluationsByCandidate.set(candidateId, [
|
|
398
|
+
...(evaluationsByCandidate.get(candidateId) ?? []),
|
|
404
399
|
job,
|
|
405
400
|
]);
|
|
406
401
|
}
|
|
407
402
|
}
|
|
408
|
-
const
|
|
409
|
-
const
|
|
403
|
+
const candidates = [];
|
|
404
|
+
const candidateFiles = {};
|
|
410
405
|
const evaluations = [];
|
|
411
|
-
for (const
|
|
412
|
-
const
|
|
413
|
-
const
|
|
414
|
-
const succeededEvaluationJobs =
|
|
406
|
+
for (const candidateRevision of candidateRevisions) {
|
|
407
|
+
const candidateId = candidateRevision.candidateId;
|
|
408
|
+
const candidateJobs = evaluationsByCandidate.get(candidateId) ?? [];
|
|
409
|
+
const succeededEvaluationJobs = candidateJobs.filter((job) => job.status === "succeeded");
|
|
415
410
|
const outputs = normalizeEvaluationSampleOutputs({
|
|
416
411
|
jobs: succeededEvaluationJobs,
|
|
417
412
|
allJobs: completed,
|
|
@@ -425,39 +420,38 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
425
420
|
])
|
|
426
421
|
.filter((key) => key !== null));
|
|
427
422
|
const errorSampleJobs = [
|
|
428
|
-
...
|
|
423
|
+
...candidateJobs.filter((job) => job.status === "failed"),
|
|
429
424
|
...succeededEvaluationJobs.filter((job) => !outputJobIds.has(job.id)),
|
|
430
425
|
];
|
|
431
|
-
const errorSamples = errorEvaluationSamplesFromJobs(errorSampleJobs,
|
|
426
|
+
const errorSamples = errorEvaluationSamplesFromJobs(errorSampleJobs, candidateId, candidateRevision.attemptIndex, completedSampleKeys);
|
|
432
427
|
const samples = [
|
|
433
428
|
...outputs.map(({ jobs, output }) => withJobUsage(output.sample, completed, jobs[0])),
|
|
434
429
|
...errorSamples,
|
|
435
430
|
].sort((left, right) => left.index - right.index || left.id.localeCompare(right.id));
|
|
436
|
-
const
|
|
437
|
-
const evalRecord = createEvaluationRecord(
|
|
431
|
+
const candidateName = normalizedCandidateDisplayName(args.spec.candidate.name);
|
|
432
|
+
const evalRecord = createEvaluationRecord(candidateId, candidateName, samples);
|
|
438
433
|
const usage = mergeUsageSummaries([
|
|
439
|
-
|
|
434
|
+
candidateRevision.usage,
|
|
440
435
|
...samples.map((sample) => sample.usage),
|
|
441
436
|
]);
|
|
442
|
-
const
|
|
443
|
-
const attemptIndex = subjectRevision.attemptIndex;
|
|
437
|
+
const attemptIndex = candidateRevision.attemptIndex;
|
|
444
438
|
const evaluationTraces = [
|
|
445
439
|
...outputs.flatMap(({ output }) => output.traces),
|
|
446
440
|
...errorSampleJobs.flatMap(jobTracePaths),
|
|
447
441
|
].sort();
|
|
448
|
-
const baseId =
|
|
449
|
-
?
|
|
442
|
+
const baseId = candidateRevision.baseId && candidateRevision.baseId !== candidateId
|
|
443
|
+
? candidateRevision.baseId
|
|
450
444
|
: null;
|
|
451
|
-
const sourceMeta =
|
|
445
|
+
const sourceMeta = candidateSourceMetadata(args.candidateSourceFiles);
|
|
452
446
|
const benchmarkMeta = benchmarkSourceMetadata(args.benchmarkSourceFiles);
|
|
453
447
|
const meta = {
|
|
454
448
|
attemptIndex,
|
|
455
449
|
sampleCount: evalRecord.sampleCount,
|
|
456
|
-
|
|
450
|
+
improver: formatImproveSummary(args.spec),
|
|
457
451
|
engineRun: formatEngineRunSummary(args.spec),
|
|
458
452
|
strategy: "greedy",
|
|
459
453
|
traces: {
|
|
460
|
-
improve:
|
|
454
|
+
improve: candidateRevision.traces,
|
|
461
455
|
evaluations: evaluationTraces,
|
|
462
456
|
},
|
|
463
457
|
};
|
|
@@ -467,52 +461,114 @@ export function materializeWorkbenchRunResult(args) {
|
|
|
467
461
|
if (benchmarkMeta) {
|
|
468
462
|
meta.benchmark = benchmarkMeta;
|
|
469
463
|
}
|
|
470
|
-
const record = {
|
|
471
|
-
|
|
472
|
-
|
|
473
|
-
|
|
474
|
-
|
|
475
|
-
|
|
476
|
-
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
|
|
480
|
-
|
|
481
|
-
|
|
482
|
-
|
|
483
|
-
|
|
484
|
-
|
|
485
|
-
|
|
486
|
-
|
|
487
|
-
|
|
464
|
+
const record = preserveExistingCandidateIdentity({
|
|
465
|
+
candidate: {
|
|
466
|
+
id: candidateId,
|
|
467
|
+
...(candidateName ? { name: candidateName } : {}),
|
|
468
|
+
version: args.existingCandidateCount + candidates.length + 1,
|
|
469
|
+
ordinal: args.existingCandidateCount + candidates.length + 1,
|
|
470
|
+
benchmarkFingerprint: args.benchmarkFingerprint,
|
|
471
|
+
candidateFingerprint: args.candidateFingerprint ?? materializedCandidateFingerprint(args.spec, candidateRevision.files),
|
|
472
|
+
createdAt: args.startedAt,
|
|
473
|
+
...(baseId ? { baseId } : {}),
|
|
474
|
+
referenceIds: [],
|
|
475
|
+
status: evalRecord.completedSampleCount > 0 ? "evaluated" : "eval_error",
|
|
476
|
+
fileChanges: candidateRevision.fileChanges,
|
|
477
|
+
...(usage ? { usage } : {}),
|
|
478
|
+
eval: evalRecord,
|
|
479
|
+
...(candidateRevision.prompt ? { prompt: candidateRevision.prompt } : {}),
|
|
480
|
+
meta,
|
|
481
|
+
},
|
|
482
|
+
previousCandidate: args.previousCandidate ?? null,
|
|
483
|
+
});
|
|
484
|
+
candidates.push(record);
|
|
488
485
|
evaluations.push(createEvaluationScorecard({
|
|
489
486
|
runId: args.runId,
|
|
490
487
|
benchmarkFingerprint: args.benchmarkFingerprint,
|
|
491
488
|
createdAt: args.startedAt,
|
|
492
|
-
|
|
489
|
+
candidate: record,
|
|
490
|
+
candidateRunId: args.spec.candidate.selectedRunId,
|
|
491
|
+
candidateRunName: args.spec.candidate.selectedRunName,
|
|
493
492
|
evaluation: evalRecord,
|
|
494
493
|
}));
|
|
495
|
-
|
|
496
|
-
|
|
494
|
+
candidateFiles[candidateId] = materializedCandidateFiles({
|
|
495
|
+
candidateRevisionFiles: candidateRevision.files,
|
|
497
496
|
});
|
|
498
497
|
}
|
|
499
|
-
const
|
|
500
|
-
|
|
501
|
-
|
|
498
|
+
const selectedCandidate = selectCandidate({
|
|
499
|
+
candidates,
|
|
500
|
+
previousCandidate: args.previousCandidate ?? null,
|
|
502
501
|
});
|
|
503
502
|
return {
|
|
504
|
-
|
|
505
|
-
|
|
503
|
+
candidates,
|
|
504
|
+
candidateFiles,
|
|
506
505
|
evaluations,
|
|
507
|
-
|
|
508
|
-
|
|
506
|
+
activeCandidateId: selectedCandidate?.id ?? args.previousCandidate?.id ?? null,
|
|
507
|
+
selectedCandidate,
|
|
509
508
|
completedJobCount,
|
|
510
509
|
failedJobCount,
|
|
511
510
|
};
|
|
512
511
|
}
|
|
513
|
-
function
|
|
512
|
+
function preserveExistingCandidateIdentity(args) {
|
|
513
|
+
const previous = args.previousCandidate;
|
|
514
|
+
if (!previous || previous.id !== args.candidate.id) {
|
|
515
|
+
return args.candidate;
|
|
516
|
+
}
|
|
517
|
+
const baseId = args.candidate.baseId ?? previous.baseId;
|
|
518
|
+
const prompt = args.candidate.prompt ?? previous.prompt;
|
|
519
|
+
const meta = mergeExistingCandidateMeta(previous.meta, args.candidate.meta);
|
|
520
|
+
return {
|
|
521
|
+
...args.candidate,
|
|
522
|
+
version: previous.version,
|
|
523
|
+
ordinal: previous.version,
|
|
524
|
+
createdAt: previous.createdAt,
|
|
525
|
+
...(args.candidate.name ?? previous.name
|
|
526
|
+
? { name: (args.candidate.name ?? previous.name) }
|
|
527
|
+
: {}),
|
|
528
|
+
...(baseId ? { baseId } : {}),
|
|
529
|
+
referenceIds: previous.referenceIds.length > 0
|
|
530
|
+
? [...previous.referenceIds]
|
|
531
|
+
: args.candidate.referenceIds,
|
|
532
|
+
fileChanges: args.candidate.fileChanges.length > 0
|
|
533
|
+
? args.candidate.fileChanges
|
|
534
|
+
: [...previous.fileChanges],
|
|
535
|
+
...(prompt ? { prompt } : {}),
|
|
536
|
+
...(meta ? { meta } : {}),
|
|
537
|
+
};
|
|
538
|
+
}
|
|
539
|
+
function mergeExistingCandidateMeta(previousMeta, candidateMeta) {
|
|
540
|
+
const previous = jsonRecord(previousMeta);
|
|
541
|
+
const candidate = jsonRecord(candidateMeta);
|
|
542
|
+
if (!previous) {
|
|
543
|
+
return candidateMeta;
|
|
544
|
+
}
|
|
545
|
+
if (!candidate) {
|
|
546
|
+
return previousMeta;
|
|
547
|
+
}
|
|
548
|
+
const previousTraces = jsonRecord(previous.traces);
|
|
549
|
+
const candidateTraces = jsonRecord(candidate.traces);
|
|
550
|
+
if (!previousTraces || !candidateTraces) {
|
|
551
|
+
return { ...previous, ...candidate };
|
|
552
|
+
}
|
|
553
|
+
const traces = {
|
|
554
|
+
...previousTraces,
|
|
555
|
+
...candidateTraces,
|
|
556
|
+
};
|
|
557
|
+
const candidateImproveTraces = Array.isArray(candidateTraces.improve)
|
|
558
|
+
? candidateTraces.improve
|
|
559
|
+
: [];
|
|
560
|
+
if (candidateImproveTraces.length === 0 && previousTraces.improve !== undefined) {
|
|
561
|
+
traces.improve = previousTraces.improve;
|
|
562
|
+
}
|
|
563
|
+
return {
|
|
564
|
+
...previous,
|
|
565
|
+
...candidate,
|
|
566
|
+
traces,
|
|
567
|
+
};
|
|
568
|
+
}
|
|
569
|
+
function candidateSourceMetadata(files) {
|
|
514
570
|
const sourceFiles = (files ?? [])
|
|
515
|
-
.filter((file) => /^
|
|
571
|
+
.filter((file) => /^candidates\/[^/]+\/candidate\.ya?ml$/iu.test(file.path))
|
|
516
572
|
.sort((left, right) => left.path.localeCompare(right.path))
|
|
517
573
|
.map((file) => ({
|
|
518
574
|
path: file.path,
|
|
@@ -536,14 +592,13 @@ function benchmarkSourceMetadata(files) {
|
|
|
536
592
|
}));
|
|
537
593
|
return sourceFiles.length > 0 ? { files: sourceFiles } : null;
|
|
538
594
|
}
|
|
539
|
-
function
|
|
595
|
+
function materializedCandidateFingerprint(spec, files) {
|
|
540
596
|
const hash = createHash("sha256");
|
|
541
|
-
hash.update("workbench-
|
|
542
|
-
hash.update("materialized\
|
|
543
|
-
hash.update(JSON.stringify(spec.run));
|
|
597
|
+
hash.update("workbench-candidate-v1\0");
|
|
598
|
+
hash.update("materialized\0");
|
|
544
599
|
hash.update("prepare");
|
|
545
|
-
hash.update(JSON.stringify(spec.
|
|
546
|
-
for (const file of
|
|
600
|
+
hash.update(JSON.stringify(spec.candidate.prepare ?? null));
|
|
601
|
+
for (const file of filterCandidateSourceFiles(files).slice().sort((left, right) => left.path.localeCompare(right.path))) {
|
|
547
602
|
hash.update("\0file\0");
|
|
548
603
|
hash.update(file.path);
|
|
549
604
|
hash.update("\0");
|
|
@@ -555,9 +610,9 @@ function materializedSubjectFingerprint(spec, files) {
|
|
|
555
610
|
}
|
|
556
611
|
return hash.digest("hex");
|
|
557
612
|
}
|
|
558
|
-
function
|
|
613
|
+
function materializedCandidateFiles(args) {
|
|
559
614
|
const byPath = new Map();
|
|
560
|
-
for (const file of
|
|
615
|
+
for (const file of filterCandidateSourceFiles(args.candidateRevisionFiles)) {
|
|
561
616
|
byPath.set(file.path, { ...file });
|
|
562
617
|
}
|
|
563
618
|
return [...byPath.values()].sort((left, right) => left.path.localeCompare(right.path));
|
|
@@ -565,12 +620,15 @@ function materializedSubjectFiles(args) {
|
|
|
565
620
|
function createEvaluationScorecard(args) {
|
|
566
621
|
const evaluation = args.evaluation;
|
|
567
622
|
return {
|
|
568
|
-
id: evaluationScorecardId(args.runId, args.
|
|
623
|
+
id: evaluationScorecardId(args.runId, args.candidate.id),
|
|
569
624
|
runId: args.runId,
|
|
570
625
|
benchmarkFingerprint: args.benchmarkFingerprint,
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
...(args.
|
|
626
|
+
candidateFingerprint: args.candidate.candidateFingerprint,
|
|
627
|
+
candidateId: args.candidate.id,
|
|
628
|
+
...(args.candidate.name ? { candidateName: args.candidate.name } : {}),
|
|
629
|
+
candidateVersion: args.candidate.version,
|
|
630
|
+
...(args.candidateRunId ? { candidateRunId: args.candidateRunId } : {}),
|
|
631
|
+
...(args.candidateRunName ? { candidateRunName: args.candidateRunName } : {}),
|
|
574
632
|
createdAt: args.createdAt,
|
|
575
633
|
updatedAt: evaluation.finishedAt ?? args.createdAt,
|
|
576
634
|
status: evaluation.status,
|
|
@@ -584,10 +642,10 @@ function createEvaluationScorecard(args) {
|
|
|
584
642
|
evaluation,
|
|
585
643
|
};
|
|
586
644
|
}
|
|
587
|
-
export function evaluationScorecardId(runId,
|
|
645
|
+
export function evaluationScorecardId(runId, candidateId) {
|
|
588
646
|
const runPart = runId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
|
|
589
|
-
const
|
|
590
|
-
return `eval_${runPart}_${
|
|
647
|
+
const candidatePart = candidateId.replace(/[^a-z0-9]+/giu, "_").replace(/^_+|_+$/gu, "").slice(-24);
|
|
648
|
+
return `eval_${runPart}_${candidatePart}`;
|
|
591
649
|
}
|
|
592
650
|
export function selectExecutionOutputFilesForInspection(args) {
|
|
593
651
|
return args.files.filter((file) => !isWorkbenchInternalOutputPath(file.path));
|
|
@@ -602,56 +660,89 @@ export function isWorkbenchInternalOutputPath(filePath) {
|
|
|
602
660
|
normalized === "exit_code" ||
|
|
603
661
|
/^[a-z_-]+_(stdout\.log|stderr\.log|exit_code)$/u.test(normalized));
|
|
604
662
|
}
|
|
605
|
-
export function
|
|
663
|
+
export function createOptimizerTraceInputFiles(args) {
|
|
606
664
|
const files = [];
|
|
607
|
-
const
|
|
665
|
+
const executions = [];
|
|
608
666
|
const jobs = args.jobs
|
|
609
|
-
.filter(
|
|
667
|
+
.filter(isOptimizerTraceInputJob)
|
|
610
668
|
.sort(compareTraceInputJobs);
|
|
611
|
-
|
|
669
|
+
jobs.forEach((job, index) => {
|
|
670
|
+
const sequence = String(index + 1).padStart(6, "0");
|
|
671
|
+
const executionPath = `executions/${sequence}`;
|
|
672
|
+
const operation = "engine.run";
|
|
612
673
|
const jobFiles = completedJobOutputFiles(job);
|
|
613
|
-
const
|
|
614
|
-
|
|
615
|
-
const
|
|
616
|
-
|
|
617
|
-
|
|
618
|
-
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
|
|
622
|
-
|
|
623
|
-
|
|
624
|
-
|
|
625
|
-
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
674
|
+
const requestFile = traceInputRequestFile(jobFiles, operation);
|
|
675
|
+
const resultFile = traceInputResultFile(jobFiles, operation);
|
|
676
|
+
const requestPath = `${executionPath}/request.json`;
|
|
677
|
+
const resultPath = `${executionPath}/result.json`;
|
|
678
|
+
const filesPath = `${executionPath}/files`;
|
|
679
|
+
files.push(textSurfaceFile(requestPath, requestFile?.content ?? `${JSON.stringify(traceInputRequestFallback(job, operation), null, 2)}\n`));
|
|
680
|
+
files.push(textSurfaceFile(resultPath, resultFile?.content ?? `${JSON.stringify(traceInputResultFallback(job, operation), null, 2)}\n`));
|
|
681
|
+
files.push(...jobFiles.map((file) => ({
|
|
682
|
+
...file,
|
|
683
|
+
path: normalizeRelativePath(`${filesPath}/${file.path}`),
|
|
684
|
+
})));
|
|
685
|
+
executions.push({
|
|
686
|
+
path: executionPath,
|
|
687
|
+
operation,
|
|
688
|
+
status: job.status,
|
|
689
|
+
candidateId: job.candidateId ?? readJobString(job.input, "candidateId") ?? null,
|
|
690
|
+
runId: job.runId,
|
|
691
|
+
jobId: job.id,
|
|
692
|
+
attemptIndex: readOptionalJobNumber(job.input, "attemptIndex") ?? null,
|
|
693
|
+
sampleIndex: readOptionalJobNumber(job.input, "sampleIndex") ?? null,
|
|
694
|
+
caseId: readJobString(job.input, "caseId") ?? null,
|
|
695
|
+
requestPath,
|
|
696
|
+
resultPath,
|
|
697
|
+
filesPath,
|
|
631
698
|
});
|
|
632
|
-
}
|
|
633
|
-
files.push(textSurfaceFile("
|
|
634
|
-
|
|
635
|
-
|
|
699
|
+
});
|
|
700
|
+
files.push(textSurfaceFile("index.json", `${JSON.stringify({
|
|
701
|
+
schema: "workbench.optimizer-traces.v1",
|
|
702
|
+
executions,
|
|
636
703
|
}, null, 2)}\n`));
|
|
637
704
|
return dedupeSurfaceFiles(files);
|
|
638
705
|
}
|
|
639
|
-
export function
|
|
640
|
-
const
|
|
641
|
-
|
|
642
|
-
|
|
706
|
+
export function evaluationMeanMetrics(evaluation) {
|
|
707
|
+
const entries = Object.entries(evaluation?.metrics ?? {})
|
|
708
|
+
.filter((entry) => Number.isFinite(entry[1].mean));
|
|
709
|
+
return entries.length > 0
|
|
710
|
+
? Object.fromEntries(entries.map(([key, stats]) => [key, stats.mean]))
|
|
711
|
+
: undefined;
|
|
712
|
+
}
|
|
713
|
+
export function candidateRecordWithoutDerivedFields(candidate) {
|
|
714
|
+
const { metrics: _metrics, candidateRunId: _candidateRunId, candidateRunName: _candidateRunName, ...record } = candidate;
|
|
715
|
+
return record;
|
|
716
|
+
}
|
|
717
|
+
export function candidateSummaryFromRecord(candidate) {
|
|
718
|
+
const { eval: _eval, prompt: _prompt, meta: _meta, ...summary } = candidateRecordWithoutDerivedFields(candidate);
|
|
719
|
+
return summary;
|
|
720
|
+
}
|
|
721
|
+
export function workbenchRunExecutionFingerprint(args) {
|
|
722
|
+
const hash = createHash("sha256");
|
|
723
|
+
hash.update("workbench-run-execution-v1\0");
|
|
724
|
+
hash.update(args.specVersionId ?? "");
|
|
725
|
+
hash.update("\0");
|
|
726
|
+
hash.update(args.environmentVersionId ?? "");
|
|
727
|
+
hash.update("\0");
|
|
728
|
+
hash.update(args.sourceYaml ?? "");
|
|
729
|
+
for (const file of (args.adapterFiles ?? []).slice().sort((left, right) => left.path.localeCompare(right.path))) {
|
|
730
|
+
hash.update("\0file\0");
|
|
731
|
+
hash.update(file.path);
|
|
732
|
+
hash.update("\0");
|
|
733
|
+
hash.update(file.kind);
|
|
734
|
+
hash.update("\0");
|
|
735
|
+
hash.update(file.encoding);
|
|
736
|
+
hash.update("\0");
|
|
737
|
+
hash.update(file.executable ? "1" : "0");
|
|
738
|
+
hash.update("\0");
|
|
739
|
+
hash.update(file.content);
|
|
643
740
|
}
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
648
|
-
|
|
649
|
-
metrics: subject.metrics ?? null,
|
|
650
|
-
fileChanges: subject.fileChanges,
|
|
651
|
-
eval: subject.eval ?? null,
|
|
652
|
-
prompt: subject.prompt ?? null,
|
|
653
|
-
};
|
|
654
|
-
return [textSurfaceFile(filePath, `${JSON.stringify(payload, null, 2)}\n`)];
|
|
741
|
+
return hash.digest("hex");
|
|
742
|
+
}
|
|
743
|
+
function isOptimizerTraceInputJob(job) {
|
|
744
|
+
return isTerminalExecutionJob(job) &&
|
|
745
|
+
workbenchExecutionPurpose(job) === "attempt";
|
|
655
746
|
}
|
|
656
747
|
function isTerminalExecutionJob(job) {
|
|
657
748
|
return job.kind === "execute" && (job.status === "succeeded" ||
|
|
@@ -662,20 +753,10 @@ function compareTraceInputJobs(left, right) {
|
|
|
662
753
|
const leftAttempt = readOptionalJobNumber(left.input, "attemptIndex") ?? -1;
|
|
663
754
|
const rightAttempt = readOptionalJobNumber(right.input, "attemptIndex") ?? -1;
|
|
664
755
|
return leftAttempt - rightAttempt ||
|
|
665
|
-
purposeSortKey(workbenchExecutionPurpose(left)) - purposeSortKey(workbenchExecutionPurpose(right)) ||
|
|
666
756
|
(readOptionalJobNumber(left.input, "sampleIndex") ?? -1) - (readOptionalJobNumber(right.input, "sampleIndex") ?? -1) ||
|
|
667
757
|
(readJobString(left.input, "caseId") ?? "").localeCompare(readJobString(right.input, "caseId") ?? "") ||
|
|
668
758
|
left.id.localeCompare(right.id);
|
|
669
759
|
}
|
|
670
|
-
function purposeSortKey(purpose) {
|
|
671
|
-
if (purpose === "improve") {
|
|
672
|
-
return 0;
|
|
673
|
-
}
|
|
674
|
-
if (purpose === "attempt") {
|
|
675
|
-
return 1;
|
|
676
|
-
}
|
|
677
|
-
return 3;
|
|
678
|
-
}
|
|
679
760
|
function completedJobOutputFiles(job) {
|
|
680
761
|
const output = jsonRecord(job.output);
|
|
681
762
|
if (!Array.isArray(output.files)) {
|
|
@@ -689,35 +770,70 @@ function completedJobOutputFiles(job) {
|
|
|
689
770
|
}
|
|
690
771
|
return files;
|
|
691
772
|
}
|
|
692
|
-
function
|
|
693
|
-
|
|
773
|
+
/**
 * Finds the first trace `request.json` file recorded for an operation.
 *
 * A file matches when its normalized path lies under `.workbench/traces/`,
 * ends with `/request.json`, is UTF-8 encoded, and its JSON body carries an
 * `operation` string equal to `operation`.
 *
 * @param {Array<{path: string, encoding: string, content: string}>} files - Candidate files.
 * @param {string} operation - Adapter operation name to look for.
 * @returns {object|null} The first matching file, or `null` when none matches.
 */
function traceInputRequestFile(files, operation) {
    return files.find((file) => {
        const normalized = normalizeRelativePath(file.path);
        // Cheap path/encoding checks run first; JSON parsing is the last test.
        return normalized.startsWith(".workbench/traces/") &&
            normalized.endsWith("/request.json") &&
            file.encoding === "utf8" &&
            traceJsonOperation(file) === operation;
    }) ?? null;
}
|
|
782
|
+
/**
 * Finds the first trace `result.json` file recorded for an operation.
 *
 * A file matches when its normalized path lies under `.workbench/traces/`,
 * ends with `/result.json`, is UTF-8 encoded, and its JSON body carries an
 * `operation` string equal to `operation`.
 *
 * @param {Array<{path: string, encoding: string, content: string}>} files - Candidate files.
 * @param {string} operation - Adapter operation name to look for.
 * @returns {object|null} The first matching file, or `null` when none matches.
 */
function traceInputResultFile(files, operation) {
    return files.find((file) => {
        const normalized = normalizeRelativePath(file.path);
        // Cheap path/encoding checks run first; JSON parsing is the last test.
        return normalized.startsWith(".workbench/traces/") &&
            normalized.endsWith("/result.json") &&
            file.encoding === "utf8" &&
            traceJsonOperation(file) === operation;
    }) ?? null;
}
|
|
791
|
+
/**
 * Reads the `operation` field from a file whose content is JSON text.
 *
 * @param {{content: string}} file - File whose `content` is parsed as JSON.
 * @returns {string|null} The `operation` value when it is a string; `null`
 *   when the field is absent, not a string, or the content cannot be parsed.
 */
function traceJsonOperation(file) {
    try {
        const parsed = JSON.parse(file.content);
        return typeof parsed?.operation === "string" ? parsed.operation : null;
    }
    catch {
        // Unparseable content is treated as "no operation", not as an error.
        return null;
    }
}
|
|
800
|
+
/**
 * Builds a request-shaped object for an operation directly from job data.
 *
 * NOTE(review): judging by the name this is used when no recorded
 * `request.json` trace file is available — confirm against the caller.
 *
 * @param {object} job - Job record; `job.input.execution` supplies the
 *   execution id and adapter invocation when present.
 * @param {string} operation - Adapter operation name to stamp on the request.
 * @returns {object} Object tagged with protocol `workbench.adapter.v3`.
 *   Missing candidate/attempt values are reported as `null`.
 */
function traceInputRequestFallback(job, operation) {
    const execution = jsonRecord(jsonRecord(job.input).execution);
    return {
        protocol: "workbench.adapter.v3",
        // Fall back to the job id when the execution record has no string id.
        id: typeof execution.id === "string" ? execution.id : job.id,
        jobId: job.id,
        operation,
        invocation: jsonRecord(execution.adapter),
        context: {
            candidate: {
                id: job.candidateId ?? readJobString(job.input, "candidateId") ?? null,
            },
            attempt: {
                attemptIndex: readOptionalJobNumber(job.input, "attemptIndex") ?? null,
                sampleIndex: readOptionalJobNumber(job.input, "sampleIndex") ?? null,
                caseId: readJobString(job.input, "caseId") ?? null,
            },
        },
    };
}
|
|
712
|
-
function
|
|
713
|
-
const
|
|
714
|
-
const
|
|
715
|
-
const
|
|
820
|
+
/**
 * Builds a result-shaped object for an operation directly from job output.
 *
 * NOTE(review): judging by the name this is used when no recorded
 * `result.json` trace file is available — confirm against the caller.
 *
 * The `value` comes from `output.candidatePatch` for `candidate.improve` and
 * from `output.result` for `engine.run`; other operations carry no value.
 * Optional fields are only present when the job output provides them.
 *
 * @param {object} job - Job record with `status`, `output` and optional `error`.
 * @param {string} operation - Adapter operation name to stamp on the result.
 * @returns {object} Object tagged with protocol `workbench.adapter-result.v1`.
 */
function traceInputResultFallback(job, operation) {
    const output = jsonRecord(job.output);
    // A succeeded job still counts as failed when its output says ok === false.
    const ok = job.status === "succeeded" && output.ok !== false;
    const value = operation === "candidate.improve"
        ? jsonRecord(output.candidatePatch)
        : operation === "engine.run"
            ? jsonRecord(output.result)
            : {};
    return {
        protocol: "workbench.adapter-result.v1",
        operation,
        ok,
        ...(Object.keys(value).length > 0 ? { value } : {}),
        ...(typeof output.summary === "string" ? { summary: output.summary } : {}),
        ...(output.feedback !== undefined ? { feedback: output.feedback } : {}),
        ...(output.usage !== undefined ? { usage: output.usage } : {}),
        ...(!ok ? { error: job.error ?? "Execution did not complete successfully." } : {}),
    };
}
|
|
723
839
|
function textSurfaceFile(path, content) {
|
|
@@ -744,7 +860,7 @@ export function buildWorkbenchProjectSourceFiles(input) {
|
|
|
744
860
|
...(input.specFiles
|
|
745
861
|
? input.specFiles.map((file) => ({ ...file }))
|
|
746
862
|
: [textSurfaceFile("benchmark.yaml", input.specSource ?? "")]),
|
|
747
|
-
...prefixProjectSourceFiles(input.
|
|
863
|
+
...prefixProjectSourceFiles(input.candidateFiles, input.candidateFilesPath),
|
|
748
864
|
...prefixProjectSourceFiles(input.engineResolveFiles, input.engineResolveFilesPath),
|
|
749
865
|
...(input.adapterFiles ?? []).map((file) => ({ ...file })),
|
|
750
866
|
...(input.dockerfiles ?? []).map((file) => ({ ...file })),
|
|
@@ -772,18 +888,18 @@ function prefixProjectSourceFiles(files, rootPath) {
|
|
|
772
888
|
};
|
|
773
889
|
});
|
|
774
890
|
}
|
|
775
|
-
export function
|
|
891
|
+
/**
 * Reports whether a path counts as a candidate source file.
 *
 * After normalization, the `.workbench` entry itself, anything beneath
 * `.workbench/`, and `workbench-result.json` are excluded; every other path
 * is accepted.
 *
 * @param {string} filePath - Relative file path.
 * @returns {boolean} `true` when the path is not one of the excluded entries.
 */
export function isCandidateSourceFilePath(filePath) {
    const normalized = normalizeRelativePath(filePath);
    return (normalized !== ".workbench" &&
        !normalized.startsWith(".workbench/") &&
        normalized !== "workbench-result.json");
}
|
|
781
|
-
export function
|
|
897
|
+
/**
 * Keeps only the files whose path passes `isCandidateSourceFilePath`.
 *
 * @param {Array<{path: string}>} files - Files to filter.
 * @returns {Array<object>} New array of shallow copies of the accepted
 *   files; the input array and its entries are not modified.
 */
export function filterCandidateSourceFiles(files) {
    return files
        .filter((file) => isCandidateSourceFilePath(file.path))
        .map((file) => ({ ...file }));
}
|
|
786
|
-
export function
|
|
902
|
+
export function buildCandidateLineage(args) {
|
|
787
903
|
const orderedSummaries = args.summaries.slice().sort((left, right) => {
|
|
788
904
|
const createdAt = left.createdAt.localeCompare(right.createdAt);
|
|
789
905
|
return createdAt !== 0 ? createdAt : left.id.localeCompare(right.id);
|
|
@@ -856,7 +972,7 @@ function globPatternToRegExp(pattern) {
|
|
|
856
972
|
/**
 * Escapes regular-expression metacharacters in a string so that it can be
 * embedded in a pattern and matched literally.
 *
 * @param {string} value - Text to escape.
 * @returns {string} `value` with each of `\ ^ $ . * + ? ( ) [ ] { } |`
 *   prefixed by a backslash.
 */
function escapeRegExp(value) {
    // "$&" re-inserts the matched character after the added backslash.
    return value.replace(/[\\^$.*+?()[\]{}|]/gu, "\\$&");
}
|
|
859
|
-
export function
|
|
975
|
+
export function summarizeCandidateFiles(files, changedPaths = files.map((file) => file.path)) {
|
|
860
976
|
const changed = new Set(changedPaths);
|
|
861
977
|
return [...files]
|
|
862
978
|
.sort((left, right) => left.path.localeCompare(right.path))
|
|
@@ -875,7 +991,7 @@ export function summarizeSubjectFiles(files, changedPaths = files.map((file) =>
|
|
|
875
991
|
};
|
|
876
992
|
});
|
|
877
993
|
}
|
|
878
|
-
export function
|
|
994
|
+
export function createCandidateFilePreview(args) {
|
|
879
995
|
if (args.view === "diff") {
|
|
880
996
|
throw new Error("Diff previews require explicit before and after file content.");
|
|
881
997
|
}
|
|
@@ -901,14 +1017,14 @@ export function createSubjectFilePreview(args) {
|
|
|
901
1017
|
export function createCaseReview(args) {
|
|
902
1018
|
const preferredSampleIndex = uniqueExecutionSampleIndex(args.executions ?? []);
|
|
903
1019
|
const sampleMatchesCase = (sample) => (sample.cases ?? []).some((entry) => entry.id === args.caseId);
|
|
904
|
-
const samples = args.
|
|
1020
|
+
const samples = args.candidate.eval?.samples ?? [];
|
|
905
1021
|
const sampleResult = samples.find((sample) => typeof preferredSampleIndex === "number" &&
|
|
906
1022
|
sample.index === preferredSampleIndex &&
|
|
907
1023
|
sampleMatchesCase(sample)) ?? samples.find(sampleMatchesCase);
|
|
908
1024
|
const caseResult = sampleResult?.cases?.find((entry) => entry.id === args.caseId);
|
|
909
1025
|
if (!sampleResult && (args.executions?.length ?? 0) > 0) {
|
|
910
1026
|
return {
|
|
911
|
-
|
|
1027
|
+
candidateId: args.candidate.id,
|
|
912
1028
|
caseId: args.caseId,
|
|
913
1029
|
caseLabel: args.caseId,
|
|
914
1030
|
...(typeof preferredSampleIndex === "number"
|
|
@@ -920,13 +1036,13 @@ export function createCaseReview(args) {
|
|
|
920
1036
|
};
|
|
921
1037
|
}
|
|
922
1038
|
if (!sampleResult) {
|
|
923
|
-
throw new Error(`Case ${args.caseId} was not found on
|
|
1039
|
+
throw new Error(`Case ${args.caseId} was not found on candidate ${args.candidate.id}.`);
|
|
924
1040
|
}
|
|
925
1041
|
const durationMs = typeof caseResult?.durationMs === "number"
|
|
926
1042
|
? caseResult.durationMs
|
|
927
1043
|
: undefined;
|
|
928
1044
|
return {
|
|
929
|
-
|
|
1045
|
+
candidateId: args.candidate.id,
|
|
930
1046
|
caseId: caseResult?.id ?? args.caseId,
|
|
931
1047
|
caseLabel: caseResult?.label ?? args.caseId,
|
|
932
1048
|
sampleId: sampleResult.id,
|
|
@@ -965,37 +1081,39 @@ function parseAuthoredWorkbenchSourceSpec(source) {
|
|
|
965
1081
|
}
|
|
966
1082
|
const resolved = resolveWorkbenchResolvedSourceYamlInternal(source);
|
|
967
1083
|
return {
|
|
968
|
-
version:
|
|
1084
|
+
version: 4,
|
|
969
1085
|
benchmark: {
|
|
970
1086
|
name: resolved.benchmark.name,
|
|
971
1087
|
description: resolved.benchmark.description,
|
|
972
1088
|
engine: authoredAdapterSpecFromInvocation(resolved.engine),
|
|
973
1089
|
},
|
|
974
|
-
|
|
975
|
-
name: resolved.
|
|
976
|
-
description: resolved.
|
|
977
|
-
files: { path: resolved.
|
|
978
|
-
...(resolved.
|
|
979
|
-
|
|
980
|
-
|
|
981
|
-
|
|
982
|
-
|
|
983
|
-
|
|
984
|
-
|
|
985
|
-
...(resolved.optimizer.description ? { description: resolved.optimizer.description } : {}),
|
|
986
|
-
edits: [...resolved.optimizer.edits],
|
|
987
|
-
improve: improveSpecFromInvocation(resolved.improve),
|
|
1090
|
+
candidate: {
|
|
1091
|
+
name: resolved.candidate.name,
|
|
1092
|
+
description: resolved.candidate.description,
|
|
1093
|
+
files: { path: resolved.candidate.files.path },
|
|
1094
|
+
...(resolved.candidate.prepare ? { prepare: { ...resolved.candidate.prepare } } : {}),
|
|
1095
|
+
defaultRun: resolved.candidate.defaultRun,
|
|
1096
|
+
runs: Object.fromEntries(Object.entries(resolved.candidate.runs).map(([runId, run]) => [
|
|
1097
|
+
runId,
|
|
1098
|
+
{
|
|
1099
|
+
name: run.name,
|
|
1100
|
+
...authoredAdapterSpecFromInvocation(run),
|
|
988
1101
|
},
|
|
989
|
-
|
|
990
|
-
|
|
1102
|
+
])),
|
|
1103
|
+
...(resolved.candidate.improve
|
|
1104
|
+
? {
|
|
1105
|
+
improve: {
|
|
1106
|
+
edits: [...resolved.candidate.improve.edits],
|
|
1107
|
+
...improveSpecFromInvocation(resolved.improve),
|
|
1108
|
+
},
|
|
1109
|
+
}
|
|
1110
|
+
: {}),
|
|
1111
|
+
},
|
|
991
1112
|
};
|
|
992
1113
|
}
|
|
993
1114
|
/**
 * Converts an improve invocation into its authored adapter spec.
 *
 * This simply delegates to `authoredAdapterSpecFromInvocation`.
 *
 * @param {object} invocation - Invocation to convert.
 * @returns {object} Whatever `authoredAdapterSpecFromInvocation` returns.
 */
function improveSpecFromInvocation(invocation) {
    return authoredAdapterSpecFromInvocation(invocation);
}
|
|
996
|
-
function runSpecFromInvocation(invocation) {
|
|
997
|
-
return authoredAdapterSpecFromInvocation(invocation);
|
|
998
|
-
}
|
|
999
1117
|
function authoredAdapterSpecFromInvocation(invocation) {
|
|
1000
1118
|
const config = jsonRecord(invocation.with);
|
|
1001
1119
|
return {
|
|
@@ -1048,9 +1166,9 @@ export function createWorkbenchRunWorkload(args) {
|
|
|
1048
1166
|
if (!purpose) {
|
|
1049
1167
|
throw new Error(`Unsupported runtime job kind: ${args.job.kind}`);
|
|
1050
1168
|
}
|
|
1051
|
-
const
|
|
1052
|
-
if (!
|
|
1053
|
-
throw new Error(`${purpose} execution job is missing
|
|
1169
|
+
const candidateId = readJobString(args.job.input, "candidateId") ?? args.job.candidateId;
|
|
1170
|
+
if (!candidateId) {
|
|
1171
|
+
throw new Error(`${purpose} execution job is missing candidateId.`);
|
|
1054
1172
|
}
|
|
1055
1173
|
const attemptIndex = readRequiredJobNumber(args.job.input, "attemptIndex", `${purpose} execution job`);
|
|
1056
1174
|
const sampleIndex = purpose === "improve"
|
|
@@ -1066,7 +1184,7 @@ export function createWorkbenchRunWorkload(args) {
|
|
|
1066
1184
|
? engineCaseFilesForRuntimeInput({ spec: args.spec, engineCase })
|
|
1067
1185
|
: [];
|
|
1068
1186
|
const engineCaseSpec = engineCase?.case;
|
|
1069
|
-
const initial =
|
|
1187
|
+
const initial = createInitialCandidateFiles({
|
|
1070
1188
|
baseFiles: args.baseFiles,
|
|
1071
1189
|
spec: args.spec,
|
|
1072
1190
|
attemptIndex,
|
|
@@ -1074,10 +1192,10 @@ export function createWorkbenchRunWorkload(args) {
|
|
|
1074
1192
|
return {
|
|
1075
1193
|
job: args.job,
|
|
1076
1194
|
spec: args.spec,
|
|
1077
|
-
|
|
1195
|
+
candidateId,
|
|
1078
1196
|
attemptIndex,
|
|
1079
1197
|
sampleIndex,
|
|
1080
|
-
|
|
1198
|
+
candidateFiles: initial.files,
|
|
1081
1199
|
caseId,
|
|
1082
1200
|
engineResolveFiles: selectedEngineResolveFiles,
|
|
1083
1201
|
traceFiles: (args.traceFiles ?? []).map((file) => ({ ...file })),
|
|
@@ -1088,22 +1206,22 @@ export function createWorkbenchRunWorkload(args) {
|
|
|
1088
1206
|
baseId: readJobString(args.job.input, "baseId"),
|
|
1089
1207
|
};
|
|
1090
1208
|
}
|
|
1091
|
-
function
|
|
1092
|
-
const editablePaths =
|
|
1209
|
+
function createInitialCandidateFiles(args) {
|
|
1210
|
+
const editablePaths = improveEdits(args.spec).map(normalizeRelativePath);
|
|
1093
1211
|
const editPath = editablePaths[0];
|
|
1094
|
-
const
|
|
1212
|
+
const candidatePaths = editPath ? [editPath] : [];
|
|
1095
1213
|
const files = args.baseFiles.length > 0
|
|
1096
1214
|
? args.baseFiles.map((file) => ({ ...file }))
|
|
1097
1215
|
: editPath
|
|
1098
1216
|
? normalizeSurfaceFiles([{ path: editPath, content: "" }])
|
|
1099
1217
|
: [];
|
|
1100
1218
|
const prompt = [
|
|
1101
|
-
`Run the
|
|
1102
|
-
`Attempt ${args.attemptIndex + 1} uses ${
|
|
1219
|
+
`Run the candidate workload for benchmark: ${args.spec.benchmark.description}`,
|
|
1220
|
+
`Attempt ${args.attemptIndex + 1} uses ${formatImproveSummary(args.spec)}; the improve adapter may edit the candidate before Workbench scores it.`,
|
|
1103
1221
|
].join("\n");
|
|
1104
1222
|
const byPath = new Map(files.map((file) => [file.path, file]));
|
|
1105
1223
|
if (editPath &&
|
|
1106
|
-
![...byPath.keys()].some((filePath) =>
|
|
1224
|
+
![...byPath.keys()].some((filePath) => candidatePaths.includes(filePath))) {
|
|
1107
1225
|
byPath.set(editPath, {
|
|
1108
1226
|
path: editPath,
|
|
1109
1227
|
kind: "text",
|
|
@@ -1167,7 +1285,7 @@ export function workbenchExecutionExecutorForRuntimeInput(args) {
|
|
|
1167
1285
|
}
|
|
1168
1286
|
function adapterOperationForExecutionPurpose(purpose) {
|
|
1169
1287
|
if (purpose === "improve") {
|
|
1170
|
-
return "
|
|
1288
|
+
return "candidate.improve";
|
|
1171
1289
|
}
|
|
1172
1290
|
if (purpose === "attempt") {
|
|
1173
1291
|
return "engine.run";
|
|
@@ -1281,8 +1399,8 @@ function normalizeRuntimeControlInputs(value) {
|
|
|
1281
1399
|
}
|
|
1282
1400
|
const record = value;
|
|
1283
1401
|
const inputs = {};
|
|
1284
|
-
if (hasOwn(record, "
|
|
1285
|
-
inputs.
|
|
1402
|
+
if (hasOwn(record, "candidate")) {
|
|
1403
|
+
inputs.candidate = normalizeRuntimeControlFiles(record.candidate, "inputs.candidate");
|
|
1286
1404
|
}
|
|
1287
1405
|
if (hasOwn(record, "case")) {
|
|
1288
1406
|
inputs.case = normalizeRuntimeControlFiles(record.case, "inputs.case");
|
|
@@ -1326,8 +1444,8 @@ function normalizeRuntimeControlOperation(value, label) {
|
|
|
1326
1444
|
const operation = record.operation;
|
|
1327
1445
|
if (operation !== "engine.resolve" &&
|
|
1328
1446
|
operation !== "engine.run" &&
|
|
1329
|
-
operation !== "
|
|
1330
|
-
operation !== "
|
|
1447
|
+
operation !== "candidate.run" &&
|
|
1448
|
+
operation !== "candidate.improve") {
|
|
1331
1449
|
throw new Error(`Workbench runtime-control ${label}.operation is invalid.`);
|
|
1332
1450
|
}
|
|
1333
1451
|
const invocation = record.invocation;
|
|
@@ -1415,7 +1533,7 @@ export async function executeAdapterInCurrentRuntime(args, execution, startedAt,
|
|
|
1415
1533
|
};
|
|
1416
1534
|
try {
|
|
1417
1535
|
if (execution.purpose === "improve") {
|
|
1418
|
-
return await
|
|
1536
|
+
return await executeCandidateRevisionExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
|
|
1419
1537
|
}
|
|
1420
1538
|
if (execution.purpose === "attempt") {
|
|
1421
1539
|
return await executeAttemptExecutionInCurrentRuntime(runtimeInput, execution, startedAt, capability, eventPublisher);
|
|
@@ -1589,22 +1707,22 @@ function completedJobFromSandboxResult(fallbackJob, startedAt, result) {
|
|
|
1589
1707
|
}
|
|
1590
1708
|
return attachSandboxMetadataToJob(failWorkbenchRunJob(fallbackJob, result.startedAt || startedAt, result.error ?? `Sandbox execution ${result.status}.`, result.finishedAt), asRuntimeRecord(result.metadata).sandbox);
|
|
1591
1709
|
}
|
|
1592
|
-
async function
|
|
1710
|
+
async function executeCandidateRevisionExecutionInCurrentRuntime(args, execution, startedAt, capability, eventPublisher) {
|
|
1593
1711
|
const { workload, result } = await runHostedProtocolExecutionResult(args, execution, startedAt, capability, eventPublisher);
|
|
1594
1712
|
if (result.error || (result.exitCode ?? 0) !== 0) {
|
|
1595
1713
|
return failWorkbenchRunJob(args.job, startedAt, result.error ?? `Adapter ${execution.adapter.use} exited with status ${result.exitCode}.`, result.finishedAt, result);
|
|
1596
1714
|
}
|
|
1597
1715
|
const finishedAt = result.finishedAt ?? new Date().toISOString();
|
|
1598
|
-
const
|
|
1599
|
-
if (
|
|
1600
|
-
return failWorkbenchRunJob(args.job, startedAt, `${execution.adapter.use === "command" ? "Command improve adapter" : `Adapter ${execution.adapter.use}`} completed without changing
|
|
1601
|
-
}
|
|
1602
|
-
const
|
|
1603
|
-
baseFiles: workload.
|
|
1604
|
-
patch:
|
|
1605
|
-
edits:
|
|
1716
|
+
const candidatePatch = createCandidatePatchFromResult(result, args.spec);
|
|
1717
|
+
if (candidatePatch.fileChanges.length === 0) {
|
|
1718
|
+
return failWorkbenchRunJob(args.job, startedAt, `${execution.adapter.use === "command" ? "Command improve adapter" : `Adapter ${execution.adapter.use}`} completed without changing candidate files covered by improve edits.`, finishedAt, result);
|
|
1719
|
+
}
|
|
1720
|
+
const candidateRevisionFiles = applyWorkbenchCandidatePatch({
|
|
1721
|
+
baseFiles: workload.candidateFiles,
|
|
1722
|
+
patch: candidatePatch,
|
|
1723
|
+
edits: requireImproveEdits(args.spec),
|
|
1606
1724
|
});
|
|
1607
|
-
const usage = assignUsageRole("
|
|
1725
|
+
const usage = assignUsageRole("improver", result.usage);
|
|
1608
1726
|
return {
|
|
1609
1727
|
...args.job,
|
|
1610
1728
|
status: "succeeded",
|
|
@@ -1616,13 +1734,13 @@ async function executeSubjectRevisionExecutionInCurrentRuntime(args, execution,
|
|
|
1616
1734
|
ok: true,
|
|
1617
1735
|
executionId: execution.id,
|
|
1618
1736
|
purpose: execution.purpose,
|
|
1619
|
-
|
|
1737
|
+
candidateId: workload.candidateId,
|
|
1620
1738
|
attemptIndex: workload.attemptIndex,
|
|
1621
1739
|
baseId: workload.baseId,
|
|
1622
1740
|
prompt: workload.prompt,
|
|
1623
|
-
|
|
1624
|
-
fileChanges:
|
|
1625
|
-
files:
|
|
1741
|
+
candidatePatch,
|
|
1742
|
+
fileChanges: candidatePatch.fileChanges,
|
|
1743
|
+
files: candidateRevisionFiles,
|
|
1626
1744
|
traces: traceFilePaths(result.files),
|
|
1627
1745
|
...(usage ? { usage } : {}),
|
|
1628
1746
|
...(result.summary !== undefined ? { summary: result.summary } : {}),
|
|
@@ -1655,7 +1773,7 @@ async function executeAttemptExecutionInCurrentRuntime(args, execution, startedA
|
|
|
1655
1773
|
const finishedAt = workloadResult.finishedAt ?? new Date().toISOString();
|
|
1656
1774
|
const usage = attemptUsageSummary(workloadResult.usage, engineResult.usage);
|
|
1657
1775
|
const sample = evaluateSample({
|
|
1658
|
-
|
|
1776
|
+
candidateId: workload.candidateId,
|
|
1659
1777
|
files: workloadResult.files,
|
|
1660
1778
|
engineResolveFiles: workload.engineResolveFiles,
|
|
1661
1779
|
spec: workload.spec,
|
|
@@ -1682,7 +1800,7 @@ async function executeAttemptExecutionInCurrentRuntime(args, execution, startedA
|
|
|
1682
1800
|
ok: true,
|
|
1683
1801
|
executionId: execution.id,
|
|
1684
1802
|
purpose: execution.purpose,
|
|
1685
|
-
|
|
1803
|
+
candidateId: workload.candidateId,
|
|
1686
1804
|
attemptIndex: workload.attemptIndex,
|
|
1687
1805
|
sampleIndex: workload.sampleIndex,
|
|
1688
1806
|
caseId: workload.caseId,
|
|
@@ -1725,7 +1843,7 @@ export async function executeRuntimeControlOperationSequenceInCurrentRuntime(arg
|
|
|
1725
1843
|
? { adapterAuthEnv: adapterAuth.env }
|
|
1726
1844
|
: {}),
|
|
1727
1845
|
}, workload, args.runtimeControlOperation.operations.map((operation, index) => runtimeControlStepForOperation(operation, index, args.adapterManifests)), startedAt, {
|
|
1728
|
-
|
|
1846
|
+
runCandidatePrepare: args.runtimeControlOperation.prepare ?? false,
|
|
1729
1847
|
workspaceFiles: args.runtimeControlOperation.inputs?.workspace ?? [],
|
|
1730
1848
|
outputFiles: args.runtimeControlOperation.inputs?.output ?? [],
|
|
1731
1849
|
collectWorkspace: args.runtimeControlOperation.collectWorkspace ?? false,
|
|
@@ -1823,7 +1941,7 @@ function createRuntimeControlSandboxInput(args, request) {
|
|
|
1823
1941
|
const parentInput = asRuntimeRecord(args.job.input);
|
|
1824
1942
|
const publicFiles = runtimeControlInputFiles(request.inputs, "case", parentWorkload.engineCase ? engineCasePublicFiles(parentWorkload.engineCase) : []);
|
|
1825
1943
|
const privateFiles = runtimeControlInputFiles(request.inputs, "enginePrivate", parentWorkload.engineCase ? engineCasePrivateFiles(parentWorkload.engineCase) : []);
|
|
1826
|
-
const
|
|
1944
|
+
const candidateFiles = runtimeControlInputFiles(request.inputs, "candidate", parentWorkload.candidateFiles);
|
|
1827
1945
|
const traceFiles = runtimeControlInputFiles(request.inputs, "traces", parentWorkload.traceFiles);
|
|
1828
1946
|
const adapter = request.operations[request.operations.length - 1]?.invocation;
|
|
1829
1947
|
const childExecution = {
|
|
@@ -1866,7 +1984,7 @@ function createRuntimeControlSandboxInput(args, request) {
|
|
|
1866
1984
|
const childArgs = {
|
|
1867
1985
|
...args,
|
|
1868
1986
|
job: childJob,
|
|
1869
|
-
baseFiles:
|
|
1987
|
+
baseFiles: candidateFiles,
|
|
1870
1988
|
engineResolveFiles: [...publicFiles, ...privateFiles],
|
|
1871
1989
|
engineCases: [engineCase],
|
|
1872
1990
|
traceFiles,
|
|
@@ -1890,10 +2008,10 @@ function runtimeControlStepForOperation(operation, index, manifests = []) {
|
|
|
1890
2008
|
...(operation.invocation.auth !== undefined ? { auth: operation.invocation.auth } : {}),
|
|
1891
2009
|
}, operation.operation, manifests).command;
|
|
1892
2010
|
return {
|
|
1893
|
-
kind: operation.operation === "
|
|
1894
|
-
? "
|
|
1895
|
-
: operation.operation === "
|
|
1896
|
-
? "
|
|
2011
|
+
kind: operation.operation === "candidate.run"
|
|
2012
|
+
? "candidate"
|
|
2013
|
+
: operation.operation === "candidate.improve"
|
|
2014
|
+
? "improver"
|
|
1897
2015
|
: "engine",
|
|
1898
2016
|
label: operation.label ?? `${operation.operation.replace(".", "_")}_${index + 1}`,
|
|
1899
2017
|
operation: operation.operation,
|
|
@@ -1960,8 +2078,8 @@ function isWorkbenchAdapterOperationResult(value) {
|
|
|
1960
2078
|
return record.protocol === "workbench.adapter-result.v1" &&
|
|
1961
2079
|
(record.operation === "engine.resolve" ||
|
|
1962
2080
|
record.operation === "engine.run" ||
|
|
1963
|
-
record.operation === "
|
|
1964
|
-
record.operation === "
|
|
2081
|
+
record.operation === "candidate.run" ||
|
|
2082
|
+
record.operation === "candidate.improve");
|
|
1965
2083
|
}
|
|
1966
2084
|
function cloneSurfaceFiles(files) {
|
|
1967
2085
|
return files.map((file) => ({ ...file, path: normalizeRelativePath(file.path) }));
|
|
@@ -2040,9 +2158,11 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
|
|
|
2040
2158
|
const stepTimeoutMs = environmentVersion
|
|
2041
2159
|
? environmentVersionTimeoutMs(environmentVersion)
|
|
2042
2160
|
: 5 * 60 * 1000;
|
|
2043
|
-
const
|
|
2044
|
-
|
|
2045
|
-
|
|
2161
|
+
const shouldRunCandidatePrepare = options.runCandidatePrepare ??
|
|
2162
|
+
(readWorkloadExecutionPurpose(workload) === "attempt" &&
|
|
2163
|
+
steps.some((step) => step.executor === "sandbox"));
|
|
2164
|
+
if (shouldRunCandidatePrepare) {
|
|
2165
|
+
await runCandidatePrepareCommand({
|
|
2046
2166
|
root: workspace.root,
|
|
2047
2167
|
workload,
|
|
2048
2168
|
execution,
|
|
@@ -2081,6 +2201,9 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
|
|
|
2081
2201
|
});
|
|
2082
2202
|
const operationResult = await readWorkbenchAdapterOperationResult(outputDir(workspace.root), step.operation);
|
|
2083
2203
|
assertWorkbenchAdapterOperationResultOk(operationResult, `Adapter ${step.adapter?.use ?? execution.adapter.use} ${step.operation}`);
|
|
2204
|
+
await writeSurfaceFiles(outputDir(workspace.root), [
|
|
2205
|
+
textSurfaceFile(`.workbench/traces/${workload.job.id}/${step.label}/result.json`, `${JSON.stringify(operationResult, null, 2)}\n`),
|
|
2206
|
+
]);
|
|
2084
2207
|
operationResults.push(operationResult);
|
|
2085
2208
|
await publishCommandStepEvent(options.eventPublisher, {
|
|
2086
2209
|
step: step.label,
|
|
@@ -2132,19 +2255,19 @@ async function runHostedCommandExecutionSteps(args, workload, steps, startedAt,
|
|
|
2132
2255
|
await workspace.cleanup();
|
|
2133
2256
|
}
|
|
2134
2257
|
}
|
|
2135
|
-
async function
|
|
2136
|
-
const command = args.workload.spec.
|
|
2258
|
+
async function runCandidatePrepareCommand(args) {
|
|
2259
|
+
const command = args.workload.spec.candidate.prepare?.command;
|
|
2137
2260
|
if (!command) {
|
|
2138
2261
|
return;
|
|
2139
2262
|
}
|
|
2140
|
-
const role = args.execution.purpose === "improve" ? "
|
|
2263
|
+
const role = args.execution.purpose === "improve" ? "improver" : "runner";
|
|
2141
2264
|
await publishCommandStepEvent(args.eventPublisher, {
|
|
2142
|
-
step: "
|
|
2265
|
+
step: "candidate_prepare",
|
|
2143
2266
|
status: "started",
|
|
2144
2267
|
role,
|
|
2145
2268
|
});
|
|
2146
2269
|
try {
|
|
2147
|
-
const shellCommand = createHostedWorkloadShellCommand(args.root, command, "
|
|
2270
|
+
const shellCommand = createHostedWorkloadShellCommand(args.root, command, "candidate_prepare");
|
|
2148
2271
|
await args.execFileAsync("sh", ["-c", shellCommand], {
|
|
2149
2272
|
cwd: args.root,
|
|
2150
2273
|
env: createHostedWorkloadPrepareEnv(args.root),
|
|
@@ -2152,20 +2275,20 @@ async function runSubjectPrepareCommand(args) {
|
|
|
2152
2275
|
timeout: args.timeoutMs,
|
|
2153
2276
|
});
|
|
2154
2277
|
await publishCommandStepEvent(args.eventPublisher, {
|
|
2155
|
-
step: "
|
|
2278
|
+
step: "candidate_prepare",
|
|
2156
2279
|
status: "succeeded",
|
|
2157
2280
|
role,
|
|
2158
2281
|
});
|
|
2159
2282
|
}
|
|
2160
2283
|
catch (error) {
|
|
2161
2284
|
await publishCommandStepEvent(args.eventPublisher, {
|
|
2162
|
-
step: "
|
|
2285
|
+
step: "candidate_prepare",
|
|
2163
2286
|
status: "failed",
|
|
2164
2287
|
exitCode: readExitCode(error),
|
|
2165
2288
|
error: error instanceof Error ? error.message : String(error),
|
|
2166
2289
|
role,
|
|
2167
2290
|
});
|
|
2168
|
-
throw new Error(`
|
|
2291
|
+
throw new Error(`Candidate prepare command failed: ${error instanceof Error ? error.message : String(error)}`);
|
|
2169
2292
|
}
|
|
2170
2293
|
}
|
|
2171
2294
|
async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
|
|
@@ -2204,10 +2327,10 @@ async function createRuntimeWorkspaceRoot(args, fs, os, path, prefix) {
|
|
|
2204
2327
|
};
|
|
2205
2328
|
}
|
|
2206
2329
|
function stepEventRole(step) {
|
|
2207
|
-
if (step.kind === "
|
|
2208
|
-
return "
|
|
2330
|
+
if (step.kind === "improver") {
|
|
2331
|
+
return "improver";
|
|
2209
2332
|
}
|
|
2210
|
-
if (step.kind === "
|
|
2333
|
+
if (step.kind === "candidate") {
|
|
2211
2334
|
return "runner";
|
|
2212
2335
|
}
|
|
2213
2336
|
if (step.kind === "engine") {
|
|
@@ -2219,10 +2342,10 @@ function adapterOperationUsageSummary(result) {
|
|
|
2219
2342
|
if (hasExplicitUsageRole(result.usage)) {
|
|
2220
2343
|
return completeUsageSummary(result.usage);
|
|
2221
2344
|
}
|
|
2222
|
-
if (result.operation === "
|
|
2223
|
-
return assignUsageRole("
|
|
2345
|
+
if (result.operation === "candidate.improve") {
|
|
2346
|
+
return assignUsageRole("improver", result.usage);
|
|
2224
2347
|
}
|
|
2225
|
-
if (result.operation === "
|
|
2348
|
+
if (result.operation === "candidate.run") {
|
|
2226
2349
|
return assignUsageRole("runner", result.usage);
|
|
2227
2350
|
}
|
|
2228
2351
|
if (result.operation === "engine.run") {
|
|
@@ -2239,16 +2362,16 @@ function attemptUsageSummary(workloadUsage, resultUsage) {
|
|
|
2239
2362
|
}
|
|
2240
2363
|
/**
 * Reports whether a usage summary already attributes usage to a role.
 *
 * The summary is normalized with `completeUsageSummary`; the result is
 * `true` when the normalized value has a truthy `improver`, `runner` or
 * `engine` entry.
 *
 * @param {object|undefined} usage - Usage summary to inspect.
 * @returns {boolean}
 */
function hasExplicitUsageRole(usage) {
    const normalized = completeUsageSummary(usage);
    return Boolean(normalized?.improver || normalized?.runner || normalized?.engine);
}
|
|
2244
|
-
function
|
|
2245
|
-
if (result.
|
|
2246
|
-
return result.
|
|
2367
|
+
function createCandidatePatchFromResult(result, spec) {
|
|
2368
|
+
if (result.candidatePatch) {
|
|
2369
|
+
return result.candidatePatch;
|
|
2247
2370
|
}
|
|
2248
2371
|
const changedEditPaths = result.fileChanges
|
|
2249
2372
|
.map(normalizeRelativePath)
|
|
2250
2373
|
.filter((filePath) => !filePath.startsWith(".workbench/") &&
|
|
2251
|
-
|
|
2374
|
+
isCandidateEditPath(filePath, improveEdits(spec)));
|
|
2252
2375
|
const changedSet = new Set(changedEditPaths);
|
|
2253
2376
|
const files = result.files
|
|
2254
2377
|
.filter((file) => changedSet.has(normalizeRelativePath(file.path)))
|
|
@@ -2260,7 +2383,7 @@ function createSubjectPatchFromResult(result, spec) {
|
|
|
2260
2383
|
...(result.feedback !== undefined ? { feedback: result.feedback } : {}),
|
|
2261
2384
|
};
|
|
2262
2385
|
}
|
|
2263
|
-
function
|
|
2386
|
+
function isCandidateEditPath(filePath, edits) {
|
|
2264
2387
|
const normalized = normalizeRelativePath(filePath);
|
|
2265
2388
|
return edits.some((entry) => {
|
|
2266
2389
|
const editPath = normalizeRelativePath(entry).replace(/\/+$/u, "");
|
|
@@ -2320,21 +2443,33 @@ export async function stageWorkbenchRunWorkload(root, workload) {
|
|
|
2320
2443
|
]);
|
|
2321
2444
|
await fs.mkdir(inputDir(root), { recursive: true });
|
|
2322
2445
|
await fs.mkdir(outputDir(root), { recursive: true });
|
|
2446
|
+
await clearMutableWorkspaceFiles(root);
|
|
2323
2447
|
if (purpose === "attempt") {
|
|
2324
|
-
await fs.mkdir(
|
|
2448
|
+
await fs.mkdir(candidateDir(root), { recursive: true });
|
|
2325
2449
|
await fs.mkdir(caseDir(root), { recursive: true });
|
|
2326
2450
|
const engineCase = requireWorkloadEngineCase(workload, "Attempt staging");
|
|
2327
|
-
await writeSurfaceFiles(
|
|
2451
|
+
await writeSurfaceFiles(candidateDir(root), workload.candidateFiles);
|
|
2328
2452
|
await writeSurfaceFiles(caseDir(root), engineCasePublicFiles(engineCase));
|
|
2329
2453
|
return;
|
|
2330
2454
|
}
|
|
2331
2455
|
if (purpose === "improve") {
|
|
2332
|
-
await
|
|
2333
|
-
await writeSurfaceFiles(subjectDir(root), workload.subjectFiles);
|
|
2456
|
+
await writeSurfaceFiles(root, workload.candidateFiles.filter((file) => isMutableWorkspaceSnapshotPath(file.path)));
|
|
2334
2457
|
await fs.mkdir(tracesDir(root), { recursive: true });
|
|
2335
2458
|
await writeSurfaceFiles(tracesDir(root), workload.traceFiles);
|
|
2336
2459
|
}
|
|
2337
2460
|
}
|
|
2461
|
+
/**
 * Deletes the top-level entries of `root` that
 * `isMutableWorkspaceSnapshotPath` classifies as mutable workspace content.
 *
 * Only direct children of `root` are inspected; a matching directory is
 * removed recursively. If `root` cannot be listed, the function treats it as
 * empty and removes nothing.
 *
 * @param {string} root - Workspace root directory.
 * @returns {Promise<void>} Resolves once every matching entry has been removed.
 */
async function clearMutableWorkspaceFiles(root) {
    const fs = await importNodeModule(nodeBuiltin("fs/promises"));
    const path = await importNodeModule(nodeBuiltin("path"));
    // Best effort: an unreadable or missing root is handled as "nothing to clear".
    const entries = await fs.readdir(root, { withFileTypes: true }).catch(() => []);
    await Promise.all(entries.map(async (entry) => {
        const relativePath = normalizeRelativePath(entry.name);
        if (!isMutableWorkspaceSnapshotPath(relativePath)) {
            return;
        }
        await fs.rm(path.join(root, entry.name), { recursive: true, force: true });
    }));
}
|
|
2338
2473
|
async function stageWorkbenchEnginePrivateFiles(root, workload) {
|
|
2339
2474
|
if (readWorkloadExecutionPurpose(workload) !== "attempt") {
|
|
2340
2475
|
return;
|
|
@@ -2417,7 +2552,7 @@ function adapterFilePathWithinRoot(filePath, sourceRoot) {
|
|
|
2417
2552
|
}
|
|
2418
2553
|
async function readHostedRunFailureResult(root, workload, options) {
|
|
2419
2554
|
const traceFiles = await readRuntimeTraceFiles(root, workload);
|
|
2420
|
-
const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root)));
|
|
2555
|
+
const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root), { ignorePath: isWorkbenchInternalOutputPath }));
|
|
2421
2556
|
const startedAt = options.startedAt ?? new Date().toISOString();
|
|
2422
2557
|
const finishedAt = new Date().toISOString();
|
|
2423
2558
|
const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
|
|
@@ -2433,13 +2568,13 @@ async function readHostedRunFailureResult(root, workload, options) {
|
|
|
2433
2568
|
async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
|
|
2434
2569
|
const path = await importNodeModule(nodeBuiltin("path"));
|
|
2435
2570
|
const traceFiles = await readRuntimeTraceFiles(root, workload);
|
|
2436
|
-
const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root)));
|
|
2571
|
+
const outputFiles = filterRuntimeOutputFiles(await readSurfaceFiles(outputDir(root), { ignorePath: isWorkbenchInternalOutputPath }));
|
|
2437
2572
|
const outputExitCode = await readOptionalNumber(path.join(outputDir(root), "exit_code"));
|
|
2438
2573
|
const startedAt = options.startedAt ?? new Date().toISOString();
|
|
2439
2574
|
const finishedAt = new Date().toISOString();
|
|
2440
2575
|
const purpose = readWorkloadExecutionPurpose(workload);
|
|
2441
2576
|
const primaryOperation = purpose === "improve"
|
|
2442
|
-
? "
|
|
2577
|
+
? "candidate.improve"
|
|
2443
2578
|
: "engine.run";
|
|
2444
2579
|
const primaryResult = [...(options.operationResults ?? [])]
|
|
2445
2580
|
.reverse()
|
|
@@ -2453,9 +2588,9 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
|
|
|
2453
2588
|
const cases = normalizeResultCases(resultPayload.cases);
|
|
2454
2589
|
const includeResultScoring = purpose === "attempt";
|
|
2455
2590
|
const files = [...outputFiles, ...traceFiles].sort((left, right) => left.path.localeCompare(right.path));
|
|
2456
|
-
const
|
|
2591
|
+
const candidatePatch = purpose === "improve" ? primaryResult?.value : undefined;
|
|
2457
2592
|
const engineResult = purpose === "attempt" ? primaryResult?.value : undefined;
|
|
2458
|
-
const declaredChanges =
|
|
2593
|
+
const declaredChanges = candidatePatch?.fileChanges ??
|
|
2459
2594
|
(Array.isArray(resultPayload.fileChanges)
|
|
2460
2595
|
? resultPayload.fileChanges.filter((entry) => typeof entry === "string")
|
|
2461
2596
|
: files.map((file) => file.path));
|
|
@@ -2463,7 +2598,7 @@ async function readWorkbenchRunWorkloadResult(root, workload, options = {}) {
|
|
|
2463
2598
|
files,
|
|
2464
2599
|
fileChanges: declaredChanges,
|
|
2465
2600
|
...(options.operationResults ? { operationResults: [...options.operationResults] } : {}),
|
|
2466
|
-
...(
|
|
2601
|
+
...(candidatePatch ? { candidatePatch } : {}),
|
|
2467
2602
|
...(engineResult ? { result: engineResult } : {}),
|
|
2468
2603
|
...(includeResultScoring && metrics ? { metrics } : {}),
|
|
2469
2604
|
...(includeResultScoring && cases ? { cases } : {}),
|
|
@@ -2537,8 +2672,8 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
|
|
|
2537
2672
|
await fs.mkdir(path.dirname(requestPath), { recursive: true });
|
|
2538
2673
|
const casePrompt = workload.engineCaseSpec?.prompt;
|
|
2539
2674
|
const adapter = step.adapter ?? execution.adapter;
|
|
2540
|
-
const
|
|
2541
|
-
|
|
2675
|
+
const candidateCommand = adapterProtocolCommandSpec(workload.spec.run, "candidate.run", manifests).command;
|
|
2676
|
+
const payload = {
|
|
2542
2677
|
protocol: "workbench.adapter.v3",
|
|
2543
2678
|
id: execution.id,
|
|
2544
2679
|
jobId: workload.job.id,
|
|
@@ -2554,17 +2689,17 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
|
|
|
2554
2689
|
name: workload.spec.benchmark.name,
|
|
2555
2690
|
description: workload.spec.benchmark.description,
|
|
2556
2691
|
},
|
|
2557
|
-
|
|
2558
|
-
id: workload.
|
|
2559
|
-
path: workload.spec.
|
|
2560
|
-
...(workload.spec.
|
|
2692
|
+
candidate: {
|
|
2693
|
+
id: workload.candidateId,
|
|
2694
|
+
path: workload.spec.candidate.files.path,
|
|
2695
|
+
...(workload.spec.candidate.prepare ? { prepare: { ...workload.spec.candidate.prepare } } : {}),
|
|
2561
2696
|
run: {
|
|
2562
2697
|
...workload.spec.run,
|
|
2563
|
-
command:
|
|
2698
|
+
command: candidateCommand,
|
|
2564
2699
|
},
|
|
2565
2700
|
},
|
|
2566
|
-
...(workload.spec.
|
|
2567
|
-
? {
|
|
2701
|
+
...(workload.spec.candidate.improve
|
|
2702
|
+
? { improve: { edits: [...workload.spec.candidate.improve.edits] } }
|
|
2568
2703
|
: {}),
|
|
2569
2704
|
attempt: {
|
|
2570
2705
|
attemptIndex: workload.attemptIndex,
|
|
@@ -2580,21 +2715,41 @@ async function writeWorkbenchAdapterRequest(root, workload, execution, step, aut
|
|
|
2580
2715
|
workspace: root,
|
|
2581
2716
|
output: outputDir(root),
|
|
2582
2717
|
result: workbenchAdapterOperationResultPath(outputDir(root)),
|
|
2583
|
-
|
|
2718
|
+
...(readWorkloadExecutionPurpose(workload) === "attempt" ? { candidate: candidateDir(root) } : {}),
|
|
2584
2719
|
...(workload.engineCaseSpec ? { case: caseDir(root) } : {}),
|
|
2585
2720
|
traces: tracesDir(root),
|
|
2586
2721
|
...(step.kind === "engine" ? { enginePrivate: runtimeEnginePrivateDir(root) } : {}),
|
|
2587
2722
|
},
|
|
2588
|
-
}
|
|
2723
|
+
};
|
|
2724
|
+
await fs.writeFile(requestPath, `${JSON.stringify(payload, null, 2)}\n`);
|
|
2725
|
+
await writeSurfaceFiles(outputDir(root), [
|
|
2726
|
+
textSurfaceFile(`.workbench/traces/${workload.job.id}/${step.label}/request.json`, `${JSON.stringify(sanitizeAdapterRequestTracePayload(payload), null, 2)}\n`),
|
|
2727
|
+
]);
|
|
2589
2728
|
return requestPath;
|
|
2590
2729
|
}
|
|
2591
|
-
function
|
|
2592
|
-
|
|
2730
|
+
function sanitizeAdapterRequestTracePayload(value) {
|
|
2731
|
+
if (Array.isArray(value)) {
|
|
2732
|
+
return value.map((entry) => sanitizeAdapterRequestTracePayload(entry));
|
|
2733
|
+
}
|
|
2734
|
+
if (!value || typeof value !== "object") {
|
|
2735
|
+
return (value ?? null);
|
|
2736
|
+
}
|
|
2737
|
+
const sanitized = {};
|
|
2738
|
+
for (const [key, entry] of Object.entries(value)) {
|
|
2739
|
+
if (key === "auth" || key === "enginePrivate") {
|
|
2740
|
+
continue;
|
|
2741
|
+
}
|
|
2742
|
+
sanitized[key] = sanitizeAdapterRequestTracePayload(entry);
|
|
2743
|
+
}
|
|
2744
|
+
return sanitized;
|
|
2745
|
+
}
|
|
2746
|
+
function improveEdits(spec) {
|
|
2747
|
+
return spec.candidate.improve?.edits ?? [];
|
|
2593
2748
|
}
|
|
2594
|
-
function
|
|
2595
|
-
const edits =
|
|
2749
|
+
function requireImproveEdits(spec) {
|
|
2750
|
+
const edits = improveEdits(spec);
|
|
2596
2751
|
if (edits.length === 0) {
|
|
2597
|
-
throw new Error("
|
|
2752
|
+
throw new Error("Candidate improve configuration must declare at least one entry in edits.");
|
|
2598
2753
|
}
|
|
2599
2754
|
return edits;
|
|
2600
2755
|
}
|
|
@@ -2691,8 +2846,8 @@ function requireWorkloadEngineCase(workload, label) {
|
|
|
2691
2846
|
}
|
|
2692
2847
|
return workload.engineCase;
|
|
2693
2848
|
}
|
|
2694
|
-
function
|
|
2695
|
-
return `${inputDir(root)}/
|
|
2849
|
+
function candidateDir(root) {
|
|
2850
|
+
return `${inputDir(root)}/candidate`;
|
|
2696
2851
|
}
|
|
2697
2852
|
function caseDir(root) {
|
|
2698
2853
|
return `${inputDir(root)}/case`;
|
|
@@ -2727,7 +2882,7 @@ async function writeSurfaceFiles(root, files) {
|
|
|
2727
2882
|
}
|
|
2728
2883
|
}
|
|
2729
2884
|
}
|
|
2730
|
-
async function readSurfaceFiles(root) {
|
|
2885
|
+
async function readSurfaceFiles(root, options = {}) {
|
|
2731
2886
|
const fs = await importNodeModule(nodeBuiltin("fs/promises"));
|
|
2732
2887
|
const path = await importNodeModule(nodeBuiltin("path"));
|
|
2733
2888
|
const utf8Decoder = new TextDecoder("utf-8", { fatal: true });
|
|
@@ -2738,6 +2893,10 @@ async function readSurfaceFiles(root) {
|
|
|
2738
2893
|
.catch(() => []);
|
|
2739
2894
|
for (const entry of entries) {
|
|
2740
2895
|
const absolutePath = path.join(directory, entry.name);
|
|
2896
|
+
const relativePath = normalizeRelativePath(path.relative(root, absolutePath).replace(/\\/gu, "/"));
|
|
2897
|
+
if (options.ignorePath?.(relativePath)) {
|
|
2898
|
+
continue;
|
|
2899
|
+
}
|
|
2741
2900
|
if (entry.isDirectory()) {
|
|
2742
2901
|
await walk(absolutePath);
|
|
2743
2902
|
continue;
|
|
@@ -2745,9 +2904,18 @@ async function readSurfaceFiles(root) {
|
|
|
2745
2904
|
if (!entry.isFile()) {
|
|
2746
2905
|
continue;
|
|
2747
2906
|
}
|
|
2748
|
-
|
|
2749
|
-
|
|
2750
|
-
|
|
2907
|
+
let body;
|
|
2908
|
+
let stats;
|
|
2909
|
+
try {
|
|
2910
|
+
body = await fs.readFile(absolutePath);
|
|
2911
|
+
stats = await fs.stat(absolutePath);
|
|
2912
|
+
}
|
|
2913
|
+
catch (error) {
|
|
2914
|
+
if (isVanishedWalkEntry(error)) {
|
|
2915
|
+
continue;
|
|
2916
|
+
}
|
|
2917
|
+
throw error;
|
|
2918
|
+
}
|
|
2751
2919
|
const content = encodeSurfaceSnapshotContent(body, utf8Decoder);
|
|
2752
2920
|
files.push({
|
|
2753
2921
|
path: relativePath,
|
|
@@ -2761,6 +2929,10 @@ async function readSurfaceFiles(root) {
|
|
|
2761
2929
|
await walk(root);
|
|
2762
2930
|
return files.sort((left, right) => left.path.localeCompare(right.path));
|
|
2763
2931
|
}
|
|
2932
|
+
function isVanishedWalkEntry(error) {
|
|
2933
|
+
const code = error?.code;
|
|
2934
|
+
return code === "ENOENT" || code === "ENOTDIR";
|
|
2935
|
+
}
|
|
2764
2936
|
function encodeSurfaceSnapshotContent(body, utf8Decoder) {
|
|
2765
2937
|
try {
|
|
2766
2938
|
return {
|
|
@@ -2943,7 +3115,13 @@ function evaluateSample(args) {
|
|
|
2943
3115
|
if (metrics.score === undefined) {
|
|
2944
3116
|
metrics.score = sampleScore;
|
|
2945
3117
|
}
|
|
2946
|
-
const cases =
|
|
3118
|
+
const cases = runtimeTimedCaseResults({
|
|
3119
|
+
caseId: args.caseId,
|
|
3120
|
+
status: "completed",
|
|
3121
|
+
durationMs,
|
|
3122
|
+
metrics,
|
|
3123
|
+
cases: args.workload.cases,
|
|
3124
|
+
});
|
|
2947
3125
|
const feedback = {
|
|
2948
3126
|
...(args.workload.summary !== undefined
|
|
2949
3127
|
? { summary: args.workload.summary }
|
|
@@ -2956,10 +3134,10 @@ function evaluateSample(args) {
|
|
|
2956
3134
|
return {
|
|
2957
3135
|
id: `${args.caseId}__sample_${String(args.sampleIndex + 1).padStart(3, "0")}`,
|
|
2958
3136
|
index: args.sampleIndex,
|
|
2959
|
-
|
|
2960
|
-
id: args.
|
|
2961
|
-
kind: "
|
|
2962
|
-
label: args.
|
|
3137
|
+
candidate: {
|
|
3138
|
+
id: args.candidateId,
|
|
3139
|
+
kind: "candidate",
|
|
3140
|
+
label: args.candidateId,
|
|
2963
3141
|
},
|
|
2964
3142
|
status: "completed",
|
|
2965
3143
|
startedAt: args.startedAt,
|
|
@@ -2967,7 +3145,7 @@ function evaluateSample(args) {
|
|
|
2967
3145
|
durationMs,
|
|
2968
3146
|
metrics,
|
|
2969
3147
|
...(usage ? { usage } : {}),
|
|
2970
|
-
|
|
3148
|
+
cases,
|
|
2971
3149
|
feedback,
|
|
2972
3150
|
};
|
|
2973
3151
|
}
|
|
@@ -2976,7 +3154,7 @@ function normalizeSampleJobOutput(value) {
|
|
|
2976
3154
|
return null;
|
|
2977
3155
|
}
|
|
2978
3156
|
const record = value;
|
|
2979
|
-
if (record.ok !== true || typeof record.
|
|
3157
|
+
if (record.ok !== true || typeof record.candidateId !== "string") {
|
|
2980
3158
|
return null;
|
|
2981
3159
|
}
|
|
2982
3160
|
const files = Array.isArray(record.files)
|
|
@@ -2991,7 +3169,7 @@ function normalizeSampleJobOutput(value) {
|
|
|
2991
3169
|
return null;
|
|
2992
3170
|
}
|
|
2993
3171
|
return {
|
|
2994
|
-
|
|
3172
|
+
candidateId: record.candidateId,
|
|
2995
3173
|
attemptIndex: record.attemptIndex,
|
|
2996
3174
|
sample,
|
|
2997
3175
|
fileChanges: Array.isArray(record.fileChanges)
|
|
@@ -3006,9 +3184,57 @@ function normalizeSampleJobOutput(value) {
|
|
|
3006
3184
|
function normalizeEvaluationSampleOutputs(args) {
|
|
3007
3185
|
return args.jobs.flatMap((job) => {
|
|
3008
3186
|
const output = normalizeSampleJobOutput(job.output);
|
|
3009
|
-
|
|
3187
|
+
if (!output) {
|
|
3188
|
+
return [];
|
|
3189
|
+
}
|
|
3190
|
+
const caseId = readJobString(job.input, "caseId") ?? output.sample.cases?.[0]?.id ?? null;
|
|
3191
|
+
const durationMs = runtimeJobDurationMs(job) ?? output.sample.durationMs;
|
|
3192
|
+
const sample = caseId && typeof durationMs === "number" && Number.isFinite(durationMs)
|
|
3193
|
+
? {
|
|
3194
|
+
...output.sample,
|
|
3195
|
+
cases: runtimeTimedCaseResults({
|
|
3196
|
+
caseId,
|
|
3197
|
+
status: output.sample.status === "error" ? "error" : "completed",
|
|
3198
|
+
durationMs,
|
|
3199
|
+
metrics: output.sample.metrics ?? {},
|
|
3200
|
+
cases: output.sample.cases,
|
|
3201
|
+
}),
|
|
3202
|
+
}
|
|
3203
|
+
: output.sample;
|
|
3204
|
+
return [{
|
|
3205
|
+
jobs: [job],
|
|
3206
|
+
output: {
|
|
3207
|
+
...output,
|
|
3208
|
+
sample,
|
|
3209
|
+
},
|
|
3210
|
+
}];
|
|
3010
3211
|
});
|
|
3011
3212
|
}
|
|
3213
|
+
function runtimeTimedCaseResults(args) {
|
|
3214
|
+
const cases = args.cases?.length
|
|
3215
|
+
? args.cases
|
|
3216
|
+
: [{
|
|
3217
|
+
id: args.caseId,
|
|
3218
|
+
status: args.status,
|
|
3219
|
+
metrics: args.metrics,
|
|
3220
|
+
}];
|
|
3221
|
+
return cases.map((entry) => ({
|
|
3222
|
+
...entry,
|
|
3223
|
+
status: entry.status ?? args.status,
|
|
3224
|
+
metrics: entry.metrics ?? args.metrics,
|
|
3225
|
+
durationMs: args.durationMs,
|
|
3226
|
+
}));
|
|
3227
|
+
}
|
|
3228
|
+
function runtimeJobDurationMs(job) {
|
|
3229
|
+
if (typeof job.startedAt !== "string" || typeof job.finishedAt !== "string") {
|
|
3230
|
+
return undefined;
|
|
3231
|
+
}
|
|
3232
|
+
const startedMs = Date.parse(job.startedAt);
|
|
3233
|
+
const finishedMs = Date.parse(job.finishedAt);
|
|
3234
|
+
return Number.isFinite(startedMs) && Number.isFinite(finishedMs)
|
|
3235
|
+
? Math.max(0, finishedMs - startedMs)
|
|
3236
|
+
: undefined;
|
|
3237
|
+
}
|
|
3012
3238
|
function meanFinite(values) {
|
|
3013
3239
|
const finite = values.filter((value) => typeof value === "number" && Number.isFinite(value));
|
|
3014
3240
|
if (finite.length === 0) {
|
|
@@ -3039,12 +3265,12 @@ function withJobUsage(sample, _jobs, attemptJob) {
|
|
|
3039
3265
|
usage,
|
|
3040
3266
|
};
|
|
3041
3267
|
}
|
|
3042
|
-
function
|
|
3268
|
+
function normalizeCandidateRevisionJobOutput(value) {
|
|
3043
3269
|
if (!value || typeof value !== "object" || Array.isArray(value)) {
|
|
3044
3270
|
return null;
|
|
3045
3271
|
}
|
|
3046
3272
|
const record = value;
|
|
3047
|
-
if (record.ok !== true || typeof record.
|
|
3273
|
+
if (record.ok !== true || typeof record.candidateId !== "string") {
|
|
3048
3274
|
return null;
|
|
3049
3275
|
}
|
|
3050
3276
|
const files = Array.isArray(record.files)
|
|
@@ -3056,7 +3282,7 @@ function normalizeSubjectRevisionJobOutput(value) {
|
|
|
3056
3282
|
}
|
|
3057
3283
|
const usage = normalizeUsageSummary(record.usage);
|
|
3058
3284
|
return {
|
|
3059
|
-
|
|
3285
|
+
candidateId: record.candidateId,
|
|
3060
3286
|
attemptIndex: record.attemptIndex,
|
|
3061
3287
|
baseId: typeof record.baseId === "string" && record.baseId.length > 0
|
|
3062
3288
|
? record.baseId
|
|
@@ -3072,7 +3298,7 @@ function normalizeSubjectRevisionJobOutput(value) {
|
|
|
3072
3298
|
...(usage ? { usage } : {}),
|
|
3073
3299
|
};
|
|
3074
3300
|
}
|
|
3075
|
-
function errorEvaluationSamplesFromJobs(jobs,
|
|
3301
|
+
function errorEvaluationSamplesFromJobs(jobs, candidateId, attemptIndex, completedSampleKeys) {
|
|
3076
3302
|
const groups = new Map();
|
|
3077
3303
|
for (const job of jobs) {
|
|
3078
3304
|
const key = evaluationSampleGroupKeyFromJob(job);
|
|
@@ -3082,10 +3308,10 @@ function errorEvaluationSamplesFromJobs(jobs, subjectId, attemptIndex, completed
|
|
|
3082
3308
|
groups.set(key, [...(groups.get(key) ?? []), job]);
|
|
3083
3309
|
}
|
|
3084
3310
|
return [...groups.values()]
|
|
3085
|
-
.map((group) => errorEvaluationSampleFromJobGroup(group,
|
|
3311
|
+
.map((group) => errorEvaluationSampleFromJobGroup(group, candidateId, attemptIndex))
|
|
3086
3312
|
.filter((sample) => sample !== null);
|
|
3087
3313
|
}
|
|
3088
|
-
function errorEvaluationSampleFromJobGroup(jobs,
|
|
3314
|
+
function errorEvaluationSampleFromJobGroup(jobs, candidateId, attemptIndex) {
|
|
3089
3315
|
const job = jobs[0];
|
|
3090
3316
|
if (!job) {
|
|
3091
3317
|
return null;
|
|
@@ -3097,25 +3323,27 @@ function errorEvaluationSampleFromJobGroup(jobs, subjectId, attemptIndex) {
|
|
|
3097
3323
|
}
|
|
3098
3324
|
const startedAt = minIsoTimestamp(jobs.map((entry) => entry.startedAt ?? entry.createdAt));
|
|
3099
3325
|
const finishedAt = maxIsoTimestamp(jobs.map((entry) => entry.finishedAt ?? entry.updatedAt ?? entry.startedAt));
|
|
3326
|
+
const durationMs = startedAt && finishedAt
|
|
3327
|
+
? Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt))
|
|
3328
|
+
: undefined;
|
|
3100
3329
|
const error = summarizeEvaluationJobErrors(jobs) ?? "Evaluation job did not produce a valid sample.";
|
|
3101
3330
|
return {
|
|
3102
3331
|
id: `${caseId}__sample_${String(sampleIndex + 1).padStart(3, "0")}`,
|
|
3103
3332
|
index: sampleIndex,
|
|
3104
|
-
|
|
3105
|
-
id:
|
|
3106
|
-
kind: "
|
|
3107
|
-
label:
|
|
3333
|
+
candidate: {
|
|
3334
|
+
id: candidateId,
|
|
3335
|
+
kind: "candidate",
|
|
3336
|
+
label: candidateId,
|
|
3108
3337
|
},
|
|
3109
3338
|
status: "error",
|
|
3110
3339
|
...(startedAt ? { startedAt } : {}),
|
|
3111
3340
|
...(finishedAt ? { finishedAt } : {}),
|
|
3112
|
-
...(
|
|
3113
|
-
? { durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)) }
|
|
3114
|
-
: {}),
|
|
3341
|
+
...(durationMs !== undefined ? { durationMs } : {}),
|
|
3115
3342
|
...(error ? { error } : {}),
|
|
3116
3343
|
cases: [{
|
|
3117
3344
|
id: caseId,
|
|
3118
3345
|
status: "error",
|
|
3346
|
+
...(durationMs !== undefined ? { durationMs } : {}),
|
|
3119
3347
|
metrics: {},
|
|
3120
3348
|
...(error ? { feedback: { summary: error } } : {}),
|
|
3121
3349
|
}],
|
|
@@ -3171,13 +3399,13 @@ function compareSampleOutputs(left, right) {
|
|
|
3171
3399
|
}
|
|
3172
3400
|
return left.sample.id.localeCompare(right.sample.id);
|
|
3173
3401
|
}
|
|
3174
|
-
function createEvaluationRecord(
|
|
3175
|
-
const samples = mergeEvaluationSampleRecords(rawSamples).map((sample) =>
|
|
3402
|
+
function createEvaluationRecord(candidateId, candidateName, rawSamples) {
|
|
3403
|
+
const samples = mergeEvaluationSampleRecords(rawSamples).map((sample) => candidateName
|
|
3176
3404
|
? {
|
|
3177
3405
|
...sample,
|
|
3178
|
-
|
|
3179
|
-
...sample.
|
|
3180
|
-
label:
|
|
3406
|
+
candidate: {
|
|
3407
|
+
...sample.candidate,
|
|
3408
|
+
label: candidateName,
|
|
3181
3409
|
},
|
|
3182
3410
|
}
|
|
3183
3411
|
: sample);
|
|
@@ -3191,10 +3419,10 @@ function createEvaluationRecord(subjectId, subjectName, rawSamples) {
|
|
|
3191
3419
|
const errorSampleCount = samples.filter((sample) => sample.status === "error")
|
|
3192
3420
|
.length;
|
|
3193
3421
|
return {
|
|
3194
|
-
|
|
3195
|
-
id:
|
|
3196
|
-
kind: "
|
|
3197
|
-
...(
|
|
3422
|
+
candidate: {
|
|
3423
|
+
id: candidateId,
|
|
3424
|
+
kind: "candidate",
|
|
3425
|
+
...(candidateName ? { label: candidateName } : {}),
|
|
3198
3426
|
},
|
|
3199
3427
|
status: samples.length > 0 && completedSampleCount === samples.length
|
|
3200
3428
|
? "completed"
|
|
@@ -3215,7 +3443,7 @@ function createEvaluationRecord(subjectId, subjectName, rawSamples) {
|
|
|
3215
3443
|
samples,
|
|
3216
3444
|
};
|
|
3217
3445
|
}
|
|
3218
|
-
function
|
|
3446
|
+
function normalizedCandidateDisplayName(value) {
|
|
3219
3447
|
const normalized = value?.trim();
|
|
3220
3448
|
return normalized ? normalized : null;
|
|
3221
3449
|
}
|
|
@@ -3263,7 +3491,7 @@ function mergeEvaluationSampleGroup(group) {
|
|
|
3263
3491
|
return {
|
|
3264
3492
|
id: `sample_${String(first.index + 1).padStart(3, "0")}`,
|
|
3265
3493
|
index: first.index,
|
|
3266
|
-
|
|
3494
|
+
candidate: first.candidate,
|
|
3267
3495
|
status: mergeEvaluationSampleStatus(group),
|
|
3268
3496
|
...(startedAt ? { startedAt } : {}),
|
|
3269
3497
|
...(finishedAt ? { finishedAt } : {}),
|
|
@@ -3355,34 +3583,28 @@ function aggregateCaseStatus(results) {
|
|
|
3355
3583
|
}
|
|
3356
3584
|
return undefined;
|
|
3357
3585
|
}
|
|
3358
|
-
function
|
|
3359
|
-
|
|
3360
|
-
|
|
3361
|
-
|
|
3362
|
-
|
|
3363
|
-
}
|
|
3364
|
-
function selectSubject(args) {
|
|
3365
|
-
let selected = args.previousSubject;
|
|
3366
|
-
for (const subject of args.subjects) {
|
|
3367
|
-
if (!selected || hasHigherScore(subject, selected)) {
|
|
3368
|
-
selected = subject;
|
|
3586
|
+
function selectCandidate(args) {
|
|
3587
|
+
let selected = args.previousCandidate;
|
|
3588
|
+
for (const candidate of args.candidates) {
|
|
3589
|
+
if (!selected || hasHigherScore(candidate, selected)) {
|
|
3590
|
+
selected = candidate;
|
|
3369
3591
|
}
|
|
3370
3592
|
}
|
|
3371
3593
|
return selected;
|
|
3372
3594
|
}
|
|
3373
|
-
function hasHigherScore(
|
|
3374
|
-
const
|
|
3375
|
-
const incumbentValue =
|
|
3376
|
-
if (
|
|
3595
|
+
function hasHigherScore(candidate, incumbent) {
|
|
3596
|
+
const candidateValue = readEvaluationMean(candidate.eval, "score");
|
|
3597
|
+
const incumbentValue = readEvaluationMean(incumbent.eval, "score");
|
|
3598
|
+
if (candidateValue == null) {
|
|
3377
3599
|
return false;
|
|
3378
3600
|
}
|
|
3379
3601
|
if (incumbentValue == null) {
|
|
3380
3602
|
return true;
|
|
3381
3603
|
}
|
|
3382
|
-
return
|
|
3604
|
+
return candidateValue > incumbentValue;
|
|
3383
3605
|
}
|
|
3384
|
-
function
|
|
3385
|
-
const direct =
|
|
3606
|
+
function readEvaluationMean(evaluation, metric) {
|
|
3607
|
+
const direct = evaluation?.metrics?.[metric]?.mean;
|
|
3386
3608
|
return typeof direct === "number" && Number.isFinite(direct) ? direct : null;
|
|
3387
3609
|
}
|
|
3388
3610
|
function metricStats(values) {
|
|
@@ -3501,7 +3723,7 @@ function isEvaluationSampleRecord(value) {
|
|
|
3501
3723
|
!Array.isArray(value) &&
|
|
3502
3724
|
typeof record.id === "string" &&
|
|
3503
3725
|
typeof record.index === "number" &&
|
|
3504
|
-
typeof record.
|
|
3726
|
+
typeof record.candidate === "object" &&
|
|
3505
3727
|
isEvaluationSampleStatus(record.status) &&
|
|
3506
3728
|
hasOperationalCaseStatuses(record.cases));
|
|
3507
3729
|
}
|