@workbench-ai/workbench 0.0.48 → 0.0.50
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapter-project.js +3 -3
- package/dist/benchmark-fingerprint.d.ts +1 -1
- package/dist/benchmark-fingerprint.d.ts.map +1 -1
- package/dist/benchmark-fingerprint.js +4 -6
- package/dist/command-model.d.ts.map +1 -1
- package/dist/command-model.js +144 -119
- package/dist/dev-open/client.css +48 -11
- package/dist/dev-open/client.js +149 -149
- package/dist/dev-open-server.d.ts +9 -22
- package/dist/dev-open-server.d.ts.map +1 -1
- package/dist/dev-open-server.js +42 -38
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1472 -505
- package/dist/init-scaffold.d.ts +4 -4
- package/dist/init-scaffold.d.ts.map +1 -1
- package/dist/init-scaffold.js +2 -2
- package/dist/init-template-pack.d.ts +4 -4
- package/dist/init-template-pack.d.ts.map +1 -1
- package/dist/init-template-pack.js +47 -59
- package/dist/local-archive.d.ts +11 -11
- package/dist/local-archive.d.ts.map +1 -1
- package/dist/local-archive.js +87 -74
- package/dist/project-source.d.ts +14 -17
- package/dist/project-source.d.ts.map +1 -1
- package/dist/project-source.js +80 -151
- package/package.json +4 -4
package/dist/index.js
CHANGED
|
@@ -5,7 +5,7 @@ import { createRequire } from "node:module";
|
|
|
5
5
|
import os from "node:os";
|
|
6
6
|
import path from "node:path";
|
|
7
7
|
import { Writable } from "node:stream";
|
|
8
|
-
import {
|
|
8
|
+
import { createCandidateFilePreview, createBaselineCandidateJob as createRuntimeBaselineCandidateJob, evaluationScorecardId, evaluationMeanMetrics, executeWorkbenchExecutionJob, engineResolveBindingForSpec, filterCandidateSourceFiles, workbenchExecutionPurpose, workbenchRunExecutionFingerprint, createWorkbenchAdapterAuthBundle, createOptimizerTraceInputFiles, DOCKER_SANDBOX_BACKEND, localWorkbenchAdapterAuthStore, materializeWorkbenchRunResult, normalizeSurfaceFiles, planWorkbenchExecutionJobsForPurpose, runWorkbenchExecutionDag, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, summarizeCandidateFiles, validateWorkbenchRunEnvelope, parseWorkbenchAdapterAuthTarget, } from "@workbench-ai/workbench-core";
|
|
9
9
|
import { assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, WORKBENCH_ADAPTER_RESULT_FILE, WORKBENCH_ADAPTER_RESULT_PROTOCOL, normalizeWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, withDefaultWorkbenchAdapterAuthProfiles as applyDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
|
|
10
10
|
import { builtinLocalTraceAdapter, builtinLocalTraceAdapters, sortLocalTraceRefs, } from "@workbench-ai/workbench-built-in-adapters/local-traces";
|
|
11
11
|
import { commandUsage, HOSTED_WATCH_LIFECYCLE_NOTE, LOCAL_DEV_OPEN_LIFECYCLE_NOTE, rootUsage, } from "./command-model.js";
|
|
@@ -13,10 +13,10 @@ import { startLocalWorkbenchDevServer } from "./dev-open-server.js";
|
|
|
13
13
|
import { createWorkbenchInitScaffold, } from "./init-scaffold.js";
|
|
14
14
|
import { defaultAdapterManifests, composeRuntimeDockerfileWithAdapters, resolveDefaultWorkbenchAdapter, resolveProjectAdapterSource, resolveWorkbenchAdaptersForProject, WORKBENCH_ADAPTER_MANIFEST_FILE, } from "./adapter-project.js";
|
|
15
15
|
import { createAdapterCommandEnv } from "./adapter-command-env.js";
|
|
16
|
-
import {
|
|
16
|
+
import { loadLocalArchive, loadLocalArchiveIndex, materializeCandidateRoot, readLocalCandidate, readLocalCandidateFiles, readLocalJobs, saveLocalArchive, saveLocalJobs, setLocalActive, upsertLocalRun, upsertLocalCandidate, upsertLocalEvaluation, } from "./local-archive.js";
|
|
17
17
|
import { WorkspaceSnapshotError, } from "./workspace-snapshot.js";
|
|
18
18
|
import { readLocalProjectSource, WORKBENCH_BENCHMARK_FILE, } from "./project-source.js";
|
|
19
|
-
import { localBenchmarkFingerprint,
|
|
19
|
+
import { localBenchmarkFingerprint, localCandidateFingerprint, } from "./benchmark-fingerprint.js";
|
|
20
20
|
const require = createRequire(import.meta.url);
|
|
21
21
|
function getCliVersion() {
|
|
22
22
|
const manifest = require("../package.json");
|
|
@@ -87,7 +87,10 @@ export async function runCli(argv, io = {
|
|
|
87
87
|
return await runRemoteCommand(argv.slice(1), io);
|
|
88
88
|
}
|
|
89
89
|
if (argv[0] === "eval") {
|
|
90
|
-
return await
|
|
90
|
+
return await localEvaluateCandidate(argv.slice(1), io, runtimeOptions);
|
|
91
|
+
}
|
|
92
|
+
if (argv[0] === "retry") {
|
|
93
|
+
return await localRetry(argv.slice(1), io, runtimeOptions);
|
|
91
94
|
}
|
|
92
95
|
if (argv[0] === "improve") {
|
|
93
96
|
return await localRun(argv.slice(1), io, runtimeOptions);
|
|
@@ -117,14 +120,14 @@ export async function runCli(argv, io = {
|
|
|
117
120
|
return await localRunList(rest, io);
|
|
118
121
|
case "runs show":
|
|
119
122
|
return await localRunShow(rest, io);
|
|
120
|
-
case "
|
|
121
|
-
return await
|
|
122
|
-
case "
|
|
123
|
-
return await
|
|
124
|
-
case "
|
|
125
|
-
return await
|
|
126
|
-
case "
|
|
127
|
-
return await
|
|
123
|
+
case "candidates list":
|
|
124
|
+
return await localCandidateList(rest, io);
|
|
125
|
+
case "candidates show":
|
|
126
|
+
return await localCandidateShow(rest, io);
|
|
127
|
+
case "candidates files":
|
|
128
|
+
return await localCandidateFiles(rest, io);
|
|
129
|
+
case "candidates preview":
|
|
130
|
+
return await localCandidatePreview(rest, io);
|
|
128
131
|
default:
|
|
129
132
|
break;
|
|
130
133
|
}
|
|
@@ -163,7 +166,7 @@ function commandPathForHelp(argv) {
|
|
|
163
166
|
["list", "show"].includes(positionals[1] ?? "")) {
|
|
164
167
|
return positionals.slice(0, 2).join(" ");
|
|
165
168
|
}
|
|
166
|
-
if (positionals[0] === "
|
|
169
|
+
if (positionals[0] === "candidates" &&
|
|
167
170
|
["list", "show", "files", "preview"].includes(positionals[1] ?? "")) {
|
|
168
171
|
return positionals.slice(0, 2).join(" ");
|
|
169
172
|
}
|
|
@@ -175,6 +178,8 @@ async function runCloudCommand(argv, io) {
|
|
|
175
178
|
switch (command) {
|
|
176
179
|
case "eval":
|
|
177
180
|
return await startHostedWorkflow("eval", rest, io);
|
|
181
|
+
case "retry":
|
|
182
|
+
return await retryHostedWorkflow(rest, io);
|
|
178
183
|
case "improve":
|
|
179
184
|
return await startHostedWorkflow("improve", rest, io);
|
|
180
185
|
case "open":
|
|
@@ -209,20 +214,20 @@ async function runCloudCommand(argv, io) {
|
|
|
209
214
|
return await runShow(subRest, io);
|
|
210
215
|
case "runs cancel":
|
|
211
216
|
return await runCancel(subRest, io);
|
|
212
|
-
case "
|
|
213
|
-
return await
|
|
214
|
-
case "
|
|
215
|
-
return await
|
|
216
|
-
case "
|
|
217
|
-
return await
|
|
218
|
-
case "
|
|
219
|
-
return await
|
|
220
|
-
case "
|
|
221
|
-
return await
|
|
222
|
-
case "
|
|
223
|
-
return await
|
|
224
|
-
case "
|
|
225
|
-
return await
|
|
217
|
+
case "candidates list":
|
|
218
|
+
return await candidateList(subRest, io);
|
|
219
|
+
case "candidates show":
|
|
220
|
+
return await candidateShow(subRest, io);
|
|
221
|
+
case "candidates files":
|
|
222
|
+
return await candidateFiles(subRest, io);
|
|
223
|
+
case "candidates preview":
|
|
224
|
+
return await candidatePreview(subRest, io);
|
|
225
|
+
case "candidates pull":
|
|
226
|
+
return await candidateExport(subRest, io);
|
|
227
|
+
case "candidates publish":
|
|
228
|
+
return await candidateVisibility(subRest, io, "public");
|
|
229
|
+
case "candidates unpublish":
|
|
230
|
+
return await candidateVisibility(subRest, io, "private");
|
|
226
231
|
default:
|
|
227
232
|
throw new UsageError(`Unknown command: cloud ${argv.join(" ")}`);
|
|
228
233
|
}
|
|
@@ -313,7 +318,7 @@ async function localInit(argv, io) {
|
|
|
313
318
|
specPath,
|
|
314
319
|
kind: scaffold.kind,
|
|
315
320
|
name: scaffold.name,
|
|
316
|
-
|
|
321
|
+
candidateRoot: scaffold.candidateRoot,
|
|
317
322
|
}, parsed, io, () => `Initialized ${scaffold.kind} Workbench source directory at ${workspace}`);
|
|
318
323
|
return 0;
|
|
319
324
|
}
|
|
@@ -358,20 +363,20 @@ function buildWorkbenchCheckPlan(source) {
|
|
|
358
363
|
files: sourceFileCount(source),
|
|
359
364
|
yaml: [
|
|
360
365
|
path.relative(source.dir, source.benchmarkPath) || "benchmark.yaml",
|
|
361
|
-
path.relative(source.dir, source.
|
|
362
|
-
...(source.optimizerSource !== undefined
|
|
363
|
-
? [path.relative(source.dir, source.optimizerPath ?? "") || "optimizer YAML"]
|
|
364
|
-
: []),
|
|
366
|
+
path.relative(source.dir, source.candidateSpecPath) || "candidate YAML",
|
|
365
367
|
],
|
|
366
368
|
dockerfile: source.dockerfilePath,
|
|
367
369
|
},
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
370
|
+
candidate: {
|
|
371
|
+
name: source.spec.candidate.name,
|
|
372
|
+
selectedRunId: source.spec.candidate.selectedRunId,
|
|
373
|
+
runCount: Object.keys(source.spec.candidate.runs).length,
|
|
374
|
+
filesPath: source.spec.candidate.files.path,
|
|
375
|
+
files: source.candidateFiles.length,
|
|
371
376
|
},
|
|
372
|
-
|
|
377
|
+
improve: source.spec.candidate.improve
|
|
373
378
|
? {
|
|
374
|
-
edits: [...source.spec.
|
|
379
|
+
edits: [...source.spec.candidate.improve.edits],
|
|
375
380
|
}
|
|
376
381
|
: null,
|
|
377
382
|
engine: {
|
|
@@ -394,8 +399,8 @@ function buildWorkbenchCheckPlan(source) {
|
|
|
394
399
|
};
|
|
395
400
|
}
|
|
396
401
|
function formatWorkbenchCheckPlan(plan, warningSuffix) {
|
|
397
|
-
const edits = plan.
|
|
398
|
-
? plan.
|
|
402
|
+
const edits = plan.improve?.edits.length
|
|
403
|
+
? plan.improve.edits.join(", ")
|
|
399
404
|
: "-";
|
|
400
405
|
const network = plan.environment.network.egress;
|
|
401
406
|
const resources = plan.environment.resources;
|
|
@@ -404,11 +409,12 @@ function formatWorkbenchCheckPlan(plan, warningSuffix) {
|
|
|
404
409
|
`Benchmark: ${plan.benchmarkName}`,
|
|
405
410
|
`Description: ${plan.benchmarkDescription}`,
|
|
406
411
|
`Source: ${plan.source.files} file(s) (${plan.source.yaml.join(", ")}, ${plan.source.dockerfile})`,
|
|
407
|
-
`
|
|
408
|
-
`
|
|
412
|
+
`Candidate: ${plan.candidate.name} (${plan.candidate.runCount} run(s), selected ${plan.candidate.selectedRunId})`,
|
|
413
|
+
`Candidate files: ${plan.candidate.filesPath} (${plan.candidate.files} file(s))`,
|
|
414
|
+
`Improve edits: ${edits}`,
|
|
409
415
|
`Engine cases: ${plan.engine.cases} case(s) from ${formatAdapterSummary(plan.engine.resolver)} at ${plan.engine.path} (${plan.engine.files} file(s))`,
|
|
410
416
|
`Environment: ${plan.environment.dockerfile}, network ${network}, ${resources.cpu} CPU, ${resources.memoryGb}GB RAM, ${resources.timeoutMinutes}m timeout`,
|
|
411
|
-
`Execution: improve ${plan.adapters.improve ? formatAdapterSummary(plan.adapters.improve) : "not configured"},
|
|
417
|
+
`Execution: improve ${plan.adapters.improve ? formatAdapterSummary(plan.adapters.improve) : "not configured"}, candidate run ${formatAdapterSummary(plan.adapters.run)}, engine ${formatAdapterSummary(plan.adapters.engine)}`,
|
|
412
418
|
...adapterSourceLines(plan.adapters.sources),
|
|
413
419
|
].join("\n");
|
|
414
420
|
}
|
|
@@ -493,18 +499,206 @@ function splitWorkspaceError(error) {
|
|
|
493
499
|
const message = error instanceof Error ? error.message : String(error);
|
|
494
500
|
return message.split(/\n+/u).map((entry) => entry.trim()).filter(Boolean);
|
|
495
501
|
}
|
|
502
|
+
/**
 * Handle `workbench retry TARGET_ID`: re-run a previously failed local run or
 * evaluation using the settings recorded for it.
 *
 * The target is resolved through `resolveLocalRetryTarget`, then delegated to
 * `localEvaluateCandidate` (workflow "eval") or `localRun` (any other
 * workflow) with `--json`, writing into a capturing stdout so the delegated
 * command's JSON can be condensed into a single retry result.
 *
 * The candidate that was active before the retry is restored afterwards, also
 * when the delegated command throws.
 *
 * @param {string[]} argv - Arguments after the `retry` command word.
 * @param {object} io - CLI streams; `stdin`/`stderr` are passed through.
 * @param {object} runtimeOptions - Forwarded unchanged to the delegated command.
 * @returns {Promise<number>} The exit code returned by the delegated command.
 * @throws {UsageError} When TARGET_ID is missing (or the target cannot be retried).
 */
async function localRetry(argv, io, runtimeOptions) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set(["dir", "json"]));
    rejectUnexpectedPositionals(parsed, "workbench retry", 1);
    const targetId = parsed.positionals[0];
    if (!targetId) {
        throw new UsageError("Missing required TARGET_ID.");
    }
    const workspace = resolveDir(parsed);
    const target = await resolveLocalRetryTarget(workspace, targetId);
    const captured = createCapturingIo(io);
    // Fall back to 1 the same way budget does, so a record without `samples`
    // never forwards the literal string "undefined" as a flag value.
    const samples = String(target.samples ?? 1);
    let code;
    try {
        code = target.workflow === "eval"
            ? await localEvaluateCandidate([
                "--dir",
                workspace,
                "--candidate",
                target.candidateId,
                "--runs",
                target.candidateRunId,
                "--samples",
                samples,
                "--json",
            ], captured.io, runtimeOptions)
            : await localRun([
                "--dir",
                workspace,
                "--from",
                target.candidateId,
                "--runs",
                target.candidateRunId,
                "--budget",
                String(target.budget ?? 1),
                "--samples",
                samples,
                "--json",
            ], captured.io, runtimeOptions);
    }
    catch (error) {
        // The delegated command may already have saved a new active candidate
        // before throwing; put the previous one back, then surface the
        // original failure (a secondary restore error must not replace it).
        try {
            await preserveLocalActiveCandidate(workspace, target.preserveActiveId);
        }
        catch {
            // Deliberately ignored: the original error is the one to report.
        }
        throw error;
    }
    const commandOutput = parseCapturedJson(captured.stdoutText());
    await preserveLocalActiveCandidate(workspace, target.preserveActiveId);
    const outputRecord = readRecord(commandOutput) ?? {};
    const result = {
        ok: code === 0 && outputRecord.ok !== false,
        retried: {
            id: target.sourceId,
            kind: target.sourceKind,
            workflow: target.workflow,
        },
    };
    // Copy over only the identifiers the delegated command actually reported.
    assignRetryResultString(result, "runId", outputRecord.runId);
    assignRetryResultString(result, "evaluationId", outputRecord.evaluationId);
    assignRetryResultString(result, "candidateId", outputRecord.candidateId);
    assignRetryResultString(result, "activeCandidateId", outputRecord.activeCandidateId);
    const localView = localRetryViewHint(outputRecord.localView);
    if (localView) {
        result.localView = localView;
    }
    const failedJobCount = numberValue(outputRecord.failedJobCount);
    if (failedJobCount !== null) {
        result.failedJobCount = failedJobCount;
    }
    const error = stringValue(outputRecord.error);
    if (error) {
        result.error = error;
    }
    writeOutput(result, parsed, io, formatRetryCommandResult);
    return code;
}
|
|
568
|
+
/**
 * Translate a retry TARGET_ID into the parameters needed to re-run it.
 *
 * The id is matched against evaluations first, then against runs. Only
 * finished runs whose summary reports a failure can be retried; an "eval" run
 * must own exactly one evaluation, otherwise the caller has to name the
 * evaluation explicitly.
 *
 * @param {string} workspace - Workspace directory whose local archive is read.
 * @param {string} targetId - Id of a run or an evaluation.
 * @returns {Promise<object>} Retry target (source id/kind, workflow, candidate
 *   id, candidate run id, samples, optional budget, and the currently active
 *   candidate id to restore afterwards).
 * @throws {UsageError} When the target is unknown, unfinished, not failed, or
 *   lacks the metadata required to re-run it.
 */
async function resolveLocalRetryTarget(workspace, targetId) {
    const archive = await loadLocalArchive(workspace);
    const matchesTarget = (entry) => entry.id === targetId;

    const matchedEvaluation = archive.evaluations.find(matchesTarget);
    if (matchedEvaluation) {
        const owningRun = archive.runs.find((entry) => entry.id === matchedEvaluation.runId) ?? null;
        return localEvaluationRetryTarget(archive, matchedEvaluation, owningRun, "evaluation", targetId);
    }

    const run = archive.runs.find(matchesTarget);
    if (!run) {
        throw new UsageError(`Run or evaluation not found: ${targetId}`);
    }
    if (run.status !== "finished") {
        throw new UsageError(`Run ${run.id} is ${run.status}; wait for it to finish before retrying.`);
    }
    if (!runSummaryFailed(run)) {
        throw new UsageError(`Run ${run.id} did not fail; use workbench ${run.workflow} to intentionally run it again.`);
    }

    if (run.workflow === "eval") {
        const linkedEvaluations = archive.evaluations.filter((entry) => entry.runId === run.id);
        if (linkedEvaluations.length === 0) {
            throw new UsageError(`Run ${run.id} has no evaluation record to retry.`);
        }
        if (linkedEvaluations.length !== 1) {
            throw new UsageError(`Run ${run.id} has multiple evaluations; retry a specific evaluation id instead.`);
        }
        return localEvaluationRetryTarget(archive, linkedEvaluations[0], run, "run", targetId);
    }

    // Any other workflow is retried as an improve run from its base candidate.
    const { candidateId, candidateRunId, samples, budget } = run;
    if (!candidateId || !candidateRunId) {
        throw new UsageError(`Run ${run.id} is missing retry metadata; use workbench improve --from with an explicit candidate id.`);
    }
    return {
        sourceId: targetId,
        sourceKind: "run",
        workflow: "improve",
        candidateId,
        candidateRunId,
        samples,
        budget,
        preserveActiveId: archive.activeId,
    };
}
|
|
609
|
+
/**
 * Build the retry target for a failed evaluation.
 *
 * @param {object} snapshot - Local archive snapshot (`candidates`, `activeId`).
 * @param {object} evaluation - Evaluation record being retried.
 * @param {object|null} run - Run that owns the evaluation, if one was found.
 * @param {string} sourceKind - How the user addressed the target ("run" or "evaluation").
 * @param {string} sourceId - The id the user supplied.
 * @returns {object} Retry target with workflow "eval".
 * @throws {UsageError} When the evaluation did not fail, its candidate is not
 *   in the snapshot, or no candidate run id is recorded.
 */
function localEvaluationRetryTarget(snapshot, evaluation, run, sourceKind, sourceId) {
    const failed = evaluationScorecardFailed(evaluation, run);
    if (!failed) {
        throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench eval to intentionally run it again.`);
    }

    const candidateKnown = snapshot.candidates.some((entry) => entry.id === evaluation.candidateId);
    if (!candidateKnown) {
        throw new UsageError(`Candidate not found for evaluation ${evaluation.id}: ${evaluation.candidateId}`);
    }

    // Prefer the id stored on the evaluation; fall back to the owning run.
    const candidateRunId = evaluation.candidateRunId ?? run?.candidateRunId;
    if (!candidateRunId) {
        throw new UsageError(`Evaluation ${evaluation.id} is missing its candidate run configuration.`);
    }

    // `||` on purpose: a recorded count of 0 also falls through to the next source.
    const samples = evaluation.sampleCount || run?.samples || 1;

    return {
        sourceId,
        sourceKind,
        workflow: "eval",
        candidateId: evaluation.candidateId,
        candidateRunId,
        samples,
        preserveActiveId: snapshot.activeId,
    };
}
|
|
630
|
+
/**
 * Point the local archive's active candidate back at `activeId`.
 *
 * Nothing is written when the archive already has that active id, or when a
 * non-empty `activeId` no longer refers to a candidate in the archive.
 *
 * @param {string} workspace - Workspace directory whose archive is updated.
 * @param {string|null|undefined} activeId - Active candidate id to restore.
 * @returns {Promise<void>}
 */
async function preserveLocalActiveCandidate(workspace, activeId) {
    const current = await loadLocalArchive(workspace);
    const targetMissing = Boolean(activeId) &&
        !current.candidates.some((candidate) => candidate.id === activeId);
    if (targetMissing || current.activeId === activeId) {
        return;
    }
    const restored = setLocalActive(current, activeId);
    await saveLocalArchive(workspace, restored);
}
|
|
641
|
+
/**
 * Decide whether an evaluation counts as failed for retry purposes.
 *
 * @param {object} evaluation - Evaluation record (`errorSampleCount`, `status`).
 * @param {object|null|undefined} run - Owning run summary, if any.
 * @returns {boolean} True when any sample errored, the status is not
 *   "completed", or the run summary itself reports a failure.
 */
function evaluationScorecardFailed(evaluation, run) {
    if (evaluation.errorSampleCount > 0) {
        return true;
    }
    if (evaluation.status !== "completed") {
        return true;
    }
    return runSummaryFailed(run);
}
|
|
646
|
+
/**
 * Report whether a run summary ended in a failure state.
 *
 * @param {object|null|undefined} run - Run summary; may be absent.
 * @returns {boolean} True when `run.outcome` is "error" or "cancelled".
 */
function runSummaryFailed(run) {
    const outcome = run?.outcome;
    switch (outcome) {
        case "error":
        case "cancelled":
            return true;
        default:
            return false;
    }
}
|
|
649
|
+
/**
 * Create an io bundle whose stdout is captured in memory instead of being
 * written to the real stream; `stdin` and `stderr` are passed through.
 *
 * Chunks are kept as Buffers and decoded once in `stdoutText()`, so a
 * multi-byte UTF-8 character that arrives split across two writes is decoded
 * correctly rather than turning into replacement characters.
 *
 * @param {{ stdin: unknown, stderr: unknown }} io - The caller's io bundle.
 * @returns {{ io: { stdin: unknown, stdout: Writable, stderr: unknown }, stdoutText: () => string }}
 *   The substitute io plus an accessor for everything written to stdout so far.
 */
function createCapturingIo(io) {
    const chunks = [];
    const stdout = new Writable({
        write(chunk, _encoding, callback) {
            chunks.push(Buffer.isBuffer(chunk) ? chunk : Buffer.from(String(chunk), "utf8"));
            callback();
        },
    });
    return {
        io: {
            stdin: io.stdin,
            stdout,
            stderr: io.stderr,
        },
        stdoutText: () => Buffer.concat(chunks).toString("utf8"),
    };
}
|
|
666
|
+
/**
 * Interpret captured command output as JSON.
 *
 * @param {string} value - Raw captured text.
 * @returns {unknown} `{}` for blank input, the parsed JSON value when the
 *   trimmed text is valid JSON, otherwise `{ output: <trimmed text> }`.
 */
function parseCapturedJson(value) {
    const text = value.trim();
    if (text.length === 0) {
        return {};
    }
    let parsedValue;
    try {
        parsedValue = JSON.parse(text);
    }
    catch {
        // Not JSON: hand the raw text back under `output`.
        return { output: text };
    }
    return parsedValue;
}
|
|
678
|
+
/**
 * Extract a `{ command, note }` view hint from a delegated command's output.
 *
 * @param {unknown} value - The `localView` field of the command output.
 * @returns {{ command: string, note: string } | undefined} The hint when both
 *   fields normalize to non-empty values via `stringValue`, else `undefined`.
 */
function localRetryViewHint(value) {
    const hint = readRecord(value);
    const command = stringValue(hint?.command);
    const note = stringValue(hint?.note);
    if (!command || !note) {
        return undefined;
    }
    return { command, note };
}
|
|
684
|
+
/**
 * Set `result[key]` to the normalized string form of `value`, leaving
 * `result` untouched when `stringValue` yields nothing usable.
 *
 * @param {object} result - Retry result object, mutated in place.
 * @param {string} key - Property name to assign.
 * @param {unknown} value - Candidate value taken from the command output.
 * @returns {void}
 */
function assignRetryResultString(result, key, value) {
    const text = stringValue(value);
    if (!text) {
        return;
    }
    result[key] = text;
}
|
|
496
690
|
async function localRun(argv, io, runtimeOptions) {
|
|
497
691
|
const parsed = parseArgs(argv);
|
|
498
|
-
rejectUnknownFlags(parsed, new Set(["dir", "
|
|
692
|
+
rejectUnknownFlags(parsed, new Set(["dir", "runs", "from", "budget", "samples", "rerun", "json"]));
|
|
499
693
|
const budget = parsePositiveInt(parsed.flags.budget, 1, "budget");
|
|
500
694
|
const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
|
|
501
695
|
const sourceArg = resolveSourceDir(parsed);
|
|
502
696
|
const projectSource = await readLocalProjectSource(sourceArg, {
|
|
503
|
-
|
|
697
|
+
runId: singleRequestedRunId(asOptionalString(parsed.flags.runs), "workbench improve"),
|
|
504
698
|
});
|
|
505
699
|
const workspace = projectSource.dir;
|
|
506
|
-
if (!projectSource.spec.
|
|
507
|
-
throw new UsageError("
|
|
700
|
+
if (!projectSource.spec.improve || !projectSource.spec.candidate.improve) {
|
|
701
|
+
throw new UsageError("Candidate improve configuration is required for workbench improve.");
|
|
508
702
|
}
|
|
509
703
|
const executionProject = await resolveLocalProjectForExecution(workspace, projectSource.specSource);
|
|
510
704
|
const { spec, adapterManifests } = executionProject;
|
|
@@ -522,10 +716,8 @@ async function localRun(argv, io, runtimeOptions) {
|
|
|
522
716
|
});
|
|
523
717
|
const environmentRefs = await ensureLocalDockerfileEnvironments(workspace, spec, engineCases);
|
|
524
718
|
const benchmarkFingerprint = await readLocalBenchmarkFingerprint(workspace);
|
|
525
|
-
const
|
|
526
|
-
const
|
|
527
|
-
let snapshot = await loadLocalArchive(workspace);
|
|
528
|
-
const baseSubject = await ensureLocalImproveBaseSubject({
|
|
719
|
+
const executionFingerprint = localRunExecutionFingerprint(projectSource);
|
|
720
|
+
const baseCandidate = await ensureLocalImproveBaseCandidate({
|
|
529
721
|
parsed,
|
|
530
722
|
sourceArg,
|
|
531
723
|
workspace,
|
|
@@ -534,9 +726,47 @@ async function localRun(argv, io, runtimeOptions) {
|
|
|
534
726
|
io,
|
|
535
727
|
runtimeOptions,
|
|
536
728
|
});
|
|
537
|
-
let
|
|
729
|
+
let snapshot = await loadLocalArchive(workspace);
|
|
730
|
+
if (parsed.flags.rerun !== true) {
|
|
731
|
+
const reusableRun = findReusableLocalImproveRun(snapshot.runs, {
|
|
732
|
+
benchmarkFingerprint,
|
|
733
|
+
candidateId: baseCandidate.id,
|
|
734
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
735
|
+
executionFingerprint,
|
|
736
|
+
budget,
|
|
737
|
+
samples,
|
|
738
|
+
});
|
|
739
|
+
if (reusableRun) {
|
|
740
|
+
const evaluation = snapshot.evaluations.find((entry) => entry.runId === reusableRun.id) ?? null;
|
|
741
|
+
const outputCandidateId = reusableRun.outputCandidateId ?? reusableRun.candidateId ?? baseCandidate.id;
|
|
742
|
+
const outputCandidate = readLocalCandidate(snapshot, outputCandidateId);
|
|
743
|
+
const activeCandidate = snapshot.activeId
|
|
744
|
+
? readLocalCandidate(snapshot, snapshot.activeId)
|
|
745
|
+
: null;
|
|
746
|
+
const result = {
|
|
747
|
+
ok: true,
|
|
748
|
+
reused: true,
|
|
749
|
+
runId: reusableRun.id,
|
|
750
|
+
evaluationId: evaluation?.id ?? null,
|
|
751
|
+
outputCandidateId,
|
|
752
|
+
outputCandidate,
|
|
753
|
+
activeCandidateId: snapshot.activeId,
|
|
754
|
+
activeCandidate,
|
|
755
|
+
completedJobCount: 0,
|
|
756
|
+
failedJobCount: 0,
|
|
757
|
+
localView: localDevViewHint(workspace, reusableRun.id),
|
|
758
|
+
};
|
|
759
|
+
writeOutput(result, parsed, io, () => `Reused improve run ${reusableRun.id}. Use --rerun to intentionally run it again.`);
|
|
760
|
+
return 0;
|
|
761
|
+
}
|
|
762
|
+
}
|
|
763
|
+
const runId = `run_local_${Date.now().toString(36)}`;
|
|
764
|
+
const startedAt = new Date().toISOString();
|
|
765
|
+
let currentBaseId = baseCandidate.id;
|
|
766
|
+
let outputCandidateId = null;
|
|
538
767
|
let completedJobCount = 0;
|
|
539
768
|
let failedJobCount = 0;
|
|
769
|
+
let attemptsExecuted = 0;
|
|
540
770
|
const failedJobs = [];
|
|
541
771
|
const events = [
|
|
542
772
|
createLocalEvent("run_started", startedAt, {
|
|
@@ -544,232 +774,313 @@ async function localRun(argv, io, runtimeOptions) {
|
|
|
544
774
|
detail: { budget, samples, strategy: "greedy" },
|
|
545
775
|
}),
|
|
546
776
|
];
|
|
547
|
-
const
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
spec,
|
|
576
|
-
|
|
577
|
-
purpose: "improve",
|
|
578
|
-
now: new Date().toISOString(),
|
|
579
|
-
baseFiles,
|
|
580
|
-
traceFiles: subjectRevisionTraceFiles,
|
|
581
|
-
...(environmentRefs.defaultRef ? { environmentRef: environmentRefs.defaultRef } : {}),
|
|
582
|
-
baseId: activeSubject.id,
|
|
583
|
-
})[0];
|
|
584
|
-
const subjectRevisionJobs = await executeLocalDevelopmentDag({
|
|
585
|
-
jobs: [plannedSubjectRevision],
|
|
586
|
-
spec,
|
|
587
|
-
adapterManifests,
|
|
588
|
-
adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
|
|
589
|
-
baseFiles,
|
|
590
|
-
engineResolveFiles,
|
|
591
|
-
engineCases,
|
|
592
|
-
traceFiles: subjectRevisionTraceFiles,
|
|
593
|
-
capacity: devCapacity,
|
|
777
|
+
const runningRun = {
|
|
778
|
+
id: runId,
|
|
779
|
+
workflow: "improve",
|
|
780
|
+
benchmarkFingerprint,
|
|
781
|
+
status: "running",
|
|
782
|
+
candidateId: baseCandidate.id,
|
|
783
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
784
|
+
candidateRunName: projectSource.spec.candidate.selectedRunName,
|
|
785
|
+
startedAt,
|
|
786
|
+
improver: formatSpecImprover(spec),
|
|
787
|
+
engineRun: spec.engineRun.use,
|
|
788
|
+
strategy: "greedy",
|
|
789
|
+
budget,
|
|
790
|
+
repairBudget: 0,
|
|
791
|
+
attemptsRequested: budget,
|
|
792
|
+
attemptsExecuted: 0,
|
|
793
|
+
samples,
|
|
794
|
+
executionFingerprint,
|
|
795
|
+
activeCandidateId: snapshot.activeId,
|
|
796
|
+
outputCandidateId: null,
|
|
797
|
+
};
|
|
798
|
+
snapshot = upsertLocalRun(snapshot, runningRun, events);
|
|
799
|
+
await saveLocalArchive(workspace, snapshot);
|
|
800
|
+
try {
|
|
801
|
+
const devCapacity = await localDevelopmentCapacity(workspace);
|
|
802
|
+
const baselineTraceJobs = selectLocalOptimizerBaselineTraceJobs(snapshot, await readLocalJobs(workspace), {
|
|
803
|
+
benchmarkFingerprint,
|
|
804
|
+
candidateId: baseCandidate.id,
|
|
805
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
806
|
+
executionFingerprint,
|
|
594
807
|
});
|
|
595
|
-
const
|
|
596
|
-
const
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
808
|
+
const runTraceJobs = [];
|
|
809
|
+
const attempts = budget;
|
|
810
|
+
for (let attemptIndex = 0; attemptIndex < attempts; attemptIndex += 1) {
|
|
811
|
+
snapshot = await loadLocalArchive(workspace);
|
|
812
|
+
const activeCandidate = readLocalCandidate(snapshot, currentBaseId);
|
|
813
|
+
const baseFiles = filterCandidateSourceFiles(readLocalCandidateFiles(snapshot, activeCandidate.id));
|
|
814
|
+
if (baseFiles.length === 0) {
|
|
815
|
+
throw new UsageError("Candidate snapshot must include at least one file.");
|
|
816
|
+
}
|
|
817
|
+
const candidateRevisionTraceFiles = createOptimizerTraceInputFiles({
|
|
818
|
+
jobs: [...baselineTraceJobs, ...runTraceJobs],
|
|
819
|
+
});
|
|
820
|
+
const candidateId = `candidate_${runId.replace(/^run_/u, "")}_${String(attemptIndex + 1).padStart(3, "0")}`;
|
|
821
|
+
const plannedCandidateRevision = planWorkbenchExecutionJobsForPurpose({
|
|
602
822
|
ownerUserId: "local",
|
|
603
823
|
projectId: "local",
|
|
604
824
|
runId,
|
|
605
|
-
|
|
825
|
+
candidateId,
|
|
606
826
|
attemptIndex,
|
|
607
827
|
samples,
|
|
608
|
-
now: new Date().toISOString(),
|
|
609
828
|
caseIds,
|
|
610
829
|
engineCases,
|
|
611
830
|
spec,
|
|
612
|
-
environmentRefsByCase: environmentRefs.byCase,
|
|
613
831
|
workflow: "improve",
|
|
614
|
-
purpose: "
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
832
|
+
purpose: "improve",
|
|
833
|
+
now: new Date().toISOString(),
|
|
834
|
+
baseFiles,
|
|
835
|
+
traceFiles: candidateRevisionTraceFiles,
|
|
836
|
+
...(environmentRefs.defaultRef ? { environmentRef: environmentRefs.defaultRef } : {}),
|
|
837
|
+
baseId: activeCandidate.id,
|
|
838
|
+
})[0];
|
|
839
|
+
const candidateRevisionJobs = await executeLocalDevelopmentDag({
|
|
840
|
+
jobs: [plannedCandidateRevision],
|
|
618
841
|
spec,
|
|
619
842
|
adapterManifests,
|
|
620
843
|
adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
|
|
621
|
-
baseFiles
|
|
844
|
+
baseFiles,
|
|
622
845
|
engineResolveFiles,
|
|
623
846
|
engineCases,
|
|
847
|
+
traceFiles: candidateRevisionTraceFiles,
|
|
624
848
|
capacity: devCapacity,
|
|
625
849
|
});
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
850
|
+
const candidateRevision = candidateRevisionJobs[0];
|
|
851
|
+
const completedJobs = [candidateRevision];
|
|
852
|
+
if (candidateRevision.status === "succeeded") {
|
|
853
|
+
const candidateRevisionFiles = completedJobOutputFiles(candidateRevision).length > 0
|
|
854
|
+
? normalizeSurfaceFiles(completedJobOutputFiles(candidateRevision).filter((file) => !file.path.startsWith(".workbench/")))
|
|
855
|
+
: baseFiles;
|
|
856
|
+
const attemptJobs = planWorkbenchExecutionJobsForPurpose({
|
|
857
|
+
ownerUserId: "local",
|
|
858
|
+
projectId: "local",
|
|
859
|
+
runId,
|
|
860
|
+
candidateId,
|
|
861
|
+
attemptIndex,
|
|
862
|
+
samples,
|
|
863
|
+
now: new Date().toISOString(),
|
|
864
|
+
caseIds,
|
|
865
|
+
engineCases,
|
|
866
|
+
spec,
|
|
867
|
+
environmentRefsByCase: environmentRefs.byCase,
|
|
868
|
+
workflow: "improve",
|
|
869
|
+
purpose: "attempt",
|
|
870
|
+
});
|
|
871
|
+
const dagJobs = await executeLocalDevelopmentDag({
|
|
872
|
+
jobs: [candidateRevision, ...attemptJobs],
|
|
873
|
+
spec,
|
|
874
|
+
adapterManifests,
|
|
875
|
+
adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
|
|
876
|
+
baseFiles: candidateRevisionFiles,
|
|
877
|
+
engineResolveFiles,
|
|
878
|
+
engineCases,
|
|
879
|
+
capacity: devCapacity,
|
|
880
|
+
});
|
|
881
|
+
completedJobs.splice(0, completedJobs.length, ...dagJobs);
|
|
882
|
+
}
|
|
883
|
+
runTraceJobs.push(...completedJobs);
|
|
884
|
+
const materialized = materializeWorkbenchRunResult({
|
|
885
|
+
runId,
|
|
886
|
+
benchmarkFingerprint,
|
|
887
|
+
sourceYaml: projectSource.specSource,
|
|
888
|
+
benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
|
|
889
|
+
startedAt,
|
|
890
|
+
spec,
|
|
891
|
+
jobs: completedJobs,
|
|
892
|
+
previousCandidate: activeCandidate,
|
|
893
|
+
existingCandidateCount: snapshot.candidates.length,
|
|
894
|
+
});
|
|
895
|
+
for (const candidate of materialized.candidates) {
|
|
896
|
+
outputCandidateId = candidate.id;
|
|
897
|
+
snapshot = upsertLocalCandidate(snapshot, candidate, materialized.candidateFiles[candidate.id] ?? []);
|
|
898
|
+
events.push(createLocalEvent("candidate_created", candidate.createdAt, {
|
|
899
|
+
runId,
|
|
900
|
+
candidateId: candidate.id,
|
|
901
|
+
baseId: candidate.baseId,
|
|
902
|
+
status: candidate.status,
|
|
903
|
+
metrics: evaluationMeanMetrics(candidate.eval),
|
|
904
|
+
}));
|
|
905
|
+
}
|
|
906
|
+
for (const evaluation of materialized.evaluations) {
|
|
907
|
+
snapshot = upsertLocalEvaluation(snapshot, evaluation);
|
|
908
|
+
}
|
|
909
|
+
snapshot = setLocalActive(snapshot, materialized.activeCandidateId);
|
|
910
|
+
currentBaseId = materialized.activeCandidateId ?? currentBaseId;
|
|
911
|
+
completedJobCount += materialized.completedJobCount;
|
|
912
|
+
failedJobCount += materialized.failedJobCount;
|
|
913
|
+
failedJobs.push(...completedJobs
|
|
914
|
+
.filter((job) => job.status === "failed")
|
|
915
|
+
.map((job) => ({
|
|
916
|
+
id: job.id,
|
|
917
|
+
purpose: workbenchExecutionPurpose(job),
|
|
918
|
+
error: job.error ?? "Job failed without an error message.",
|
|
919
|
+
})));
|
|
920
|
+
events.push(createLocalEvent("active_changed", new Date().toISOString(), {
|
|
643
921
|
runId,
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
status:
|
|
647
|
-
metrics:
|
|
922
|
+
candidateId: materialized.activeCandidateId ?? undefined,
|
|
923
|
+
activeId: materialized.activeCandidateId ?? undefined,
|
|
924
|
+
status: materialized.selectedCandidate?.status,
|
|
925
|
+
metrics: evaluationMeanMetrics(materialized.selectedCandidate?.eval),
|
|
648
926
|
}));
|
|
927
|
+
await saveLocalJobs(workspace, completedJobs);
|
|
928
|
+
await saveLocalArchive(workspace, snapshot);
|
|
929
|
+
attemptsExecuted += 1;
|
|
649
930
|
}
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
.
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
931
|
+
snapshot = await loadLocalArchive(workspace);
|
|
932
|
+
const finishedAt = new Date().toISOString();
|
|
933
|
+
const run = {
|
|
934
|
+
id: runId,
|
|
935
|
+
workflow: "improve",
|
|
936
|
+
benchmarkFingerprint,
|
|
937
|
+
status: "finished",
|
|
938
|
+
candidateId: baseCandidate.id,
|
|
939
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
940
|
+
candidateRunName: projectSource.spec.candidate.selectedRunName,
|
|
941
|
+
startedAt,
|
|
942
|
+
finishedAt,
|
|
943
|
+
durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
|
|
944
|
+
improver: formatSpecImprover(spec),
|
|
945
|
+
engineRun: spec.engineRun.use,
|
|
946
|
+
strategy: "greedy",
|
|
947
|
+
budget,
|
|
948
|
+
repairBudget: 0,
|
|
949
|
+
attemptsRequested: budget,
|
|
950
|
+
attemptsExecuted,
|
|
951
|
+
samples,
|
|
952
|
+
executionFingerprint,
|
|
953
|
+
stoppedReason: "budget_exhausted",
|
|
954
|
+
outcome: failedJobCount > 0 ? "error" : "ok",
|
|
955
|
+
activeCandidateId: snapshot.activeId,
|
|
956
|
+
outputCandidateId: outputCandidateId ?? snapshot.activeId,
|
|
957
|
+
};
|
|
958
|
+
events.push(createLocalEvent("run_finished", finishedAt, {
|
|
665
959
|
runId,
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
960
|
+
detail: {
|
|
961
|
+
outcome: run.outcome ?? null,
|
|
962
|
+
attemptsExecuted: run.attemptsExecuted,
|
|
963
|
+
durationMs: run.durationMs ?? null,
|
|
964
|
+
},
|
|
670
965
|
}));
|
|
671
|
-
|
|
966
|
+
snapshot = upsertLocalRun(snapshot, run, events.slice(1));
|
|
672
967
|
await saveLocalArchive(workspace, snapshot);
|
|
968
|
+
const outputCandidate = run.outputCandidateId
|
|
969
|
+
? readLocalCandidate(snapshot, run.outputCandidateId)
|
|
970
|
+
: null;
|
|
971
|
+
const activeCandidate = snapshot.activeId
|
|
972
|
+
? readLocalCandidate(snapshot, snapshot.activeId)
|
|
973
|
+
: null;
|
|
974
|
+
const result = {
|
|
975
|
+
ok: failedJobCount === 0,
|
|
976
|
+
runId,
|
|
977
|
+
outputCandidateId: run.outputCandidateId,
|
|
978
|
+
outputCandidate,
|
|
979
|
+
activeCandidateId: snapshot.activeId,
|
|
980
|
+
activeCandidate,
|
|
981
|
+
completedJobCount,
|
|
982
|
+
failedJobCount,
|
|
983
|
+
failedJobs,
|
|
984
|
+
localView: localDevViewHint(workspace, runId),
|
|
985
|
+
};
|
|
986
|
+
writeOutput(result, parsed, io, () => {
|
|
987
|
+
const outputMetricValue = outputCandidate ? formatCandidateEvaluationScore(outputCandidate) : "n/a";
|
|
988
|
+
const activeMetricValue = activeCandidate ? formatCandidateEvaluationScore(activeCandidate) : "n/a";
|
|
989
|
+
const firstFailure = result.failedJobs[0];
|
|
990
|
+
const failureDetail = firstFailure
|
|
991
|
+
? `\nFirst failed job ${firstFailure.id}${firstFailure.purpose ? ` (${firstFailure.purpose})` : ""}: ${firstFailure.error}`
|
|
992
|
+
: "";
|
|
993
|
+
const viewDetail = failedJobCount === 0
|
|
994
|
+
? `\nOpen local view: ${result.localView.command}\n${result.localView.note}`
|
|
995
|
+
: "";
|
|
996
|
+
return `Run ${runId} finished. Output candidate: ${formatLocalCandidateLabel(outputCandidate)} (score: ${outputMetricValue}). Active candidate: ${formatLocalCandidateLabel(activeCandidate)} (score: ${activeMetricValue}).${failureDetail}${viewDetail}`;
|
|
997
|
+
});
|
|
998
|
+
return failedJobCount === 0 ? 0 : 1;
|
|
999
|
+
}
|
|
1000
|
+
catch (error) {
|
|
1001
|
+
await markLocalRunFailed({
|
|
1002
|
+
workspace,
|
|
1003
|
+
run: {
|
|
1004
|
+
...runningRun,
|
|
1005
|
+
attemptsExecuted,
|
|
1006
|
+
outputCandidateId,
|
|
1007
|
+
},
|
|
1008
|
+
startedAt,
|
|
1009
|
+
error,
|
|
1010
|
+
}).catch(() => undefined);
|
|
1011
|
+
throw error;
|
|
673
1012
|
}
|
|
674
|
-
snapshot = await loadLocalArchive(workspace);
|
|
675
|
-
const finishedAt = new Date().toISOString();
|
|
676
|
-
const run = {
|
|
677
|
-
id: runId,
|
|
678
|
-
workflow: "improve",
|
|
679
|
-
benchmarkFingerprint,
|
|
680
|
-
status: "finished",
|
|
681
|
-
startedAt,
|
|
682
|
-
finishedAt,
|
|
683
|
-
durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
|
|
684
|
-
optimizer: formatSpecOptimizer(spec),
|
|
685
|
-
engineRun: spec.engineRun.use,
|
|
686
|
-
strategy: "greedy",
|
|
687
|
-
budget,
|
|
688
|
-
repairBudget: 0,
|
|
689
|
-
attemptsRequested: budget,
|
|
690
|
-
attemptsExecuted: budget,
|
|
691
|
-
samples,
|
|
692
|
-
stoppedReason: "budget_exhausted",
|
|
693
|
-
outcome: failedJobCount > 0 ? "error" : "ok",
|
|
694
|
-
};
|
|
695
|
-
events.push(createLocalEvent("run_finished", finishedAt, {
|
|
696
|
-
runId,
|
|
697
|
-
detail: {
|
|
698
|
-
outcome: run.outcome ?? null,
|
|
699
|
-
attemptsExecuted: run.attemptsExecuted,
|
|
700
|
-
durationMs: run.durationMs ?? null,
|
|
701
|
-
},
|
|
702
|
-
}));
|
|
703
|
-
snapshot = appendLocalRun(snapshot, run, events);
|
|
704
|
-
await saveLocalArchive(workspace, snapshot);
|
|
705
|
-
const selected = snapshot.activeId
|
|
706
|
-
? readLocalSubject(snapshot, snapshot.activeId)
|
|
707
|
-
: null;
|
|
708
|
-
const result = {
|
|
709
|
-
ok: failedJobCount === 0,
|
|
710
|
-
runId,
|
|
711
|
-
activeSubjectId: snapshot.activeId,
|
|
712
|
-
selectedSubject: selected,
|
|
713
|
-
completedJobCount,
|
|
714
|
-
failedJobCount,
|
|
715
|
-
failedJobs,
|
|
716
|
-
localView: localDevViewHint(workspace, runId),
|
|
717
|
-
};
|
|
718
|
-
writeOutput(result, parsed, io, () => {
|
|
719
|
-
const metricValue = selected?.metrics?.score ?? "n/a";
|
|
720
|
-
const firstFailure = result.failedJobs[0];
|
|
721
|
-
const failureDetail = firstFailure
|
|
722
|
-
? `\nFirst failed job ${firstFailure.id}${firstFailure.purpose ? ` (${firstFailure.purpose})` : ""}: ${firstFailure.error}`
|
|
723
|
-
: "";
|
|
724
|
-
const viewDetail = failedJobCount === 0
|
|
725
|
-
? `\nOpen local view: ${result.localView.command}\n${result.localView.note}`
|
|
726
|
-
: "";
|
|
727
|
-
return `Run ${runId} finished. Active subject: ${snapshot.activeId ?? "none"} (score: ${metricValue}).${failureDetail}${viewDetail}`;
|
|
728
|
-
});
|
|
729
|
-
return failedJobCount === 0 ? 0 : 1;
|
|
730
1013
|
}
|
|
731
|
-
async function
|
|
1014
|
+
async function ensureLocalImproveBaseCandidate(args) {
|
|
732
1015
|
let snapshot = await loadLocalArchive(args.workspace);
|
|
733
1016
|
const explicitBase = asOptionalString(args.parsed.flags.from);
|
|
734
1017
|
const benchmarkFingerprint = await readLocalBenchmarkFingerprint(args.workspace);
|
|
735
1018
|
if (explicitBase) {
|
|
736
|
-
let
|
|
737
|
-
if (
|
|
738
|
-
throw new UsageError(`Base
|
|
1019
|
+
let candidate = readLocalCandidate(snapshot, explicitBase);
|
|
1020
|
+
if (candidate.benchmarkFingerprint !== benchmarkFingerprint) {
|
|
1021
|
+
throw new UsageError(`Base candidate ${explicitBase} belongs to benchmark ${candidate.benchmarkFingerprint}, not ${benchmarkFingerprint}.`);
|
|
739
1022
|
}
|
|
740
|
-
if (!
|
|
741
|
-
throw new UsageError(`Base
|
|
1023
|
+
if (!candidate.candidateFingerprint) {
|
|
1024
|
+
throw new UsageError(`Base candidate ${explicitBase} is missing a candidate fingerprint.`);
|
|
742
1025
|
}
|
|
743
|
-
if (
|
|
744
|
-
const code = await
|
|
1026
|
+
if (candidate.status !== "evaluated" && !candidate.eval) {
|
|
1027
|
+
const code = await localEvaluateCandidate([
|
|
1028
|
+
"--dir",
|
|
1029
|
+
args.workspace,
|
|
1030
|
+
"--candidate",
|
|
1031
|
+
explicitBase,
|
|
1032
|
+
"--runs",
|
|
1033
|
+
args.projectSource.spec.candidate.selectedRunId,
|
|
1034
|
+
"--samples",
|
|
1035
|
+
String(args.samples),
|
|
1036
|
+
...(args.parsed.flags.rerun === true ? ["--rerun"] : []),
|
|
1037
|
+
"--json",
|
|
1038
|
+
], createSilentIo(args.io), args.runtimeOptions);
|
|
745
1039
|
if (code !== 0) {
|
|
746
|
-
throw new UsageError(`Base
|
|
1040
|
+
throw new UsageError(`Base candidate ${explicitBase} eval failed; improve was not started.`);
|
|
747
1041
|
}
|
|
748
1042
|
snapshot = await loadLocalArchive(args.workspace);
|
|
749
|
-
|
|
1043
|
+
candidate = readLocalCandidate(snapshot, explicitBase);
|
|
750
1044
|
}
|
|
751
|
-
return
|
|
1045
|
+
return candidate;
|
|
752
1046
|
}
|
|
753
|
-
const
|
|
754
|
-
const existing = snapshot.
|
|
755
|
-
|
|
756
|
-
(
|
|
1047
|
+
const candidateFingerprint = localCandidateFingerprint(args.projectSource);
|
|
1048
|
+
const existing = snapshot.candidates.find((candidate) => candidate.benchmarkFingerprint === benchmarkFingerprint &&
|
|
1049
|
+
candidate.candidateFingerprint === candidateFingerprint &&
|
|
1050
|
+
(candidate.status === "evaluated" || Boolean(candidate.eval)));
|
|
757
1051
|
if (existing) {
|
|
758
1052
|
return existing;
|
|
759
1053
|
}
|
|
760
1054
|
const evalArgs = args.parsed.positionals.length > 0
|
|
761
|
-
? [
|
|
762
|
-
|
|
763
|
-
|
|
1055
|
+
? [
|
|
1056
|
+
args.sourceArg,
|
|
1057
|
+
"--runs",
|
|
1058
|
+
args.projectSource.spec.candidate.selectedRunId,
|
|
1059
|
+
"--samples",
|
|
1060
|
+
String(args.samples),
|
|
1061
|
+
...(args.parsed.flags.rerun === true ? ["--rerun"] : []),
|
|
1062
|
+
"--json",
|
|
1063
|
+
]
|
|
1064
|
+
: [
|
|
1065
|
+
"--dir",
|
|
1066
|
+
args.workspace,
|
|
1067
|
+
"--runs",
|
|
1068
|
+
args.projectSource.spec.candidate.selectedRunId,
|
|
1069
|
+
"--samples",
|
|
1070
|
+
String(args.samples),
|
|
1071
|
+
...(args.parsed.flags.rerun === true ? ["--rerun"] : []),
|
|
1072
|
+
"--json",
|
|
1073
|
+
];
|
|
1074
|
+
const code = await localEvaluateCandidate(evalArgs, createSilentIo(args.io), args.runtimeOptions);
|
|
764
1075
|
if (code !== 0) {
|
|
765
|
-
throw new UsageError("Parent
|
|
1076
|
+
throw new UsageError("Parent candidate eval failed; improve was not started.");
|
|
766
1077
|
}
|
|
767
1078
|
snapshot = await loadLocalArchive(args.workspace);
|
|
768
|
-
const evaluated = snapshot.
|
|
769
|
-
|
|
770
|
-
(
|
|
1079
|
+
const evaluated = snapshot.candidates.find((candidate) => candidate.benchmarkFingerprint === benchmarkFingerprint &&
|
|
1080
|
+
candidate.candidateFingerprint === candidateFingerprint &&
|
|
1081
|
+
(candidate.status === "evaluated" || Boolean(candidate.eval)));
|
|
771
1082
|
if (!evaluated) {
|
|
772
|
-
throw new UsageError("Parent
|
|
1083
|
+
throw new UsageError("Parent candidate eval did not produce an evaluated candidate.");
|
|
773
1084
|
}
|
|
774
1085
|
return evaluated;
|
|
775
1086
|
}
|
|
@@ -785,13 +1096,62 @@ function createSilentIo(io) {
|
|
|
785
1096
|
stderr: io.stderr,
|
|
786
1097
|
};
|
|
787
1098
|
}
|
|
788
|
-
|
|
1099
|
+
function selectLocalOptimizerBaselineTraceJobs(snapshot, jobs, target) {
|
|
1100
|
+
const runById = new Map(snapshot.runs.map((run) => [run.id, run]));
|
|
1101
|
+
const evaluation = snapshot.evaluations
|
|
1102
|
+
.filter((entry) => {
|
|
1103
|
+
const run = runById.get(entry.runId);
|
|
1104
|
+
return entry.benchmarkFingerprint === target.benchmarkFingerprint &&
|
|
1105
|
+
entry.candidateId === target.candidateId &&
|
|
1106
|
+
entry.candidateRunId === target.candidateRunId &&
|
|
1107
|
+
run?.executionFingerprint === target.executionFingerprint;
|
|
1108
|
+
})
|
|
1109
|
+
.sort((left, right) => right.updatedAt.localeCompare(left.updatedAt) ||
|
|
1110
|
+
right.runId.localeCompare(left.runId))[0] ?? null;
|
|
1111
|
+
if (!evaluation) {
|
|
1112
|
+
return [];
|
|
1113
|
+
}
|
|
1114
|
+
return jobs.filter((job) => job.runId === evaluation.runId);
|
|
1115
|
+
}
|
|
1116
|
+
async function localEvaluateCandidate(argv, io, runtimeOptions) {
|
|
789
1117
|
void runtimeOptions;
|
|
790
1118
|
const parsed = parseArgs(argv);
|
|
791
|
-
rejectUnknownFlags(parsed, new Set(["dir", "
|
|
1119
|
+
rejectUnknownFlags(parsed, new Set(["dir", "candidate", "runs", "samples", "rerun", "json"]));
|
|
792
1120
|
const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
|
|
793
1121
|
const sourceArg = resolveSourceDir(parsed);
|
|
794
|
-
const
|
|
1122
|
+
const runsFlag = asOptionalString(parsed.flags.runs);
|
|
1123
|
+
const defaultProjectSource = await readLocalProjectSource(sourceArg);
|
|
1124
|
+
const selectedRunIds = resolveCandidateRunSelection(defaultProjectSource, runsFlag);
|
|
1125
|
+
if (selectedRunIds.length > 1) {
|
|
1126
|
+
let failed = 0;
|
|
1127
|
+
for (const runId of selectedRunIds) {
|
|
1128
|
+
const args = [
|
|
1129
|
+
"--dir",
|
|
1130
|
+
defaultProjectSource.dir,
|
|
1131
|
+
"--runs",
|
|
1132
|
+
runId,
|
|
1133
|
+
"--samples",
|
|
1134
|
+
String(samples),
|
|
1135
|
+
...(readOptionalCandidateFlag(parsed) ? ["--candidate", readOptionalCandidateFlag(parsed)] : []),
|
|
1136
|
+
...(parsed.flags.rerun === true ? ["--rerun"] : []),
|
|
1137
|
+
"--json",
|
|
1138
|
+
];
|
|
1139
|
+
const code = await localEvaluateCandidate(args, createSilentIo(io), runtimeOptions);
|
|
1140
|
+
if (code !== 0) {
|
|
1141
|
+
failed += 1;
|
|
1142
|
+
}
|
|
1143
|
+
}
|
|
1144
|
+
writeOutput({
|
|
1145
|
+
ok: failed === 0,
|
|
1146
|
+
candidateId: defaultProjectSource.candidateName,
|
|
1147
|
+
candidateRunIds: selectedRunIds,
|
|
1148
|
+
failedRunCount: failed,
|
|
1149
|
+
}, parsed, io, () => `Evaluated ${selectedRunIds.length} candidate run(s); ${failed} failed.`);
|
|
1150
|
+
return failed === 0 ? 0 : 1;
|
|
1151
|
+
}
|
|
1152
|
+
const projectSource = selectedRunIds[0] === defaultProjectSource.candidateRunId
|
|
1153
|
+
? defaultProjectSource
|
|
1154
|
+
: await readLocalProjectSource(sourceArg, { runId: selectedRunIds[0] });
|
|
795
1155
|
const workspace = projectSource.dir;
|
|
796
1156
|
const executionProject = await resolveLocalProjectForExecution(workspace, projectSource.specSource);
|
|
797
1157
|
const { spec, adapterManifests } = executionProject;
|
|
@@ -810,114 +1170,367 @@ async function localEvaluateSubject(argv, io, runtimeOptions) {
|
|
|
810
1170
|
const environmentRefs = await ensureLocalDockerfileEnvironments(workspace, spec, engineCases);
|
|
811
1171
|
let snapshot = await loadLocalArchive(workspace);
|
|
812
1172
|
const benchmarkFingerprint = await readLocalBenchmarkFingerprint(workspace);
|
|
813
|
-
const
|
|
814
|
-
const
|
|
815
|
-
const
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
const
|
|
819
|
-
const
|
|
820
|
-
|
|
821
|
-
|
|
1173
|
+
const executionFingerprint = localRunExecutionFingerprint(projectSource);
|
|
1174
|
+
const sourceCandidateFingerprint = localCandidateFingerprint(projectSource);
|
|
1175
|
+
const explicitCandidateId = readOptionalCandidateFlag(parsed);
|
|
1176
|
+
const existingSourceCandidate = snapshot.candidates.find((candidate) => candidate.benchmarkFingerprint === benchmarkFingerprint &&
|
|
1177
|
+
candidate.candidateFingerprint === sourceCandidateFingerprint);
|
|
1178
|
+
const candidateId = explicitCandidateId ?? existingSourceCandidate?.id ?? `candidate_${sourceCandidateFingerprint.slice(0, 12)}`;
|
|
1179
|
+
const existingCandidate = snapshot.candidates.find((candidate) => candidate.id === candidateId);
|
|
1180
|
+
const activeCandidateIdBeforeEval = snapshot.activeId;
|
|
1181
|
+
const selectedCandidateRunId = projectSource.spec.candidate.selectedRunId;
|
|
1182
|
+
const files = filterCandidateSourceFiles(existingCandidate
|
|
1183
|
+
? readLocalCandidateFiles(snapshot, candidateId)
|
|
1184
|
+
: normalizeSurfaceFiles(projectSource.candidateFiles));
|
|
1185
|
+
const evaluationWork = parsed.flags.rerun !== true
|
|
1186
|
+
? await resolveLocalEvaluationWork(workspace, snapshot, {
|
|
1187
|
+
benchmarkFingerprint,
|
|
1188
|
+
candidateId,
|
|
1189
|
+
candidateFingerprint: existingCandidate?.candidateFingerprint ?? sourceCandidateFingerprint,
|
|
1190
|
+
candidateRunId: selectedCandidateRunId,
|
|
1191
|
+
executionFingerprint,
|
|
1192
|
+
samples,
|
|
1193
|
+
caseIds,
|
|
1194
|
+
})
|
|
1195
|
+
: null;
|
|
1196
|
+
const reusableEvaluation = evaluationWork?.reusableEvaluation ?? null;
|
|
1197
|
+
if (reusableEvaluation) {
|
|
1198
|
+
const result = {
|
|
1199
|
+
ok: true,
|
|
1200
|
+
reused: true,
|
|
1201
|
+
runId: reusableEvaluation.runId,
|
|
1202
|
+
evaluation: reusableEvaluation,
|
|
1203
|
+
evaluationId: reusableEvaluation.id,
|
|
1204
|
+
candidateId,
|
|
1205
|
+
completedJobCount: 0,
|
|
1206
|
+
failedJobCount: 0,
|
|
1207
|
+
localView: localDevViewHint(workspace, reusableEvaluation.runId),
|
|
1208
|
+
};
|
|
1209
|
+
writeOutput(result, parsed, io, () => `Reused evaluation ${reusableEvaluation.id}. Use --rerun to intentionally run it again.`);
|
|
1210
|
+
return 0;
|
|
1211
|
+
}
|
|
1212
|
+
const selectedPairs = evaluationWork?.missingPairs.length
|
|
1213
|
+
? evaluationWork.missingPairs
|
|
1214
|
+
: allCaseSamplePairs(caseIds, samples);
|
|
822
1215
|
const runId = `eval_local_${Date.now().toString(36)}`;
|
|
823
|
-
const
|
|
1216
|
+
const evaluatedCandidateId = candidateId;
|
|
824
1217
|
const startedAt = new Date().toISOString();
|
|
825
|
-
const
|
|
826
|
-
ownerUserId: "local",
|
|
827
|
-
projectId: "local",
|
|
1218
|
+
const runStartedEvent = createLocalEvent("run_started", startedAt, {
|
|
828
1219
|
runId,
|
|
829
|
-
|
|
830
|
-
|
|
831
|
-
files,
|
|
832
|
-
now: startedAt,
|
|
833
|
-
baseId: null,
|
|
1220
|
+
candidateId: evaluatedCandidateId,
|
|
1221
|
+
detail: { samples, strategy: "direct" },
|
|
834
1222
|
});
|
|
835
|
-
const
|
|
836
|
-
const attemptJobs = planWorkbenchExecutionJobsForPurpose({
|
|
837
|
-
ownerUserId: "local",
|
|
838
|
-
projectId: "local",
|
|
839
|
-
runId,
|
|
840
|
-
subjectId: evaluatedSubjectId,
|
|
841
|
-
attemptIndex: 0,
|
|
842
|
-
samples,
|
|
843
|
-
now: startedAt,
|
|
844
|
-
caseIds,
|
|
845
|
-
engineCases,
|
|
846
|
-
spec,
|
|
847
|
-
environmentRefsByCase: environmentRefs.byCase,
|
|
848
|
-
workflow: "eval",
|
|
849
|
-
purpose: "attempt",
|
|
850
|
-
});
|
|
851
|
-
const dagJobs = await executeLocalDevelopmentDag({
|
|
852
|
-
jobs: [baseline, ...attemptJobs],
|
|
853
|
-
spec,
|
|
854
|
-
adapterManifests,
|
|
855
|
-
adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
|
|
856
|
-
baseFiles: files,
|
|
857
|
-
engineResolveFiles,
|
|
858
|
-
engineCases,
|
|
859
|
-
capacity: await localDevelopmentCapacity(workspace),
|
|
860
|
-
});
|
|
861
|
-
completedJobs.splice(0, completedJobs.length, ...dagJobs);
|
|
862
|
-
const materialized = materializeWorkbenchRunResult({
|
|
863
|
-
runId,
|
|
864
|
-
benchmarkFingerprint,
|
|
865
|
-
sourceYaml: projectSource.specSource,
|
|
866
|
-
benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
|
|
867
|
-
subjectFingerprint: existingSubject?.subjectFingerprint ?? sourceSubjectFingerprint,
|
|
868
|
-
...(!existingSubject || existingSubject.subjectFingerprint === sourceSubjectFingerprint
|
|
869
|
-
? { subjectSourceFiles: authoredSubjectSourceFiles(projectSource) }
|
|
870
|
-
: {}),
|
|
871
|
-
startedAt,
|
|
872
|
-
spec,
|
|
873
|
-
jobs: completedJobs,
|
|
874
|
-
previousSubject: null,
|
|
875
|
-
existingSubjectCount: snapshot.subjects.length,
|
|
876
|
-
});
|
|
877
|
-
for (const subjectRecord of materialized.subjects) {
|
|
878
|
-
snapshot = upsertLocalSubject(snapshot, subjectRecord, materialized.subjectFiles[subjectRecord.id] ?? []);
|
|
879
|
-
}
|
|
880
|
-
if (materialized.activeSubjectId) {
|
|
881
|
-
snapshot = setLocalActive(snapshot, materialized.activeSubjectId);
|
|
882
|
-
}
|
|
883
|
-
for (const evaluation of materialized.evaluations) {
|
|
884
|
-
snapshot = upsertLocalEvaluation(snapshot, evaluation);
|
|
885
|
-
}
|
|
886
|
-
const finishedAt = new Date().toISOString();
|
|
887
|
-
snapshot = appendLocalRun(snapshot, {
|
|
1223
|
+
const runningRun = {
|
|
888
1224
|
id: runId,
|
|
889
1225
|
workflow: "eval",
|
|
890
1226
|
benchmarkFingerprint,
|
|
891
|
-
status: "
|
|
1227
|
+
status: "running",
|
|
1228
|
+
candidateId: evaluatedCandidateId,
|
|
1229
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
1230
|
+
candidateRunName: projectSource.spec.candidate.selectedRunName,
|
|
892
1231
|
startedAt,
|
|
893
|
-
|
|
894
|
-
durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
|
|
895
|
-
optimizer: "none",
|
|
1232
|
+
improver: "none",
|
|
896
1233
|
engineRun: spec.engineRun.use,
|
|
897
1234
|
strategy: "direct",
|
|
898
1235
|
budget: 1,
|
|
899
1236
|
repairBudget: 0,
|
|
900
1237
|
attemptsRequested: 1,
|
|
901
|
-
attemptsExecuted:
|
|
1238
|
+
attemptsExecuted: 0,
|
|
902
1239
|
samples,
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
1240
|
+
executionFingerprint,
|
|
1241
|
+
activeCandidateId: activeCandidateIdBeforeEval,
|
|
1242
|
+
outputCandidateId: evaluatedCandidateId,
|
|
1243
|
+
};
|
|
1244
|
+
snapshot = upsertLocalRun(snapshot, runningRun, [runStartedEvent]);
|
|
907
1245
|
await saveLocalArchive(workspace, snapshot);
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
1246
|
+
try {
|
|
1247
|
+
const baseline = createRuntimeBaselineCandidateJob({
|
|
1248
|
+
ownerUserId: "local",
|
|
1249
|
+
projectId: "local",
|
|
1250
|
+
runId,
|
|
1251
|
+
candidateId: evaluatedCandidateId,
|
|
1252
|
+
attemptIndex: 0,
|
|
1253
|
+
files,
|
|
1254
|
+
now: startedAt,
|
|
1255
|
+
baseId: null,
|
|
1256
|
+
});
|
|
1257
|
+
const attemptJobs = planWorkbenchExecutionJobsForPurpose({
|
|
1258
|
+
ownerUserId: "local",
|
|
1259
|
+
projectId: "local",
|
|
1260
|
+
runId,
|
|
1261
|
+
candidateId: evaluatedCandidateId,
|
|
1262
|
+
attemptIndex: 0,
|
|
1263
|
+
samples,
|
|
1264
|
+
now: startedAt,
|
|
1265
|
+
caseIds: orderedCaseIdsForPairs(caseIds, selectedPairs),
|
|
1266
|
+
sampleIndexesByCase: sampleIndexesByCase(selectedPairs),
|
|
1267
|
+
engineCases,
|
|
1268
|
+
spec,
|
|
1269
|
+
environmentRefsByCase: environmentRefs.byCase,
|
|
1270
|
+
workflow: "eval",
|
|
1271
|
+
purpose: "attempt",
|
|
1272
|
+
});
|
|
1273
|
+
const dagJobs = await executeLocalDevelopmentDag({
|
|
1274
|
+
jobs: [baseline, ...attemptJobs],
|
|
1275
|
+
spec,
|
|
1276
|
+
adapterManifests,
|
|
1277
|
+
adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
|
|
1278
|
+
baseFiles: files,
|
|
1279
|
+
engineResolveFiles,
|
|
1280
|
+
engineCases,
|
|
1281
|
+
capacity: await localDevelopmentCapacity(workspace),
|
|
1282
|
+
});
|
|
1283
|
+
const materializationJobs = [
|
|
1284
|
+
...(evaluationWork?.priorAttemptJobs ?? []),
|
|
1285
|
+
...dagJobs,
|
|
1286
|
+
];
|
|
1287
|
+
const currentRunJobs = dagJobs.filter((job) => job.runId === runId);
|
|
1288
|
+
const currentRunCompletedJobCount = currentRunJobs.filter((job) => job.status === "succeeded").length;
|
|
1289
|
+
const currentRunFailedJobCount = currentRunJobs.filter((job) => job.status === "failed").length;
|
|
1290
|
+
const materialized = materializeWorkbenchRunResult({
|
|
1291
|
+
runId,
|
|
1292
|
+
benchmarkFingerprint,
|
|
1293
|
+
sourceYaml: projectSource.specSource,
|
|
1294
|
+
benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
|
|
1295
|
+
candidateFingerprint: existingCandidate?.candidateFingerprint ?? sourceCandidateFingerprint,
|
|
1296
|
+
...(!existingCandidate || existingCandidate.candidateFingerprint === sourceCandidateFingerprint
|
|
1297
|
+
? { candidateSourceFiles: authoredCandidateSourceFiles(projectSource) }
|
|
1298
|
+
: {}),
|
|
1299
|
+
startedAt,
|
|
1300
|
+
spec,
|
|
1301
|
+
jobs: materializationJobs,
|
|
1302
|
+
previousCandidate: existingCandidate ?? null,
|
|
1303
|
+
existingCandidateCount: snapshot.candidates.length,
|
|
1304
|
+
});
|
|
1305
|
+
for (const candidateRecord of materialized.candidates) {
|
|
1306
|
+
snapshot = upsertLocalCandidate(snapshot, candidateRecord, materialized.candidateFiles[candidateRecord.id] ?? []);
|
|
1307
|
+
}
|
|
1308
|
+
if (materialized.activeCandidateId) {
|
|
1309
|
+
snapshot = setLocalActive(snapshot, materialized.activeCandidateId);
|
|
1310
|
+
}
|
|
1311
|
+
for (const evaluation of materialized.evaluations) {
|
|
1312
|
+
snapshot = upsertLocalEvaluation(snapshot, evaluation);
|
|
1313
|
+
}
|
|
1314
|
+
const activeCandidateId = activeCandidateIdBeforeEval ?? materialized.activeCandidateId ?? null;
|
|
1315
|
+
const finishedAt = new Date().toISOString();
|
|
1316
|
+
if (activeCandidateId) {
|
|
1317
|
+
snapshot = setLocalActive(snapshot, activeCandidateId);
|
|
1318
|
+
}
|
|
1319
|
+
const runFinishedEvent = createLocalEvent("run_finished", finishedAt, {
|
|
1320
|
+
runId,
|
|
1321
|
+
candidateId: evaluatedCandidateId,
|
|
1322
|
+
detail: {
|
|
1323
|
+
outcome: currentRunFailedJobCount > 0 ? "error" : "ok",
|
|
1324
|
+
attemptsExecuted: 1,
|
|
1325
|
+
durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
|
|
1326
|
+
},
|
|
1327
|
+
});
|
|
1328
|
+
snapshot = upsertLocalRun(snapshot, {
|
|
1329
|
+
id: runId,
|
|
1330
|
+
workflow: "eval",
|
|
1331
|
+
benchmarkFingerprint,
|
|
1332
|
+
status: "finished",
|
|
1333
|
+
candidateId: evaluatedCandidateId,
|
|
1334
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
1335
|
+
candidateRunName: projectSource.spec.candidate.selectedRunName,
|
|
1336
|
+
startedAt,
|
|
1337
|
+
finishedAt,
|
|
1338
|
+
durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
|
|
1339
|
+
improver: "none",
|
|
1340
|
+
engineRun: spec.engineRun.use,
|
|
1341
|
+
strategy: "direct",
|
|
1342
|
+
budget: 1,
|
|
1343
|
+
repairBudget: 0,
|
|
1344
|
+
attemptsRequested: 1,
|
|
1345
|
+
attemptsExecuted: 1,
|
|
1346
|
+
samples,
|
|
1347
|
+
executionFingerprint,
|
|
1348
|
+
stoppedReason: "completed",
|
|
1349
|
+
outcome: currentRunFailedJobCount > 0 ? "error" : "ok",
|
|
1350
|
+
activeCandidateId,
|
|
1351
|
+
outputCandidateId: evaluatedCandidateId,
|
|
1352
|
+
}, [runFinishedEvent]);
|
|
1353
|
+
await saveLocalJobs(workspace, currentRunJobs);
|
|
1354
|
+
await saveLocalArchive(workspace, snapshot);
|
|
1355
|
+
const evaluation = materialized.evaluations[0] ?? null;
|
|
1356
|
+
const result = {
|
|
1357
|
+
ok: currentRunFailedJobCount === 0,
|
|
1358
|
+
runId,
|
|
1359
|
+
evaluation,
|
|
1360
|
+
evaluationId: evaluation?.id ?? null,
|
|
1361
|
+
candidateId: evaluatedCandidateId,
|
|
1362
|
+
activeCandidateId,
|
|
1363
|
+
completedJobCount: currentRunCompletedJobCount,
|
|
1364
|
+
failedJobCount: currentRunFailedJobCount,
|
|
1365
|
+
localView: localDevViewHint(workspace, runId),
|
|
1366
|
+
};
|
|
1367
|
+
writeOutput(result, parsed, io, ({ evaluationId, candidateId }) => `Evaluation ${evaluationId ?? runId} finished for candidate ${candidateId}.\nOpen local view: ${result.localView.command}\n${result.localView.note}`);
|
|
1368
|
+
return currentRunFailedJobCount === 0 ? 0 : 1;
|
|
1369
|
+
}
|
|
1370
|
+
catch (error) {
|
|
1371
|
+
await markLocalRunFailed({
|
|
1372
|
+
workspace,
|
|
1373
|
+
run: runningRun,
|
|
1374
|
+
startedAt,
|
|
1375
|
+
error,
|
|
1376
|
+
}).catch(() => undefined);
|
|
1377
|
+
throw error;
|
|
1378
|
+
}
|
|
1379
|
+
}
|
|
1380
|
+
async function resolveLocalEvaluationWork(workspace, snapshot, target) {
|
|
1381
|
+
const runById = new Map(snapshot.runs.map((run) => [run.id, run]));
|
|
1382
|
+
const matchingEvaluations = snapshot.evaluations.filter((evaluation) => {
|
|
1383
|
+
const run = runById.get(evaluation.runId);
|
|
1384
|
+
return evaluation.benchmarkFingerprint === target.benchmarkFingerprint &&
|
|
1385
|
+
evaluation.candidateId === target.candidateId &&
|
|
1386
|
+
evaluation.candidateFingerprint === target.candidateFingerprint &&
|
|
1387
|
+
evaluation.candidateRunId === target.candidateRunId &&
|
|
1388
|
+
run?.executionFingerprint === target.executionFingerprint;
|
|
1389
|
+
});
|
|
1390
|
+
const reusableEvaluation = matchingEvaluations
|
|
1391
|
+
.filter((evaluation) => evaluation.status === "completed" &&
|
|
1392
|
+
evaluation.errorSampleCount === 0 &&
|
|
1393
|
+
evaluation.completedSampleCount >= target.samples)
|
|
1394
|
+
.sort((left, right) => right.updatedAt.localeCompare(left.updatedAt) ||
|
|
1395
|
+
right.id.localeCompare(left.id))[0] ?? null;
|
|
1396
|
+
if (reusableEvaluation) {
|
|
1397
|
+
return {
|
|
1398
|
+
reusableEvaluation,
|
|
1399
|
+
missingPairs: [],
|
|
1400
|
+
priorAttemptJobs: [],
|
|
1401
|
+
};
|
|
1402
|
+
}
|
|
1403
|
+
const matchingRunIds = new Set(matchingEvaluations.map((evaluation) => evaluation.runId));
|
|
1404
|
+
if (matchingRunIds.size === 0) {
|
|
1405
|
+
return null;
|
|
1406
|
+
}
|
|
1407
|
+
const allPairs = allCaseSamplePairs(target.caseIds, target.samples);
|
|
1408
|
+
const desiredKeys = new Set(allPairs.map(caseSamplePairKey));
|
|
1409
|
+
const previousJobs = await readLocalJobs(workspace);
|
|
1410
|
+
const priorAttemptJobsByPair = latestCompletedAttemptJobsByPair(previousJobs.filter((job) => matchingRunIds.has(job.runId) &&
|
|
1411
|
+
job.candidateId === target.candidateId), desiredKeys);
|
|
1412
|
+
const missingPairs = allPairs.filter((pair) => !priorAttemptJobsByPair.has(caseSamplePairKey(pair)));
|
|
1413
|
+
if (missingPairs.length === allPairs.length) {
|
|
1414
|
+
return null;
|
|
1415
|
+
}
|
|
1416
|
+
return {
|
|
1417
|
+
reusableEvaluation: null,
|
|
1418
|
+
missingPairs,
|
|
1419
|
+
priorAttemptJobs: [...priorAttemptJobsByPair.values()],
|
|
918
1420
|
};
|
|
919
|
-
|
|
920
|
-
|
|
1421
|
+
}
|
|
1422
|
+
async function markLocalRunFailed(args) {
|
|
1423
|
+
const latest = await loadLocalArchive(args.workspace);
|
|
1424
|
+
const current = latest.runs.find((run) => run.id === args.run.id);
|
|
1425
|
+
if (current?.status === "finished") {
|
|
1426
|
+
return;
|
|
1427
|
+
}
|
|
1428
|
+
const finishedAt = new Date().toISOString();
|
|
1429
|
+
const message = errorMessage(args.error);
|
|
1430
|
+
const failedRun = {
|
|
1431
|
+
...args.run,
|
|
1432
|
+
status: "finished",
|
|
1433
|
+
finishedAt,
|
|
1434
|
+
durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(args.startedAt)),
|
|
1435
|
+
outcome: "error",
|
|
1436
|
+
error: message,
|
|
1437
|
+
};
|
|
1438
|
+
await saveLocalArchive(args.workspace, upsertLocalRun(latest, failedRun, [
|
|
1439
|
+
createLocalEvent("run_finished", finishedAt, {
|
|
1440
|
+
runId: args.run.id,
|
|
1441
|
+
candidateId: args.run.candidateId ?? undefined,
|
|
1442
|
+
detail: {
|
|
1443
|
+
outcome: "error",
|
|
1444
|
+
error: message,
|
|
1445
|
+
attemptsExecuted: failedRun.attemptsExecuted,
|
|
1446
|
+
durationMs: failedRun.durationMs ?? null,
|
|
1447
|
+
},
|
|
1448
|
+
}),
|
|
1449
|
+
]));
|
|
1450
|
+
}
|
|
1451
|
+
function errorMessage(error) {
|
|
1452
|
+
return error instanceof Error ? error.message : String(error);
|
|
1453
|
+
}
|
|
1454
|
+
function allCaseSamplePairs(caseIds, samples) {
|
|
1455
|
+
return caseIds.flatMap((caseId) => Array.from({ length: samples }, (_, sampleIndex) => ({
|
|
1456
|
+
caseId,
|
|
1457
|
+
sampleIndex,
|
|
1458
|
+
})));
|
|
1459
|
+
}
|
|
1460
|
+
function orderedCaseIdsForPairs(caseIds, pairs) {
|
|
1461
|
+
const selected = new Set(pairs.map((pair) => pair.caseId));
|
|
1462
|
+
return caseIds.filter((caseId) => selected.has(caseId));
|
|
1463
|
+
}
|
|
1464
|
+
function sampleIndexesByCase(pairs) {
|
|
1465
|
+
const byCase = new Map();
|
|
1466
|
+
for (const pair of pairs) {
|
|
1467
|
+
byCase.set(pair.caseId, [...(byCase.get(pair.caseId) ?? []), pair.sampleIndex]);
|
|
1468
|
+
}
|
|
1469
|
+
for (const [caseId, indexes] of byCase.entries()) {
|
|
1470
|
+
byCase.set(caseId, [...new Set(indexes)].sort((left, right) => left - right));
|
|
1471
|
+
}
|
|
1472
|
+
return byCase;
|
|
1473
|
+
}
|
|
1474
|
+
function latestCompletedAttemptJobsByPair(jobs, desiredKeys) {
|
|
1475
|
+
const byPair = new Map();
|
|
1476
|
+
for (const job of jobs) {
|
|
1477
|
+
if (job.status !== "succeeded" || executionPurposeFromJobInput(job.input) !== "attempt") {
|
|
1478
|
+
continue;
|
|
1479
|
+
}
|
|
1480
|
+
const pair = caseSamplePairFromJob(job);
|
|
1481
|
+
if (!pair) {
|
|
1482
|
+
continue;
|
|
1483
|
+
}
|
|
1484
|
+
const key = caseSamplePairKey(pair);
|
|
1485
|
+
if (!desiredKeys.has(key)) {
|
|
1486
|
+
continue;
|
|
1487
|
+
}
|
|
1488
|
+
const previous = byPair.get(key);
|
|
1489
|
+
if (!previous || compareJobRecency(job, previous) > 0) {
|
|
1490
|
+
byPair.set(key, job);
|
|
1491
|
+
}
|
|
1492
|
+
}
|
|
1493
|
+
return byPair;
|
|
1494
|
+
}
|
|
1495
|
+
function caseSamplePairFromJob(job) {
|
|
1496
|
+
const input = readRecord(job.input);
|
|
1497
|
+
const execution = readRecord(input?.execution);
|
|
1498
|
+
const metadata = readRecord(execution?.metadata);
|
|
1499
|
+
const caseId = stringValue(input?.caseId) ?? stringValue(metadata?.caseId);
|
|
1500
|
+
const sampleIndex = integerValue(input?.sampleIndex) ?? integerValue(metadata?.sampleIndex);
|
|
1501
|
+
return caseId && sampleIndex !== null
|
|
1502
|
+
? { caseId, sampleIndex }
|
|
1503
|
+
: null;
|
|
1504
|
+
}
|
|
1505
|
+
function executionPurposeFromJobInput(inputValue) {
|
|
1506
|
+
const input = readRecord(inputValue);
|
|
1507
|
+
const execution = readRecord(input?.execution);
|
|
1508
|
+
return stringValue(execution?.purpose);
|
|
1509
|
+
}
|
|
1510
|
+
function caseSamplePairKey(pair) {
|
|
1511
|
+
return `${pair.caseId}\0${pair.sampleIndex}`;
|
|
1512
|
+
}
|
|
1513
|
+
function compareJobRecency(left, right) {
|
|
1514
|
+
return jobRecencyTimestamp(left).localeCompare(jobRecencyTimestamp(right)) ||
|
|
1515
|
+
left.id.localeCompare(right.id);
|
|
1516
|
+
}
|
|
1517
|
+
function jobRecencyTimestamp(job) {
|
|
1518
|
+
return job.finishedAt ?? job.updatedAt ?? job.startedAt ?? job.createdAt ?? "";
|
|
1519
|
+
}
|
|
1520
|
+
function findReusableLocalImproveRun(runs, target) {
|
|
1521
|
+
return runs
|
|
1522
|
+
.filter((run) => run.workflow === "improve" &&
|
|
1523
|
+
run.benchmarkFingerprint === target.benchmarkFingerprint &&
|
|
1524
|
+
run.candidateId === target.candidateId &&
|
|
1525
|
+
run.candidateRunId === target.candidateRunId &&
|
|
1526
|
+
run.executionFingerprint === target.executionFingerprint &&
|
|
1527
|
+
run.budget === target.budget &&
|
|
1528
|
+
run.samples === target.samples &&
|
|
1529
|
+
run.status === "finished" &&
|
|
1530
|
+
run.outcome === "ok" &&
|
|
1531
|
+
Boolean(run.outputCandidateId))
|
|
1532
|
+
.sort((left, right) => (right.finishedAt ?? right.startedAt).localeCompare(left.finishedAt ?? left.startedAt) ||
|
|
1533
|
+
right.id.localeCompare(left.id))[0] ?? null;
|
|
921
1534
|
}
|
|
922
1535
|
function localDevViewHint(workspace, runId) {
|
|
923
1536
|
const runFlag = runId ? ` --run ${shellQuote(runId)}` : "";
|
|
@@ -935,20 +1548,26 @@ function localDevOpenUrl(baseUrl, snapshot, runId) {
|
|
|
935
1548
|
.reverse()
|
|
936
1549
|
.find((entry) => entry.runId === runId);
|
|
937
1550
|
if (!evaluation) {
|
|
938
|
-
return new URL("
|
|
1551
|
+
return new URL("candidates", baseUrl).toString();
|
|
939
1552
|
}
|
|
940
1553
|
const params = new URLSearchParams({ evaluation: evaluation.id });
|
|
941
|
-
return new URL(`
|
|
1554
|
+
return new URL(`candidates/${encodeURIComponent(evaluation.candidateId)}?${params.toString()}`, baseUrl).toString();
|
|
942
1555
|
}
|
|
943
1556
|
async function readLocalBenchmarkFingerprint(workspace) {
|
|
944
1557
|
return localBenchmarkFingerprint(await readLocalProjectSource(workspace));
|
|
945
1558
|
}
|
|
946
|
-
function
|
|
1559
|
+
function localRunExecutionFingerprint(projectSource) {
|
|
1560
|
+
return workbenchRunExecutionFingerprint({
|
|
1561
|
+
sourceYaml: projectSource.specSource,
|
|
1562
|
+
adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
|
|
1563
|
+
});
|
|
1564
|
+
}
|
|
1565
|
+
function authoredCandidateSourceFiles(projectSource) {
|
|
947
1566
|
return [{
|
|
948
|
-
path: path.relative(projectSource.dir, projectSource.
|
|
1567
|
+
path: path.relative(projectSource.dir, projectSource.candidateSpecPath).split(path.sep).join("/"),
|
|
949
1568
|
kind: "text",
|
|
950
1569
|
encoding: "utf8",
|
|
951
|
-
content: projectSource.
|
|
1570
|
+
content: projectSource.candidateSource,
|
|
952
1571
|
executable: false,
|
|
953
1572
|
}];
|
|
954
1573
|
}
|
|
@@ -1155,72 +1774,72 @@ function requireValidRunEnvelope(args) {
|
|
|
1155
1774
|
}
|
|
1156
1775
|
async function localRestore(argv, io) {
|
|
1157
1776
|
const parsed = parseArgs(argv);
|
|
1158
|
-
rejectUnknownFlags(parsed, new Set(["dir", "
|
|
1777
|
+
rejectUnknownFlags(parsed, new Set(["dir", "candidate", "dry-run", "yes", "json"]));
|
|
1159
1778
|
const workspace = resolveDir(parsed);
|
|
1160
1779
|
const spec = await readLocalSpecIfValid(workspace);
|
|
1161
1780
|
if (!spec) {
|
|
1162
1781
|
throw new UsageError("restore requires a valid Workbench project.");
|
|
1163
1782
|
}
|
|
1164
|
-
const
|
|
1783
|
+
const candidateRoot = spec.candidate.files.path;
|
|
1165
1784
|
const snapshot = await loadLocalArchive(workspace);
|
|
1166
|
-
const
|
|
1167
|
-
const files =
|
|
1785
|
+
const candidateId = readCandidateIdFlag(parsed, snapshot);
|
|
1786
|
+
const files = readLocalCandidateFiles(snapshot, candidateId);
|
|
1168
1787
|
if (parsed.flags["dry-run"] === true) {
|
|
1169
|
-
writeOutput({ ok: true,
|
|
1788
|
+
writeOutput({ ok: true, candidateId: candidateId, fileCount: files.length }, parsed, io, () => `Restore would write ${files.length} file(s) from ${candidateId}.`);
|
|
1170
1789
|
return 0;
|
|
1171
1790
|
}
|
|
1172
1791
|
if (parsed.flags.yes !== true) {
|
|
1173
1792
|
throw new UsageError("restore requires --dry-run to preview or --yes to apply source directory changes.");
|
|
1174
1793
|
}
|
|
1175
|
-
const changedPaths = await
|
|
1176
|
-
const next = setLocalActive(snapshot,
|
|
1794
|
+
const changedPaths = await materializeCandidateRoot(workspace, candidateRoot, files);
|
|
1795
|
+
const next = setLocalActive(snapshot, candidateId);
|
|
1177
1796
|
await saveLocalArchive(workspace, next);
|
|
1178
|
-
writeOutput({ ok: true,
|
|
1797
|
+
writeOutput({ ok: true, activeCandidateId: candidateId, changedPaths }, parsed, io, () => `Restored ${candidateId} to ${candidateRoot}.`);
|
|
1179
1798
|
return 0;
|
|
1180
1799
|
}
|
|
1181
|
-
async function
|
|
1800
|
+
async function localCandidateList(argv, io) {
|
|
1182
1801
|
const parsed = parseArgs(argv);
|
|
1183
1802
|
rejectUnknownFlags(parsed, new Set(["dir", "json"]));
|
|
1184
1803
|
const snapshot = await loadLocalArchive(resolveDir(parsed));
|
|
1185
|
-
writeOutput(snapshot.
|
|
1186
|
-
.map((
|
|
1187
|
-
.join("\n") || "No
|
|
1804
|
+
writeOutput(snapshot.candidates, parsed, io, (candidates) => candidates
|
|
1805
|
+
.map((candidate) => `${candidate.id}\t${candidate.status}\tevaluation ${formatCandidateEvaluationScore(candidate)}${snapshot.activeId === candidate.id ? "\tactive" : ""}`)
|
|
1806
|
+
.join("\n") || "No candidates.");
|
|
1188
1807
|
return 0;
|
|
1189
1808
|
}
|
|
1190
|
-
async function
|
|
1809
|
+
async function localCandidateShow(argv, io) {
|
|
1191
1810
|
const parsed = parseArgs(argv);
|
|
1192
|
-
rejectUnknownFlags(parsed, new Set(["dir", "
|
|
1811
|
+
rejectUnknownFlags(parsed, new Set(["dir", "candidate", "json"]));
|
|
1193
1812
|
const snapshot = await loadLocalArchive(resolveDir(parsed));
|
|
1194
|
-
const
|
|
1195
|
-
const
|
|
1196
|
-
writeOutput(
|
|
1813
|
+
const candidateId = readCandidateIdFlag(parsed, snapshot);
|
|
1814
|
+
const candidate = readLocalCandidate(snapshot, candidateId);
|
|
1815
|
+
writeOutput(candidate, parsed, io, (record) => [
|
|
1197
1816
|
`${record.id}\t${record.status}`,
|
|
1198
1817
|
`benchmark\t${record.benchmarkFingerprint}`,
|
|
1199
|
-
`
|
|
1200
|
-
`
|
|
1818
|
+
`candidate\t${record.candidateFingerprint ?? record.candidateFingerprint}`,
|
|
1819
|
+
`evaluation\t${formatCandidateEvaluationSummary(record)}`,
|
|
1201
1820
|
...(record.baseId ? [`base\t${record.baseId}`] : []),
|
|
1202
1821
|
].join("\n"));
|
|
1203
1822
|
return 0;
|
|
1204
1823
|
}
|
|
1205
|
-
async function
|
|
1824
|
+
async function localCandidateFiles(argv, io) {
|
|
1206
1825
|
const parsed = parseArgs(argv);
|
|
1207
|
-
rejectUnknownFlags(parsed, new Set(["dir", "
|
|
1826
|
+
rejectUnknownFlags(parsed, new Set(["dir", "candidate", "json"]));
|
|
1208
1827
|
const snapshot = await loadLocalArchive(resolveDir(parsed));
|
|
1209
|
-
const
|
|
1210
|
-
const
|
|
1211
|
-
const files =
|
|
1828
|
+
const candidateId = readCandidateIdFlag(parsed, snapshot);
|
|
1829
|
+
const candidate = readLocalCandidate(snapshot, candidateId);
|
|
1830
|
+
const files = summarizeCandidateFiles(readLocalCandidateFiles(snapshot, candidateId), candidate.fileChanges);
|
|
1212
1831
|
writeOutput(files, parsed, io, (records) => records
|
|
1213
1832
|
.map((file) => `${file.path}\t${file.status}\t${file.preview_kind}`)
|
|
1214
1833
|
.join("\n") || "No files.");
|
|
1215
1834
|
return 0;
|
|
1216
1835
|
}
|
|
1217
|
-
async function
|
|
1836
|
+
async function localCandidatePreview(argv, io) {
|
|
1218
1837
|
const parsed = parseArgs(argv);
|
|
1219
|
-
rejectUnknownFlags(parsed, new Set(["dir", "
|
|
1838
|
+
rejectUnknownFlags(parsed, new Set(["dir", "candidate", "path", "output", "view", "json"]));
|
|
1220
1839
|
const snapshot = await loadLocalArchive(resolveDir(parsed));
|
|
1221
|
-
const
|
|
1222
|
-
const preview =
|
|
1223
|
-
files:
|
|
1840
|
+
const candidateId = readCandidateIdFlag(parsed, snapshot);
|
|
1841
|
+
const preview = createCandidateFilePreview({
|
|
1842
|
+
files: readLocalCandidateFiles(snapshot, candidateId),
|
|
1224
1843
|
path: requireFlag(parsed, "path"),
|
|
1225
1844
|
view: readPreviewMode(parsed),
|
|
1226
1845
|
});
|
|
@@ -1755,7 +2374,7 @@ function createAdapterScaffoldFiles(id) {
|
|
|
1755
2374
|
"setup:",
|
|
1756
2375
|
" - npm install --global .",
|
|
1757
2376
|
"operations:",
|
|
1758
|
-
"
|
|
2377
|
+
" candidate.run: {}",
|
|
1759
2378
|
"",
|
|
1760
2379
|
].join("\n");
|
|
1761
2380
|
const packageJson = `${JSON.stringify({
|
|
@@ -1777,11 +2396,11 @@ const request = requestPath && fs.existsSync(requestPath)
|
|
|
1777
2396
|
? JSON.parse(fs.readFileSync(requestPath, "utf8"))
|
|
1778
2397
|
: {};
|
|
1779
2398
|
fs.mkdirSync(outputRoot, { recursive: true });
|
|
1780
|
-
const operation = request.operation || "
|
|
2399
|
+
const operation = request.operation || "candidate.run";
|
|
1781
2400
|
const resultPath = process.env.WORKBENCH_RESULT || request.paths?.result || path.join(outputRoot, "workbench-result.json");
|
|
1782
2401
|
|
|
1783
2402
|
let value;
|
|
1784
|
-
if (operation === "
|
|
2403
|
+
if (operation === "candidate.run") {
|
|
1785
2404
|
const task = request.context?.case?.prompt || "No case prompt was provided.";
|
|
1786
2405
|
fs.writeFileSync(path.join(outputRoot, "adapter-output.txt"), [
|
|
1787
2406
|
"adapter: ${id}",
|
|
@@ -1790,7 +2409,7 @@ if (operation === "subject.run") {
|
|
|
1790
2409
|
"",
|
|
1791
2410
|
].join("\\n"));
|
|
1792
2411
|
} else {
|
|
1793
|
-
console.error("${id} only implements
|
|
2412
|
+
console.error("${id} only implements candidate.run.");
|
|
1794
2413
|
process.exit(2);
|
|
1795
2414
|
}
|
|
1796
2415
|
|
|
@@ -2065,7 +2684,7 @@ async function resolveAdapterForAuthTarget(dir, targetRaw) {
|
|
|
2065
2684
|
const adapters = await resolveWorkbenchAdaptersForProject(dir, spec);
|
|
2066
2685
|
const adapter = adapters.find((entry) => entry.manifest.id === target.adapterId);
|
|
2067
2686
|
if (!adapter) {
|
|
2068
|
-
throw new UsageError(`Adapter ${target.adapterId} is not used by this benchmark source. Add it to the benchmark
|
|
2687
|
+
throw new UsageError(`Adapter ${target.adapterId} is not used by this benchmark source. Add it to the benchmark or candidate YAML before connecting auth.`);
|
|
2069
2688
|
}
|
|
2070
2689
|
if (!adapter.manifest.auth) {
|
|
2071
2690
|
throw new UsageError(`Adapter ${target.adapterId} does not declare auth.`);
|
|
@@ -2728,15 +3347,209 @@ async function starProject(argv, io, starred) {
|
|
|
2728
3347
|
});
|
|
2729
3348
|
return 0;
|
|
2730
3349
|
}
|
|
3350
|
+
async function retryHostedWorkflow(argv, io) {
|
|
3351
|
+
const parsed = parseArgs(argv);
|
|
3352
|
+
rejectUnknownFlags(parsed, new Set([
|
|
3353
|
+
"dir",
|
|
3354
|
+
"benchmark",
|
|
3355
|
+
"watch",
|
|
3356
|
+
"interval-ms",
|
|
3357
|
+
"timeout-ms",
|
|
3358
|
+
"json",
|
|
3359
|
+
]));
|
|
3360
|
+
rejectUnexpectedPositionals(parsed, "workbench cloud retry", 1);
|
|
3361
|
+
const targetId = parsed.positionals[0];
|
|
3362
|
+
if (!targetId) {
|
|
3363
|
+
throw new UsageError("Missing required TARGET_ID.");
|
|
3364
|
+
}
|
|
3365
|
+
if (parsed.flags.watch !== true && (parsed.flags["interval-ms"] !== undefined ||
|
|
3366
|
+
parsed.flags["timeout-ms"] !== undefined)) {
|
|
3367
|
+
throw new UsageError("--interval-ms and --timeout-ms require --watch.");
|
|
3368
|
+
}
|
|
3369
|
+
const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
|
|
3370
|
+
const retryTarget = await resolveHostedRetryTarget(target, targetId);
|
|
3371
|
+
const watchIntervalMs = parsed.flags.watch === true
|
|
3372
|
+
? parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms")
|
|
3373
|
+
: undefined;
|
|
3374
|
+
const watchTimeoutMs = parsed.flags.watch === true
|
|
3375
|
+
? parseOptionalPositiveInt(parsed.flags["timeout-ms"], "timeout-ms")
|
|
3376
|
+
: undefined;
|
|
3377
|
+
const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {
|
|
3378
|
+
method: "POST",
|
|
3379
|
+
body: retryTarget.request,
|
|
3380
|
+
}, target.baseUrl);
|
|
3381
|
+
const startedRun = withRunUrls(target, response.run);
|
|
3382
|
+
if (parsed.flags.watch === true) {
|
|
3383
|
+
if (parsed.flags.json !== true) {
|
|
3384
|
+
io.stdout.write(`${formatHostedRunStarted(startedRun, retryTarget.workflow).trimEnd()}\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
|
|
3385
|
+
}
|
|
3386
|
+
const watched = await watchHostedRun({
|
|
3387
|
+
parsed,
|
|
3388
|
+
target,
|
|
3389
|
+
runId: response.run.id,
|
|
3390
|
+
intervalMs: watchIntervalMs ?? 1000,
|
|
3391
|
+
timeoutMs: watchTimeoutMs,
|
|
3392
|
+
});
|
|
3393
|
+
const outputRun = withRunUrls(target, await withHostedRunFailureSummary(target, watched));
|
|
3394
|
+
const result = {
|
|
3395
|
+
ok: hostedRunSucceeded(watched),
|
|
3396
|
+
retried: {
|
|
3397
|
+
id: retryTarget.sourceId,
|
|
3398
|
+
kind: retryTarget.sourceKind,
|
|
3399
|
+
workflow: retryTarget.workflow,
|
|
3400
|
+
},
|
|
3401
|
+
runId: outputRun.id,
|
|
3402
|
+
candidateId: outputRun.outputCandidateId ?? outputRun.candidateId,
|
|
3403
|
+
activeCandidateId: outputRun.activeCandidateId ?? null,
|
|
3404
|
+
run: outputRun,
|
|
3405
|
+
...(outputRun.urls ? { urls: outputRun.urls } : {}),
|
|
3406
|
+
...(outputRun.failedJobCount !== undefined ? { failedJobCount: outputRun.failedJobCount } : {}),
|
|
3407
|
+
...(outputRun.error ? { error: outputRun.error } : {}),
|
|
3408
|
+
};
|
|
3409
|
+
writeOutput(result, parsed, io, formatRetryCommandResult);
|
|
3410
|
+
return hostedRunSucceeded(watched) ? 0 : 1;
|
|
3411
|
+
}
|
|
3412
|
+
const result = {
|
|
3413
|
+
ok: true,
|
|
3414
|
+
retried: {
|
|
3415
|
+
id: retryTarget.sourceId,
|
|
3416
|
+
kind: retryTarget.sourceKind,
|
|
3417
|
+
workflow: retryTarget.workflow,
|
|
3418
|
+
},
|
|
3419
|
+
runId: startedRun.id,
|
|
3420
|
+
candidateId: startedRun.outputCandidateId ?? startedRun.candidateId,
|
|
3421
|
+
activeCandidateId: startedRun.activeCandidateId ?? null,
|
|
3422
|
+
run: startedRun,
|
|
3423
|
+
...(startedRun.urls ? { urls: startedRun.urls } : {}),
|
|
3424
|
+
};
|
|
3425
|
+
writeOutput(result, parsed, io, formatRetryCommandResult);
|
|
3426
|
+
return 0;
|
|
3427
|
+
}
|
|
3428
|
+
async function resolveHostedRetryTarget(target, targetId) {
|
|
3429
|
+
if (targetId.startsWith("eval_")) {
|
|
3430
|
+
return await resolveHostedEvaluationRetryTarget(target, targetId);
|
|
3431
|
+
}
|
|
3432
|
+
const detail = await readHostedRunDetail(target, targetId);
|
|
3433
|
+
const run = detail.run;
|
|
3434
|
+
if (run.status !== "finished") {
|
|
3435
|
+
throw new UsageError(`Run ${run.id} is ${run.status}; wait for it to finish before retrying.`);
|
|
3436
|
+
}
|
|
3437
|
+
if (!hostedRunRecordFailed(run)) {
|
|
3438
|
+
throw new UsageError(`Run ${run.id} did not fail; use workbench cloud ${run.workflow ?? "eval"} to intentionally run it again.`);
|
|
3439
|
+
}
|
|
3440
|
+
if (run.workflow === "eval") {
|
|
3441
|
+
const candidateId = hostedRunEvaluationCandidateId(run, detail.jobs);
|
|
3442
|
+
if (!candidateId) {
|
|
3443
|
+
throw new UsageError(`Run ${run.id} has no candidate id to retry.`);
|
|
3444
|
+
}
|
|
3445
|
+
return {
|
|
3446
|
+
sourceId: targetId,
|
|
3447
|
+
sourceKind: "run",
|
|
3448
|
+
workflow: "eval",
|
|
3449
|
+
request: {
|
|
3450
|
+
workflow: "eval",
|
|
3451
|
+
samples: run.samples ?? 1,
|
|
3452
|
+
candidateId,
|
|
3453
|
+
sourceYaml: hostedRetrySourceYaml(run, run.id),
|
|
3454
|
+
preserveActive: true,
|
|
3455
|
+
...retrySampleSelectionFromJobs(detail.jobs),
|
|
3456
|
+
},
|
|
3457
|
+
};
|
|
3458
|
+
}
|
|
3459
|
+
if (run.workflow === "improve") {
|
|
3460
|
+
const baseCandidateId = stringValue(readRecord(run.input)?.baseCandidateId);
|
|
3461
|
+
if (!baseCandidateId) {
|
|
3462
|
+
throw new UsageError(`Run ${run.id} is missing its base candidate id.`);
|
|
3463
|
+
}
|
|
3464
|
+
return {
|
|
3465
|
+
sourceId: targetId,
|
|
3466
|
+
sourceKind: "run",
|
|
3467
|
+
workflow: "improve",
|
|
3468
|
+
request: {
|
|
3469
|
+
workflow: "improve",
|
|
3470
|
+
samples: run.samples ?? 1,
|
|
3471
|
+
budget: run.budget ?? run.attemptsRequested ?? 1,
|
|
3472
|
+
candidateId: baseCandidateId,
|
|
3473
|
+
sourceYaml: hostedRetrySourceYaml(run, run.id),
|
|
3474
|
+
preserveActive: true,
|
|
3475
|
+
},
|
|
3476
|
+
};
|
|
3477
|
+
}
|
|
3478
|
+
throw new UsageError(`Run ${run.id} has no retryable workflow.`);
|
|
3479
|
+
}
|
|
3480
|
+
async function resolveHostedEvaluationRetryTarget(target, evaluationId) {
|
|
3481
|
+
const snapshot = await apiRequest(projectApiPath(target.projectId, "/workbench/snapshot"), {}, target.baseUrl);
|
|
3482
|
+
const evaluation = snapshot.evaluations.find((entry) => entry.id === evaluationId);
|
|
3483
|
+
if (!evaluation) {
|
|
3484
|
+
throw new UsageError(`Hosted evaluation not found: ${evaluationId}`);
|
|
3485
|
+
}
|
|
3486
|
+
const run = snapshot.runs.find((entry) => entry.id === evaluation.runId) ?? null;
|
|
3487
|
+
if (!evaluationScorecardFailed(evaluation, run)) {
|
|
3488
|
+
throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench cloud eval to intentionally run it again.`);
|
|
3489
|
+
}
|
|
3490
|
+
if (!run) {
|
|
3491
|
+
throw new UsageError(`Evaluation ${evaluation.id} is missing its run record.`);
|
|
3492
|
+
}
|
|
3493
|
+
const detail = await readHostedRunDetail(target, run.id);
|
|
3494
|
+
const detailedRun = detail.run;
|
|
3495
|
+
return {
|
|
3496
|
+
sourceId: evaluationId,
|
|
3497
|
+
sourceKind: "evaluation",
|
|
3498
|
+
workflow: "eval",
|
|
3499
|
+
request: {
|
|
3500
|
+
workflow: "eval",
|
|
3501
|
+
samples: evaluation.sampleCount || detailedRun.samples || 1,
|
|
3502
|
+
candidateId: evaluation.candidateId,
|
|
3503
|
+
sourceYaml: hostedRetrySourceYaml(detailedRun, detailedRun.id),
|
|
3504
|
+
preserveActive: true,
|
|
3505
|
+
...retrySampleSelectionFromJobs(detail.jobs),
|
|
3506
|
+
},
|
|
3507
|
+
};
|
|
3508
|
+
}
|
|
3509
|
+
function retrySampleSelectionFromJobs(jobs) {
|
|
3510
|
+
const selectedSamples = uniqueCaseSamplePairs(jobs
|
|
3511
|
+
.filter((job) => job.status !== "succeeded" &&
|
|
3512
|
+
executionPurposeFromJobInput(job.input) === "attempt")
|
|
3513
|
+
.map(caseSamplePairFromJob)
|
|
3514
|
+
.filter((pair) => pair !== null));
|
|
3515
|
+
return selectedSamples.length > 0
|
|
3516
|
+
? { selectedSamples }
|
|
3517
|
+
: {};
|
|
3518
|
+
}
|
|
3519
|
+
function uniqueCaseSamplePairs(pairs) {
|
|
3520
|
+
const byKey = new Map();
|
|
3521
|
+
for (const pair of pairs) {
|
|
3522
|
+
byKey.set(caseSamplePairKey(pair), pair);
|
|
3523
|
+
}
|
|
3524
|
+
return [...byKey.values()].sort((left, right) => left.caseId.localeCompare(right.caseId) ||
|
|
3525
|
+
left.sampleIndex - right.sampleIndex);
|
|
3526
|
+
}
|
|
3527
|
+
async function readHostedRunDetail(target, runId) {
|
|
3528
|
+
return await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`), {}, target.baseUrl);
|
|
3529
|
+
}
|
|
3530
|
+
function hostedRetrySourceYaml(run, runId) {
|
|
3531
|
+
const sourceYaml = stringValue(readRecord(run.input)?.sourceYaml);
|
|
3532
|
+
if (!sourceYaml) {
|
|
3533
|
+
throw new UsageError(`Run ${runId} is missing its recorded source configuration.`);
|
|
3534
|
+
}
|
|
3535
|
+
return sourceYaml;
|
|
3536
|
+
}
|
|
3537
|
+
function hostedRunRecordFailed(run) {
|
|
3538
|
+
return run.outcome === "error" ||
|
|
3539
|
+
run.outcome === "cancelled" ||
|
|
3540
|
+
(run.failedJobCount ?? 0) > 0 ||
|
|
3541
|
+
Boolean(run.error);
|
|
3542
|
+
}
|
|
2731
3543
|
async function startHostedWorkflow(workflow, argv, io) {
|
|
2732
3544
|
const parsed = parseArgs(argv);
|
|
2733
3545
|
rejectUnknownFlags(parsed, new Set([
|
|
2734
3546
|
"dir",
|
|
2735
3547
|
"benchmark",
|
|
2736
3548
|
"base",
|
|
2737
|
-
"
|
|
3549
|
+
"runs",
|
|
2738
3550
|
"budget",
|
|
2739
3551
|
"samples",
|
|
3552
|
+
"rerun",
|
|
2740
3553
|
"watch",
|
|
2741
3554
|
"dry-run",
|
|
2742
3555
|
"interval-ms",
|
|
@@ -2746,42 +3559,69 @@ async function startHostedWorkflow(workflow, argv, io) {
|
|
|
2746
3559
|
if (parsed.positionals.length > 1) {
|
|
2747
3560
|
throw new UsageError(`workbench cloud ${workflow} accepts at most one source file or directory argument.`);
|
|
2748
3561
|
}
|
|
2749
|
-
const optimizerPath = asOptionalString(parsed.flags.optimizer);
|
|
2750
3562
|
const sourceArg = parsed.positionals[0] ?? asOptionalString(parsed.flags.dir) ?? process.cwd();
|
|
2751
3563
|
if (parsed.positionals.length > 0 && parsed.flags.dir !== undefined) {
|
|
2752
3564
|
throw new UsageError("Use either --dir or SOURCE, not both.");
|
|
2753
3565
|
}
|
|
2754
|
-
const
|
|
3566
|
+
const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
|
|
3567
|
+
const budget = workflow === "improve"
|
|
3568
|
+
? parsePositiveInt(parsed.flags.budget, 1, "budget")
|
|
3569
|
+
: undefined;
|
|
3570
|
+
if (parsed.flags.watch !== true && (parsed.flags["interval-ms"] !== undefined ||
|
|
3571
|
+
parsed.flags["timeout-ms"] !== undefined)) {
|
|
3572
|
+
throw new UsageError("--interval-ms and --timeout-ms require --watch.");
|
|
3573
|
+
}
|
|
3574
|
+
const runsFlag = asOptionalString(parsed.flags.runs);
|
|
3575
|
+
const defaultProjectSource = await readLocalProjectSource(path.resolve(sourceArg));
|
|
3576
|
+
const selectedRunIds = workflow === "eval"
|
|
3577
|
+
? resolveCandidateRunSelection(defaultProjectSource, runsFlag)
|
|
3578
|
+
: [singleRequestedRunId(runsFlag, `workbench cloud ${workflow}`) ?? defaultProjectSource.candidateRunId];
|
|
3579
|
+
if (workflow === "eval" && selectedRunIds.length > 1) {
|
|
3580
|
+
let failed = 0;
|
|
3581
|
+
const results = [];
|
|
3582
|
+
for (const runId of selectedRunIds) {
|
|
3583
|
+
const captured = createCapturingIo(io);
|
|
3584
|
+
const code = await startHostedWorkflow(workflow, hostedWorkflowArgsForRun({
|
|
3585
|
+
parsed,
|
|
3586
|
+
sourceDir: defaultProjectSource.dir,
|
|
3587
|
+
runId,
|
|
3588
|
+
}), captured.io);
|
|
3589
|
+
if (code !== 0) {
|
|
3590
|
+
failed += 1;
|
|
3591
|
+
}
|
|
3592
|
+
results.push(parseCapturedJson(captured.stdoutText()));
|
|
3593
|
+
}
|
|
3594
|
+
writeOutput({
|
|
3595
|
+
ok: failed === 0,
|
|
3596
|
+
candidateRunIds: selectedRunIds,
|
|
3597
|
+
failedRunCount: failed,
|
|
3598
|
+
results,
|
|
3599
|
+
}, parsed, io, () => `Processed ${selectedRunIds.length} hosted candidate run(s); ${failed} failed.`);
|
|
3600
|
+
return failed === 0 ? 0 : 1;
|
|
3601
|
+
}
|
|
3602
|
+
const baseCandidateId = asOptionalString(parsed.flags.base);
|
|
2755
3603
|
const request = workflow === "improve"
|
|
2756
3604
|
? {
|
|
2757
3605
|
workflow,
|
|
2758
|
-
budget
|
|
2759
|
-
samples
|
|
2760
|
-
...(
|
|
3606
|
+
budget,
|
|
3607
|
+
samples,
|
|
3608
|
+
...(baseCandidateId ? { candidateId: baseCandidateId } : {}),
|
|
2761
3609
|
}
|
|
2762
3610
|
: {
|
|
2763
3611
|
workflow,
|
|
2764
|
-
samples
|
|
2765
|
-
...(
|
|
3612
|
+
samples,
|
|
3613
|
+
...(baseCandidateId ? { candidateId: baseCandidateId } : {}),
|
|
2766
3614
|
};
|
|
2767
|
-
|
|
2768
|
-
|
|
2769
|
-
|
|
2770
|
-
|
|
2771
|
-
|
|
2772
|
-
|
|
2773
|
-
|
|
2774
|
-
const projectSource = await readLocalProjectSource(path.resolve(sourceArg), {
|
|
2775
|
-
optimizerPath,
|
|
2776
|
-
});
|
|
2777
|
-
if (workflow === "eval") {
|
|
2778
|
-
request.subjectSource = projectSource.subjectSource;
|
|
2779
|
-
request.subjectFiles = projectSource.subjectFiles;
|
|
2780
|
-
request.adapterFiles = projectSource.adapterFiles;
|
|
3615
|
+
const projectSource = selectedRunIds[0] === defaultProjectSource.candidateRunId
|
|
3616
|
+
? defaultProjectSource
|
|
3617
|
+
: await readLocalProjectSource(path.resolve(sourceArg), { runId: selectedRunIds[0] });
|
|
3618
|
+
request.sourceYaml = projectSource.specSource;
|
|
3619
|
+
request.adapterFiles = projectSource.adapterFiles;
|
|
3620
|
+
if (workflow === "eval" && !baseCandidateId) {
|
|
3621
|
+
request.candidateFiles = projectSource.candidateFiles;
|
|
2781
3622
|
}
|
|
2782
|
-
if (
|
|
2783
|
-
request.
|
|
2784
|
-
request.adapterFiles = projectSource.adapterFiles;
|
|
3623
|
+
if (parsed.flags.rerun === true) {
|
|
3624
|
+
request.rerun = true;
|
|
2785
3625
|
}
|
|
2786
3626
|
const watchIntervalMs = parsed.flags.watch === true
|
|
2787
3627
|
? parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms")
|
|
@@ -2808,11 +3648,13 @@ async function startHostedWorkflow(workflow, argv, io) {
|
|
|
2808
3648
|
sourceDir: projectSource.dir,
|
|
2809
3649
|
});
|
|
2810
3650
|
if (workflow === "improve") {
|
|
2811
|
-
request.
|
|
3651
|
+
request.candidateId = await ensureHostedImproveBaseCandidate({
|
|
2812
3652
|
parsed,
|
|
2813
3653
|
target,
|
|
2814
3654
|
samples: request.samples,
|
|
2815
|
-
|
|
3655
|
+
candidateId: baseCandidateId,
|
|
3656
|
+
sourceYaml: projectSource.specSource,
|
|
3657
|
+
adapterFiles: projectSource.adapterFiles,
|
|
2816
3658
|
intervalMs: watchIntervalMs ?? 1000,
|
|
2817
3659
|
timeoutMs: watchTimeoutMs,
|
|
2818
3660
|
});
|
|
@@ -2822,6 +3664,19 @@ async function startHostedWorkflow(workflow, argv, io) {
|
|
|
2822
3664
|
body: request,
|
|
2823
3665
|
}, target.baseUrl);
|
|
2824
3666
|
const startedRun = withRunUrls(target, response.run);
|
|
3667
|
+
const startedRunOutput = response.reused === true
|
|
3668
|
+
? { ...startedRun, reused: true }
|
|
3669
|
+
: startedRun;
|
|
3670
|
+
if (response.reused === true && response.run.status === "finished") {
|
|
3671
|
+
writeOutput({
|
|
3672
|
+
ok: hostedRunSucceeded(response.run),
|
|
3673
|
+
reused: true,
|
|
3674
|
+
workflow,
|
|
3675
|
+
runId: startedRun.id,
|
|
3676
|
+
...startedRun,
|
|
3677
|
+
}, parsed, io, () => `Reused hosted ${workflow} ${startedRun.id}. Use --rerun to intentionally run it again.`);
|
|
3678
|
+
return hostedRunSucceeded(response.run) ? 0 : 1;
|
|
3679
|
+
}
|
|
2825
3680
|
if (parsed.flags.watch === true) {
|
|
2826
3681
|
if (parsed.flags.json !== true) {
|
|
2827
3682
|
io.stdout.write(`${formatHostedRunStarted(startedRun, workflow).trimEnd()}\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
|
|
@@ -2837,23 +3692,23 @@ async function startHostedWorkflow(workflow, argv, io) {
|
|
|
2837
3692
|
writeOutput(withRunUrls(target, outputRun), parsed, io, formatHostedRunResult);
|
|
2838
3693
|
return hostedRunSucceeded(watched) ? 0 : 1;
|
|
2839
3694
|
}
|
|
2840
|
-
writeOutput(
|
|
3695
|
+
writeOutput(startedRunOutput, parsed, io, (run) => formatHostedRunStarted(run, workflow).trimEnd());
|
|
2841
3696
|
return 0;
|
|
2842
3697
|
}
|
|
2843
|
-
async function
|
|
2844
|
-
if (args.
|
|
2845
|
-
const
|
|
2846
|
-
if (!
|
|
2847
|
-
throw new UsageError(`Base
|
|
3698
|
+
async function ensureHostedImproveBaseCandidate(args) {
|
|
3699
|
+
if (args.candidateId) {
|
|
3700
|
+
const candidate = await readHostedCandidateSummary(args.target, args.candidateId);
|
|
3701
|
+
if (!candidate) {
|
|
3702
|
+
throw new UsageError(`Base candidate ${args.candidateId} was not found for the current benchmark.`);
|
|
2848
3703
|
}
|
|
2849
|
-
if (
|
|
2850
|
-
return args.
|
|
3704
|
+
if (hostedCandidateIsEvaluated(candidate)) {
|
|
3705
|
+
return args.candidateId;
|
|
2851
3706
|
}
|
|
2852
3707
|
}
|
|
2853
3708
|
else {
|
|
2854
|
-
const
|
|
2855
|
-
if (
|
|
2856
|
-
return
|
|
3709
|
+
const activeCandidate = await readEvaluatedActiveHostedCandidate(args.target);
|
|
3710
|
+
if (activeCandidate) {
|
|
3711
|
+
return activeCandidate.id;
|
|
2857
3712
|
}
|
|
2858
3713
|
}
|
|
2859
3714
|
const response = await apiRequest(projectApiPath(args.target.projectId, "/runs"), {
|
|
@@ -2861,7 +3716,9 @@ async function ensureHostedImproveBaseSubject(args) {
|
|
|
2861
3716
|
body: {
|
|
2862
3717
|
workflow: "eval",
|
|
2863
3718
|
samples: args.samples,
|
|
2864
|
-
...(args.
|
|
3719
|
+
...(args.candidateId ? { candidateId: args.candidateId } : {}),
|
|
3720
|
+
sourceYaml: args.sourceYaml,
|
|
3721
|
+
...(args.adapterFiles.length > 0 ? { adapterFiles: args.adapterFiles } : {}),
|
|
2865
3722
|
},
|
|
2866
3723
|
}, args.target.baseUrl);
|
|
2867
3724
|
const watched = await watchHostedRun({
|
|
@@ -2872,28 +3729,52 @@ async function ensureHostedImproveBaseSubject(args) {
|
|
|
2872
3729
|
timeoutMs: args.timeoutMs,
|
|
2873
3730
|
});
|
|
2874
3731
|
if (!hostedRunSucceeded(watched)) {
|
|
2875
|
-
throw new UsageError(`Parent
|
|
3732
|
+
throw new UsageError(`Parent candidate eval ${watched.id} failed; improve was not started.`);
|
|
2876
3733
|
}
|
|
2877
|
-
if (!watched.
|
|
2878
|
-
throw new UsageError(`Parent
|
|
3734
|
+
if (!watched.candidateId) {
|
|
3735
|
+
throw new UsageError(`Parent candidate eval ${watched.id} did not produce a candidate.`);
|
|
2879
3736
|
}
|
|
2880
|
-
return watched.
|
|
3737
|
+
return watched.candidateId;
|
|
2881
3738
|
}
|
|
2882
|
-
|
|
2883
|
-
const
|
|
2884
|
-
|
|
3739
|
+
function hostedWorkflowArgsForRun(args) {
|
|
3740
|
+
const next = ["--dir", args.sourceDir, "--runs", args.runId, "--json"];
|
|
3741
|
+
appendStringFlag(next, "benchmark", asOptionalString(args.parsed.flags.benchmark));
|
|
3742
|
+
appendStringFlag(next, "base", asOptionalString(args.parsed.flags.base));
|
|
3743
|
+
appendStringFlag(next, "samples", asOptionalString(args.parsed.flags.samples));
|
|
3744
|
+
appendStringFlag(next, "budget", asOptionalString(args.parsed.flags.budget));
|
|
3745
|
+
appendStringFlag(next, "interval-ms", asOptionalString(args.parsed.flags["interval-ms"]));
|
|
3746
|
+
appendStringFlag(next, "timeout-ms", asOptionalString(args.parsed.flags["timeout-ms"]));
|
|
3747
|
+
if (args.parsed.flags.watch === true) {
|
|
3748
|
+
next.push("--watch");
|
|
3749
|
+
}
|
|
3750
|
+
if (args.parsed.flags["dry-run"] === true) {
|
|
3751
|
+
next.push("--dry-run");
|
|
3752
|
+
}
|
|
3753
|
+
if (args.parsed.flags.rerun === true) {
|
|
3754
|
+
next.push("--rerun");
|
|
3755
|
+
}
|
|
3756
|
+
return next;
|
|
3757
|
+
}
|
|
3758
|
+
function appendStringFlag(args, name, value) {
|
|
3759
|
+
if (value !== undefined) {
|
|
3760
|
+
args.push(`--${name}`, value);
|
|
3761
|
+
}
|
|
3762
|
+
}
|
|
3763
|
+
async function readHostedCandidateSummary(target, candidateId) {
|
|
3764
|
+
const response = await apiRequest(projectApiPath(target.projectId, "/candidates"), {}, target.baseUrl);
|
|
3765
|
+
return response.candidates.find((entry) => entry.id === candidateId) ?? null;
|
|
2885
3766
|
}
|
|
2886
|
-
async function
|
|
3767
|
+
async function readEvaluatedActiveHostedCandidate(target) {
|
|
2887
3768
|
const response = await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl);
|
|
2888
|
-
const
|
|
2889
|
-
if (!
|
|
3769
|
+
const activeCandidateId = response.benchmark.activeCandidateId;
|
|
3770
|
+
if (!activeCandidateId) {
|
|
2890
3771
|
return null;
|
|
2891
3772
|
}
|
|
2892
|
-
const
|
|
2893
|
-
return
|
|
3773
|
+
const candidate = await readHostedCandidateSummary(target, activeCandidateId);
|
|
3774
|
+
return candidate && hostedCandidateIsEvaluated(candidate) ? candidate : null;
|
|
2894
3775
|
}
|
|
2895
|
-
function
|
|
2896
|
-
return
|
|
3776
|
+
function hostedCandidateIsEvaluated(candidate) {
|
|
3777
|
+
return candidate.status === "evaluated" || candidate.eval != null;
|
|
2897
3778
|
}
|
|
2898
3779
|
async function benchmarkList(argv, io) {
|
|
2899
3780
|
const parsed = parseArgs(argv);
|
|
@@ -2905,7 +3786,7 @@ async function benchmarkList(argv, io) {
|
|
|
2905
3786
|
return "No hosted Workbench benchmarks.";
|
|
2906
3787
|
}
|
|
2907
3788
|
return projects
|
|
2908
|
-
.map((project) => `${project.id}\t${project.name}\t${project.runCount} runs\t${project.
|
|
3789
|
+
.map((project) => `${project.id}\t${project.name}\t${project.runCount} runs\t${project.candidateCount} candidates`)
|
|
2909
3790
|
.join("\n");
|
|
2910
3791
|
});
|
|
2911
3792
|
return 0;
|
|
@@ -2924,7 +3805,7 @@ async function benchmarkShow(argv, io) {
|
|
|
2924
3805
|
const response = await apiRequest(benchmarkApiPath(projectRef), {}, await effectiveBaseUrl(origin?.baseUrl));
|
|
2925
3806
|
writeOutput(response.benchmark, parsed, io, (project) => {
|
|
2926
3807
|
const record = project;
|
|
2927
|
-
return `${record.name} (${record.id})\n${record.runs.length} runs\n${record.
|
|
3808
|
+
return `${record.name} (${record.id})\n${record.runs.length} runs\n${record.candidates.length} candidates`;
|
|
2928
3809
|
});
|
|
2929
3810
|
return 0;
|
|
2930
3811
|
}
|
|
@@ -3012,61 +3893,61 @@ async function benchmarkStarred(argv, io) {
|
|
|
3012
3893
|
});
|
|
3013
3894
|
return 0;
|
|
3014
3895
|
}
|
|
3015
|
-
async function
|
|
3896
|
+
async function candidateList(argv, io) {
|
|
3016
3897
|
const parsed = parseArgs(argv);
|
|
3017
3898
|
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
|
|
3018
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud
|
|
3899
|
+
rejectUnexpectedPositionals(parsed, "workbench cloud candidates list", 0);
|
|
3019
3900
|
const target = await resolveHostedTarget(parsed);
|
|
3020
|
-
const response = await apiRequest(projectApiPath(target.projectId, "/
|
|
3021
|
-
writeOutput(response.
|
|
3022
|
-
if (
|
|
3023
|
-
return "No
|
|
3901
|
+
const response = await apiRequest(projectApiPath(target.projectId, "/candidates"), {}, target.baseUrl);
|
|
3902
|
+
writeOutput(response.candidates, parsed, io, (candidates) => {
|
|
3903
|
+
if (candidates.length === 0) {
|
|
3904
|
+
return "No candidates yet.";
|
|
3024
3905
|
}
|
|
3025
|
-
return
|
|
3026
|
-
.map((
|
|
3906
|
+
return candidates
|
|
3907
|
+
.map((candidate) => `${candidate.id}\t${candidate.status}\t${candidate.fileChanges?.length ?? 0} files`)
|
|
3027
3908
|
.join("\n");
|
|
3028
3909
|
});
|
|
3029
3910
|
return 0;
|
|
3030
3911
|
}
|
|
3031
|
-
async function
|
|
3912
|
+
async function candidateShow(argv, io) {
|
|
3032
3913
|
const parsed = parseArgs(argv);
|
|
3033
3914
|
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
|
|
3034
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud
|
|
3915
|
+
rejectUnexpectedPositionals(parsed, "workbench cloud candidates show", 1);
|
|
3035
3916
|
const target = await resolveHostedTarget(parsed);
|
|
3036
|
-
const
|
|
3037
|
-
const params = new URLSearchParams({ id:
|
|
3038
|
-
const
|
|
3039
|
-
writeOutput(
|
|
3917
|
+
const candidateId = readRequiredCandidateId(parsed);
|
|
3918
|
+
const params = new URLSearchParams({ id: candidateId });
|
|
3919
|
+
const candidate = await apiRequest(projectApiPath(target.projectId, `/workbench/record?${params.toString()}`), {}, target.baseUrl);
|
|
3920
|
+
writeOutput(candidate, parsed, io, (record) => {
|
|
3040
3921
|
const value = record;
|
|
3041
3922
|
return [
|
|
3042
|
-
`${value.id ??
|
|
3923
|
+
`${value.id ?? candidateId}\t${value.status ?? "unknown"}`,
|
|
3043
3924
|
...(value.benchmarkFingerprint ? [`Benchmark version: ${shortDigest(value.benchmarkFingerprint)}`] : []),
|
|
3044
|
-
...(value.
|
|
3925
|
+
...(value.candidateFingerprint ? [`Candidate digest: ${shortDigest(value.candidateFingerprint)}`] : []),
|
|
3045
3926
|
].join("\n");
|
|
3046
3927
|
});
|
|
3047
3928
|
return 0;
|
|
3048
3929
|
}
|
|
3049
|
-
async function
|
|
3930
|
+
async function candidateFiles(argv, io) {
|
|
3050
3931
|
const parsed = parseArgs(argv);
|
|
3051
3932
|
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
|
|
3052
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud
|
|
3933
|
+
rejectUnexpectedPositionals(parsed, "workbench cloud candidates files", 1);
|
|
3053
3934
|
const target = await resolveHostedTarget(parsed);
|
|
3054
|
-
const
|
|
3055
|
-
const response = await apiRequest(projectApiPath(target.projectId, `/
|
|
3935
|
+
const candidateId = readRequiredCandidateId(parsed);
|
|
3936
|
+
const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/files`), {}, target.baseUrl);
|
|
3056
3937
|
writeOutput(response.files, parsed, io, (files) => files
|
|
3057
3938
|
.map((file) => `${file.path}\t${file.status}\t${file.preview_kind}`)
|
|
3058
3939
|
.join("\n") || "No files.");
|
|
3059
3940
|
return 0;
|
|
3060
3941
|
}
|
|
3061
|
-
async function
|
|
3942
|
+
async function candidatePreview(argv, io) {
|
|
3062
3943
|
const parsed = parseArgs(argv);
|
|
3063
3944
|
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "path", "output", "json"]));
|
|
3064
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud
|
|
3945
|
+
rejectUnexpectedPositionals(parsed, "workbench cloud candidates preview", 1);
|
|
3065
3946
|
const target = await resolveHostedTarget(parsed);
|
|
3066
|
-
const
|
|
3947
|
+
const candidateId = readRequiredCandidateId(parsed);
|
|
3067
3948
|
const filePath = requireFlag(parsed, "path");
|
|
3068
3949
|
const params = new URLSearchParams({ path: filePath });
|
|
3069
|
-
const response = await apiRequest(projectApiPath(target.projectId, `/
|
|
3950
|
+
const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/files?${params.toString()}`), {}, target.baseUrl);
|
|
3070
3951
|
const content = response.preview.source?.content ??
|
|
3071
3952
|
response.preview.rendered_html ??
|
|
3072
3953
|
response.preview.diff ??
|
|
@@ -3084,14 +3965,14 @@ async function subjectPreview(argv, io) {
|
|
|
3084
3965
|
}
|
|
3085
3966
|
return 0;
|
|
3086
3967
|
}
|
|
3087
|
-
async function
|
|
3968
|
+
async function candidateExport(argv, io) {
|
|
3088
3969
|
const parsed = parseArgs(argv);
|
|
3089
3970
|
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "out", "json"]));
|
|
3090
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud
|
|
3971
|
+
rejectUnexpectedPositionals(parsed, "workbench cloud candidates pull", 1);
|
|
3091
3972
|
const target = await resolveHostedTarget(parsed);
|
|
3092
|
-
const
|
|
3973
|
+
const candidateId = readRequiredCandidateId(parsed);
|
|
3093
3974
|
const outputDir = requireOutDir(parsed);
|
|
3094
|
-
const response = await apiRequest(projectApiPath(target.projectId, `/
|
|
3975
|
+
const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/export`), {}, target.baseUrl);
|
|
3095
3976
|
await writeFiles(outputDir, response.files);
|
|
3096
3977
|
writeOutput({ ok: true, outputDir, files: response.files.length }, parsed, io, (result) => {
|
|
3097
3978
|
const record = result;
|
|
@@ -3099,14 +3980,14 @@ async function subjectExport(argv, io) {
|
|
|
3099
3980
|
});
|
|
3100
3981
|
return 0;
|
|
3101
3982
|
}
|
|
3102
|
-
async function
|
|
3983
|
+
async function candidateVisibility(argv, io, visibility) {
|
|
3103
3984
|
const parsed = parseArgs(argv);
|
|
3104
3985
|
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
|
|
3105
|
-
rejectUnexpectedPositionals(parsed, `workbench cloud
|
|
3986
|
+
rejectUnexpectedPositionals(parsed, `workbench cloud candidates ${visibility === "public" ? "publish" : "unpublish"}`, 1);
|
|
3106
3987
|
const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
|
|
3107
|
-
const
|
|
3108
|
-
const response = await apiRequest(projectApiPath(target.projectId, `/
|
|
3109
|
-
writeOutput({ ok: true, visibility,
|
|
3988
|
+
const candidateId = readRequiredCandidateId(parsed);
|
|
3989
|
+
const response = await apiRequest(projectApiPath(target.projectId, `/candidates/${encodeURIComponent(candidateId)}/publish`), { method: visibility === "public" ? "PUT" : "DELETE" }, target.baseUrl);
|
|
3990
|
+
writeOutput({ ok: true, visibility, candidate: response.candidate }, parsed, io, () => `${visibility === "public" ? "Published" : "Unpublished"} candidate ${candidateId}.`);
|
|
3110
3991
|
return 0;
|
|
3111
3992
|
}
|
|
3112
3993
|
async function runList(argv, io) {
|
|
@@ -3116,7 +3997,7 @@ async function runList(argv, io) {
|
|
|
3116
3997
|
const target = await resolveHostedTarget(parsed);
|
|
3117
3998
|
const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {}, target.baseUrl);
|
|
3118
3999
|
writeOutput(response.runs, parsed, io, (runs) => runs
|
|
3119
|
-
.map((run) => `${run.id}\t${run.status}\t${run.
|
|
4000
|
+
.map((run) => `${run.id}\t${run.status}\t${run.candidateId ?? "pending"}`)
|
|
3120
4001
|
.join("\n") || "No runs.");
|
|
3121
4002
|
return 0;
|
|
3122
4003
|
}
|
|
@@ -3191,7 +4072,7 @@ async function runLogs(argv, io) {
|
|
|
3191
4072
|
function formatRunLogs(record) {
|
|
3192
4073
|
const value = record;
|
|
3193
4074
|
return (value.jobs
|
|
3194
|
-
.map((job) => `${job.id}\t${job.kind}\t${job.status}\t${job.
|
|
4075
|
+
.map((job) => `${job.id}\t${job.kind}\t${job.status}\t${job.candidateId ?? "-"}${job.error ? `\t${job.error}` : ""}`)
|
|
3195
4076
|
.join("\n") || `No jobs for ${value.runId}.`);
|
|
3196
4077
|
}
|
|
3197
4078
|
async function openWorkbench(argv, io) {
|
|
@@ -3226,7 +4107,7 @@ function buildWorkbenchWebUrl(target, ref) {
|
|
|
3226
4107
|
if (ref.startsWith("run_")) {
|
|
3227
4108
|
return benchmarkUrl;
|
|
3228
4109
|
}
|
|
3229
|
-
return buildWorkbenchResourceUrls(target, {
|
|
4110
|
+
return buildWorkbenchResourceUrls(target, { candidateId: ref }).candidateEvaluation;
|
|
3230
4111
|
}
|
|
3231
4112
|
async function resolveHostedTarget(parsed, options = {}) {
|
|
3232
4113
|
if (options.sourceArg !== undefined && parsed.flags.dir !== undefined) {
|
|
@@ -3313,7 +4194,7 @@ async function resolveOpenTarget(parsed) {
|
|
|
3313
4194
|
const ref = parsed.positionals[0];
|
|
3314
4195
|
if (ref &&
|
|
3315
4196
|
!ref.startsWith("run_") &&
|
|
3316
|
-
!ref.startsWith("
|
|
4197
|
+
!ref.startsWith("candidate_")) {
|
|
3317
4198
|
const baseUrl = await effectiveBaseUrl();
|
|
3318
4199
|
if (ref.includes("/")) {
|
|
3319
4200
|
const parsedRef = parseBenchmarkRef(ref);
|
|
@@ -3347,13 +4228,13 @@ function buildWorkbenchResourceUrls(target, refs = {}) {
|
|
|
3347
4228
|
const projectRef = `${encodeURIComponent(target.owner)}/${encodeURIComponent(target.projectName)}`;
|
|
3348
4229
|
const benchmark = `${target.baseUrl}/benchmarks/${projectRef}`;
|
|
3349
4230
|
const urls = { benchmark };
|
|
3350
|
-
if (refs.
|
|
4231
|
+
if (refs.candidateId) {
|
|
3351
4232
|
const evaluationId = refs.runId
|
|
3352
|
-
? evaluationScorecardId(refs.runId, refs.
|
|
4233
|
+
? evaluationScorecardId(refs.runId, refs.candidateId)
|
|
3353
4234
|
: null;
|
|
3354
|
-
urls.
|
|
3355
|
-
? `${benchmark}/
|
|
3356
|
-
: `${benchmark}/
|
|
4235
|
+
urls.candidateEvaluation = evaluationId
|
|
4236
|
+
? `${benchmark}/candidates/${encodeURIComponent(refs.candidateId)}?evaluation=${encodeURIComponent(evaluationId)}`
|
|
4237
|
+
: `${benchmark}/candidates/${encodeURIComponent(refs.candidateId)}`;
|
|
3357
4238
|
}
|
|
3358
4239
|
return urls;
|
|
3359
4240
|
}
|
|
@@ -3423,15 +4304,15 @@ function withRunUrls(target, run) {
|
|
|
3423
4304
|
...run,
|
|
3424
4305
|
urls: buildWorkbenchResourceUrls(target, {
|
|
3425
4306
|
runId: run.id,
|
|
3426
|
-
|
|
4307
|
+
candidateId: run.outputCandidateId ?? run.candidateId,
|
|
3427
4308
|
}),
|
|
3428
4309
|
};
|
|
3429
4310
|
}
|
|
3430
4311
|
function withRunDetailUrls(target, detail) {
|
|
3431
|
-
const
|
|
4312
|
+
const candidateId = hostedRunEvaluationCandidateId(detail.run, detail.jobs);
|
|
3432
4313
|
const run = withRunUrls(target, {
|
|
3433
4314
|
...detail.run,
|
|
3434
|
-
|
|
4315
|
+
outputCandidateId: detail.run.outputCandidateId ?? candidateId,
|
|
3435
4316
|
});
|
|
3436
4317
|
return {
|
|
3437
4318
|
run,
|
|
@@ -3439,15 +4320,15 @@ function withRunDetailUrls(target, detail) {
|
|
|
3439
4320
|
urls: run.urls ?? buildWorkbenchResourceUrls(target, { runId: run.id }),
|
|
3440
4321
|
};
|
|
3441
4322
|
}
|
|
3442
|
-
function
|
|
3443
|
-
if (run.
|
|
3444
|
-
return run.
|
|
4323
|
+
function hostedRunEvaluationCandidateId(run, jobs = []) {
|
|
4324
|
+
if (run.outputCandidateId) {
|
|
4325
|
+
return run.outputCandidateId;
|
|
3445
4326
|
}
|
|
3446
|
-
const
|
|
4327
|
+
const attemptCandidates = jobs
|
|
3447
4328
|
.filter((job) => readRunJobPurpose(job) === "attempt")
|
|
3448
|
-
.map((job) => job.
|
|
3449
|
-
.filter((
|
|
3450
|
-
return
|
|
4329
|
+
.map((job) => job.candidateId)
|
|
4330
|
+
.filter((candidateId) => Boolean(candidateId));
|
|
4331
|
+
return attemptCandidates.at(-1) ?? run.candidateId ?? null;
|
|
3451
4332
|
}
|
|
3452
4333
|
function sourceFileCount(source) {
|
|
3453
4334
|
return source.sourceFiles.length;
|
|
@@ -3456,7 +4337,7 @@ function hostedProjectSourceRequest(source) {
|
|
|
3456
4337
|
const { network, resources } = hostedEnvironmentOptions(source);
|
|
3457
4338
|
return {
|
|
3458
4339
|
source: source.specSource,
|
|
3459
|
-
|
|
4340
|
+
candidateFiles: source.candidateFiles,
|
|
3460
4341
|
engineResolveFiles: hostedEngineResolveFiles(source),
|
|
3461
4342
|
engineResolveBinding: engineResolveBindingForSpec(source.spec),
|
|
3462
4343
|
adapterFiles: source.adapterFiles,
|
|
@@ -3539,24 +4420,45 @@ async function watchHostedRun(args) {
|
|
|
3539
4420
|
}
|
|
3540
4421
|
}
|
|
3541
4422
|
function formatHostedRunResult(run) {
|
|
3542
|
-
const
|
|
3543
|
-
const activeDetail = run.
|
|
3544
|
-
? `; active ${run.
|
|
4423
|
+
const candidateId = run.outputCandidateId ?? run.candidateId;
|
|
4424
|
+
const activeDetail = run.activeCandidateId && candidateId && run.activeCandidateId !== candidateId
|
|
4425
|
+
? `; active ${run.activeCandidateId}`
|
|
3545
4426
|
: "";
|
|
3546
|
-
const summary = `Run ${run.id} reached ${run.status}; ${run.outcome ? `outcome ${run.outcome}; ` : ""}
|
|
4427
|
+
const summary = `Run ${run.id} reached ${run.status}; ${run.outcome ? `outcome ${run.outcome}; ` : ""}candidate ${candidateId ?? "pending"}${activeDetail}; ${run.completedJobCount ?? 0}/${run.jobCount ?? 0} jobs completed.`;
|
|
3547
4428
|
return [
|
|
3548
4429
|
run.error ? `${summary}\nError: ${run.error}` : summary,
|
|
3549
|
-
...(run.urls?.
|
|
3550
|
-
? [`Open evaluation: ${run.urls.
|
|
4430
|
+
...(run.urls?.candidateEvaluation
|
|
4431
|
+
? [`Open evaluation: ${run.urls.candidateEvaluation}`]
|
|
3551
4432
|
: [`Open benchmark: ${run.urls?.benchmark ?? ""}`].filter(Boolean)),
|
|
3552
4433
|
].join("\n");
|
|
3553
4434
|
}
|
|
4435
|
+
function formatRetryCommandResult(result) {
|
|
4436
|
+
const run = result.run;
|
|
4437
|
+
const runId = run?.id ?? result.runId ?? "unknown";
|
|
4438
|
+
const scope = `${result.retried.kind} ${result.retried.id}`;
|
|
4439
|
+
const verb = run
|
|
4440
|
+
? run.status === "finished" ? "finished as hosted run" : "started as hosted run"
|
|
4441
|
+
: "finished as local run";
|
|
4442
|
+
return [
|
|
4443
|
+
`Retry of ${scope} ${verb} ${runId}.`,
|
|
4444
|
+
...(result.evaluationId ? [`Evaluation: ${result.evaluationId}`] : []),
|
|
4445
|
+
...(result.candidateId ? [`Candidate: ${result.candidateId}`] : []),
|
|
4446
|
+
...(result.failedJobCount ? [`Failed jobs: ${result.failedJobCount}`] : []),
|
|
4447
|
+
...(result.error ? [`Error: ${result.error}`] : []),
|
|
4448
|
+
...(result.localView
|
|
4449
|
+
? [`Open local view: ${result.localView.command}`, result.localView.note]
|
|
4450
|
+
: []),
|
|
4451
|
+
...(result.urls?.candidateEvaluation
|
|
4452
|
+
? [`Open evaluation: ${result.urls.candidateEvaluation}`]
|
|
4453
|
+
: result.urls?.benchmark ? [`Open benchmark: ${result.urls.benchmark}`] : []),
|
|
4454
|
+
].join("\n");
|
|
4455
|
+
}
|
|
3554
4456
|
function formatHostedRunStarted(run, fallbackWorkflow) {
|
|
3555
|
-
const
|
|
4457
|
+
const candidateId = run.outputCandidateId ?? run.candidateId;
|
|
3556
4458
|
return [
|
|
3557
|
-
`Started ${run.workflow ?? fallbackWorkflow} run ${run.id}; ${
|
|
3558
|
-
...(run.urls?.
|
|
3559
|
-
? [`Open evaluation: ${run.urls.
|
|
4459
|
+
`Started ${run.workflow ?? fallbackWorkflow} run ${run.id}; ${candidateId ? `candidate ${candidateId}` : `${run.jobCount ?? 0} jobs queued`}.`,
|
|
4460
|
+
...(run.urls?.candidateEvaluation
|
|
4461
|
+
? [`Open evaluation: ${run.urls.candidateEvaluation}`]
|
|
3560
4462
|
: run.urls?.benchmark ? [`Open benchmark: ${run.urls.benchmark}`] : []),
|
|
3561
4463
|
"",
|
|
3562
4464
|
].join("\n");
|
|
@@ -3566,13 +4468,13 @@ function formatRunDetail(record) {
|
|
|
3566
4468
|
const { run, jobs, urls } = detail;
|
|
3567
4469
|
const cost = sumJobCostUsd(jobs);
|
|
3568
4470
|
const firstFailedJob = jobs.find((job) => job.status === "failed" && job.error);
|
|
3569
|
-
const
|
|
4471
|
+
const candidateId = hostedRunEvaluationCandidateId(run, jobs);
|
|
3570
4472
|
return [
|
|
3571
4473
|
`Run ${run.id}: ${run.status}${run.outcome ? ` (${run.outcome})` : ""}`,
|
|
3572
4474
|
`Workflow: ${run.workflow ?? "improve"}`,
|
|
3573
|
-
`
|
|
3574
|
-
...(run.
|
|
3575
|
-
? [`Active
|
|
4475
|
+
`Candidate: ${candidateId ?? "pending"}`,
|
|
4476
|
+
...(run.activeCandidateId && candidateId && run.activeCandidateId !== candidateId
|
|
4477
|
+
? [`Active candidate: ${run.activeCandidateId}`]
|
|
3576
4478
|
: []),
|
|
3577
4479
|
`Samples: ${run.samples ?? 0}`,
|
|
3578
4480
|
`Attempts: ${run.attemptsExecuted ?? 0}/${run.attemptsRequested ?? run.attemptsExecuted ?? 0}`,
|
|
@@ -3584,8 +4486,8 @@ function formatRunDetail(record) {
|
|
|
3584
4486
|
...(firstFailedJob?.error
|
|
3585
4487
|
? [`First failed job ${firstFailedJob.id}: ${firstFailedJob.error}`]
|
|
3586
4488
|
: []),
|
|
3587
|
-
...(urls.
|
|
3588
|
-
? [`Open evaluation: ${urls.
|
|
4489
|
+
...(urls.candidateEvaluation
|
|
4490
|
+
? [`Open evaluation: ${urls.candidateEvaluation}`]
|
|
3589
4491
|
: [`Open benchmark: ${urls.benchmark}`]),
|
|
3590
4492
|
...(jobs.length > 0 ? ["", "Jobs:", ...jobs.map(formatRunJobLine)] : []),
|
|
3591
4493
|
].join("\n");
|
|
@@ -3595,7 +4497,7 @@ function formatRunJobLine(job) {
|
|
|
3595
4497
|
job.id,
|
|
3596
4498
|
readRunJobPurpose(job) ?? job.kind ?? "job",
|
|
3597
4499
|
job.status,
|
|
3598
|
-
job.
|
|
4500
|
+
job.candidateId ?? "-",
|
|
3599
4501
|
job.error ?? "",
|
|
3600
4502
|
].filter((value, index) => index < 4 || value !== "").join("\t");
|
|
3601
4503
|
}
|
|
@@ -3621,7 +4523,7 @@ function costUsdFromUsage(value) {
|
|
|
3621
4523
|
if (direct !== null) {
|
|
3622
4524
|
return direct;
|
|
3623
4525
|
}
|
|
3624
|
-
return ["total", "
|
|
4526
|
+
return ["total", "improver", "runner", "engine"].reduce((sum, key) => {
|
|
3625
4527
|
const nested = readRecord(usage[key]);
|
|
3626
4528
|
return sum + (readFiniteNumber(nested?.costUsd) ?? 0);
|
|
3627
4529
|
}, 0);
|
|
@@ -3631,6 +4533,15 @@ function readRecord(value) {
|
|
|
3631
4533
|
? value
|
|
3632
4534
|
: null;
|
|
3633
4535
|
}
|
|
4536
|
+
function stringValue(value) {
|
|
4537
|
+
return typeof value === "string" && value.length > 0 ? value : null;
|
|
4538
|
+
}
|
|
4539
|
+
function numberValue(value) {
|
|
4540
|
+
return readFiniteNumber(value);
|
|
4541
|
+
}
|
|
4542
|
+
function integerValue(value) {
|
|
4543
|
+
return Number.isSafeInteger(value) ? value : null;
|
|
4544
|
+
}
|
|
3634
4545
|
function readFiniteNumber(value) {
|
|
3635
4546
|
return typeof value === "number" && Number.isFinite(value) ? value : null;
|
|
3636
4547
|
}
|
|
@@ -3763,15 +4674,15 @@ async function readWorkbenchProfileStatus(config) {
|
|
|
3763
4674
|
return { authenticated: true, profile: null };
|
|
3764
4675
|
}
|
|
3765
4676
|
}
|
|
3766
|
-
function
|
|
3767
|
-
return asOptionalString(parsed.flags.
|
|
4677
|
+
function readOptionalCandidateId(parsed) {
|
|
4678
|
+
return asOptionalString(parsed.flags.candidate) ?? parsed.positionals[0];
|
|
3768
4679
|
}
|
|
3769
|
-
function
|
|
3770
|
-
const
|
|
3771
|
-
if (!
|
|
3772
|
-
throw new UsageError("Missing required
|
|
4680
|
+
function readRequiredCandidateId(parsed) {
|
|
4681
|
+
const candidateId = readOptionalCandidateId(parsed);
|
|
4682
|
+
if (!candidateId) {
|
|
4683
|
+
throw new UsageError("Missing required CANDIDATE_ID.");
|
|
3773
4684
|
}
|
|
3774
|
-
return
|
|
4685
|
+
return candidateId;
|
|
3775
4686
|
}
|
|
3776
4687
|
function readRequiredRunId(parsed) {
|
|
3777
4688
|
const runId = parsed.positionals[0];
|
|
@@ -4002,6 +4913,38 @@ function readInitAgent(parsed, kind) {
|
|
|
4002
4913
|
function asOptionalString(value) {
|
|
4003
4914
|
return typeof value === "string" && value.length > 0 ? value : undefined;
|
|
4004
4915
|
}
|
|
4916
|
+
function singleRequestedRunId(value, command) {
|
|
4917
|
+
if (!value || value.trim() === "") {
|
|
4918
|
+
return undefined;
|
|
4919
|
+
}
|
|
4920
|
+
const trimmed = value.trim();
|
|
4921
|
+
if (trimmed === "all" || trimmed.includes(",")) {
|
|
4922
|
+
throw new UsageError(`${command} accepts one candidate run id for --runs; use workbench eval --runs all to evaluate every run.`);
|
|
4923
|
+
}
|
|
4924
|
+
return trimmed;
|
|
4925
|
+
}
|
|
4926
|
+
function resolveCandidateRunSelection(source, value) {
|
|
4927
|
+
const available = source.candidateRunIds;
|
|
4928
|
+
if (available.length === 0) {
|
|
4929
|
+
throw new UsageError("Candidate must declare at least one run.");
|
|
4930
|
+
}
|
|
4931
|
+
if (!value || value.trim() === "") {
|
|
4932
|
+
return [source.candidateRunId];
|
|
4933
|
+
}
|
|
4934
|
+
const trimmed = value.trim();
|
|
4935
|
+
if (trimmed === "all") {
|
|
4936
|
+
return available;
|
|
4937
|
+
}
|
|
4938
|
+
const requested = [...new Set(trimmed.split(",").map((entry) => entry.trim()).filter(Boolean))];
|
|
4939
|
+
if (requested.length === 0) {
|
|
4940
|
+
throw new UsageError("--runs must include at least one run id or all.");
|
|
4941
|
+
}
|
|
4942
|
+
const missing = requested.filter((runId) => !available.includes(runId));
|
|
4943
|
+
if (missing.length > 0) {
|
|
4944
|
+
throw new UsageError(`Unknown candidate run(s): ${missing.join(", ")}. Available: ${available.join(", ")}.`);
|
|
4945
|
+
}
|
|
4946
|
+
return requested;
|
|
4947
|
+
}
|
|
4005
4948
|
function readOptionalStringFlag(value, name) {
|
|
4006
4949
|
if (value == null || value === false) {
|
|
4007
4950
|
return undefined;
|
|
@@ -4226,6 +5169,27 @@ function parsePortFlag(value) {
|
|
|
4226
5169
|
}
|
|
4227
5170
|
return port;
|
|
4228
5171
|
}
|
|
5172
|
+
function formatCandidateEvaluationScore(candidate) {
|
|
5173
|
+
const score = candidate.eval?.metrics?.score?.mean;
|
|
5174
|
+
return typeof score === "number" && Number.isFinite(score)
|
|
5175
|
+
? formatMetricValue(score)
|
|
5176
|
+
: "n/a";
|
|
5177
|
+
}
|
|
5178
|
+
function formatLocalCandidateLabel(candidate) {
|
|
5179
|
+
if (!candidate) {
|
|
5180
|
+
return "none";
|
|
5181
|
+
}
|
|
5182
|
+
const name = candidate.name?.trim() || candidate.id;
|
|
5183
|
+
const displayName = candidate.version > 0
|
|
5184
|
+
? `${name} v${candidate.version}`
|
|
5185
|
+
: name;
|
|
5186
|
+
return `${displayName} (${candidate.id})`;
|
|
5187
|
+
}
|
|
5188
|
+
function formatCandidateEvaluationSummary(candidate) {
|
|
5189
|
+
return formatMetricSummary(evaluationMeanMetrics(candidate.eval), {
|
|
5190
|
+
limit: Number.POSITIVE_INFINITY,
|
|
5191
|
+
});
|
|
5192
|
+
}
|
|
4229
5193
|
function formatMetricSummary(metrics, options = {}) {
|
|
4230
5194
|
const entries = Object.entries(metrics ?? {}).filter((entry) => Number.isFinite(entry[1]));
|
|
4231
5195
|
if (entries.length === 0) {
|
|
@@ -4263,15 +5227,18 @@ function resolveSourceDir(parsed) {
|
|
|
4263
5227
|
function isWorkbenchSourceYamlPath(filePath) {
|
|
4264
5228
|
return path.basename(filePath) === WORKBENCH_BENCHMARK_FILE;
|
|
4265
5229
|
}
|
|
4266
|
-
function
|
|
4267
|
-
const explicit =
|
|
5230
|
+
function readCandidateIdFlag(parsed, snapshot) {
|
|
5231
|
+
const explicit = readOptionalCandidateFlag(parsed);
|
|
4268
5232
|
if (explicit) {
|
|
4269
5233
|
return explicit;
|
|
4270
5234
|
}
|
|
4271
5235
|
if (snapshot.activeId) {
|
|
4272
5236
|
return snapshot.activeId;
|
|
4273
5237
|
}
|
|
4274
|
-
throw new UsageError("Missing required --
|
|
5238
|
+
throw new UsageError("Missing required --candidate; no active candidate exists.");
|
|
5239
|
+
}
|
|
5240
|
+
function readOptionalCandidateFlag(parsed) {
|
|
5241
|
+
return asOptionalString(parsed.flags.candidate);
|
|
4275
5242
|
}
|
|
4276
5243
|
function readPreviewMode(parsed) {
|
|
4277
5244
|
const view = asOptionalString(parsed.flags.view) ?? "rendered";
|
|
@@ -4375,8 +5342,8 @@ async function copyInitSeedIfProvided(parsed, workspace, seed) {
|
|
|
4375
5342
|
}
|
|
4376
5343
|
});
|
|
4377
5344
|
}
|
|
4378
|
-
function
|
|
4379
|
-
return spec.improve ? `adapter:${spec.improve.use}` : "
|
|
5345
|
+
function formatSpecImprover(spec) {
|
|
5346
|
+
return spec.improve ? `adapter:${spec.improve.use}` : "improve not configured";
|
|
4380
5347
|
}
|
|
4381
5348
|
async function writeFiles(outputDir, files) {
|
|
4382
5349
|
await fs.mkdir(outputDir, { recursive: true });
|