@workbench-ai/workbench 0.0.49 → 0.0.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/adapter-project.js +3 -3
- package/dist/benchmark-fingerprint.d.ts +1 -1
- package/dist/benchmark-fingerprint.d.ts.map +1 -1
- package/dist/benchmark-fingerprint.js +4 -6
- package/dist/command-model.d.ts.map +1 -1
- package/dist/command-model.js +95 -453
- package/dist/dev-open/client.css +42 -43
- package/dist/dev-open/client.js +145 -145
- package/dist/dev-open-server.d.ts +12 -22
- package/dist/dev-open-server.d.ts.map +1 -1
- package/dist/dev-open-server.js +82 -42
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +1758 -1245
- package/dist/init-scaffold.d.ts +4 -4
- package/dist/init-scaffold.d.ts.map +1 -1
- package/dist/init-scaffold.js +2 -2
- package/dist/init-template-pack.d.ts +4 -4
- package/dist/init-template-pack.d.ts.map +1 -1
- package/dist/init-template-pack.js +47 -59
- package/dist/local-archive.d.ts +15 -11
- package/dist/local-archive.d.ts.map +1 -1
- package/dist/local-archive.js +325 -83
- package/dist/project-source.d.ts +14 -17
- package/dist/project-source.d.ts.map +1 -1
- package/dist/project-source.js +80 -151
- package/package.json +4 -4
package/dist/index.js
CHANGED
|
@@ -5,7 +5,7 @@ import { createRequire } from "node:module";
|
|
|
5
5
|
import os from "node:os";
|
|
6
6
|
import path from "node:path";
|
|
7
7
|
import { Writable } from "node:stream";
|
|
8
|
-
import {
|
|
8
|
+
import { createCandidateFilePreview, createBaselineCandidateJob as createRuntimeBaselineCandidateJob, evaluationScorecardId, evaluationMeanMetrics, executeWorkbenchExecutionJob, engineResolveBindingForSpec, filterOptimizerTraceJobsForCaseIds, filterCandidateSourceFiles, formatWorkbenchCaseSelector, formatWorkbenchSelectionPolicy, workbenchCaseSelectorUsesAllCases, workbenchExecutionPurpose, workbenchRunExecutionFingerprint, createWorkbenchAdapterAuthBundle, createOptimizerTraceInputFiles, DOCKER_SANDBOX_BACKEND, localWorkbenchAdapterAuthStore, materializeWorkbenchRunResult, normalizeSurfaceFiles, planWorkbenchExecutionJobsForPurpose, runWorkbenchExecutionDag, resolveEngineCaseExecutionConfig, resolveWorkbenchResolvedSourceYaml, summarizeCandidateFiles, validateWorkbenchRunEnvelope, parseWorkbenchAdapterAuthTarget, workbenchEngineCaseIdsForImproveEvaluation, workbenchEngineCaseIdsForSelector, workbenchImproveOptimizeSelector, workbenchImproveSelectionPolicy, workbenchProjectSourceFingerprint, workbenchRuntimeBundleFingerprint, } from "@workbench-ai/workbench-core";
|
|
9
9
|
import { assertWorkbenchAdapterOperationResultOk, collectWorkbenchAdapterAuthRequirements, WORKBENCH_ADAPTER_RESULT_FILE, WORKBENCH_ADAPTER_RESULT_PROTOCOL, normalizeWorkbenchAdapterOperationRequest, readWorkbenchAdapterOperationResult, workbenchAdapterOperationCommand, workbenchAdapterOperationResultPath, withDefaultWorkbenchAdapterAuthProfiles as applyDefaultWorkbenchAdapterAuthProfiles, } from "@workbench-ai/workbench-protocol";
|
|
10
10
|
import { builtinLocalTraceAdapter, builtinLocalTraceAdapters, sortLocalTraceRefs, } from "@workbench-ai/workbench-built-in-adapters/local-traces";
|
|
11
11
|
import { commandUsage, HOSTED_WATCH_LIFECYCLE_NOTE, LOCAL_DEV_OPEN_LIFECYCLE_NOTE, rootUsage, } from "./command-model.js";
|
|
@@ -13,10 +13,10 @@ import { startLocalWorkbenchDevServer } from "./dev-open-server.js";
|
|
|
13
13
|
import { createWorkbenchInitScaffold, } from "./init-scaffold.js";
|
|
14
14
|
import { defaultAdapterManifests, composeRuntimeDockerfileWithAdapters, resolveDefaultWorkbenchAdapter, resolveProjectAdapterSource, resolveWorkbenchAdaptersForProject, WORKBENCH_ADAPTER_MANIFEST_FILE, } from "./adapter-project.js";
|
|
15
15
|
import { createAdapterCommandEnv } from "./adapter-command-env.js";
|
|
16
|
-
import {
|
|
16
|
+
import { loadLocalArchive, loadLocalArchiveIndex, exportLocalRuntimeBundle, importLocalRuntimeBundle, runtimeBundleStats, materializeCandidateRoot, readLocalCandidate, readLocalCandidateFiles, readLocalJobs, saveLocalArchive, saveLocalJobs, setLocalActive, upsertLocalRun, upsertLocalCandidate, upsertLocalEvaluation, } from "./local-archive.js";
|
|
17
17
|
import { WorkspaceSnapshotError, } from "./workspace-snapshot.js";
|
|
18
18
|
import { readLocalProjectSource, WORKBENCH_BENCHMARK_FILE, } from "./project-source.js";
|
|
19
|
-
import { localBenchmarkFingerprint,
|
|
19
|
+
import { localBenchmarkFingerprint, localCandidateFingerprint, } from "./benchmark-fingerprint.js";
|
|
20
20
|
const require = createRequire(import.meta.url);
|
|
21
21
|
function getCliVersion() {
|
|
22
22
|
const manifest = require("../package.json");
|
|
@@ -74,29 +74,38 @@ export async function runCli(argv, io = {
|
|
|
74
74
|
if (argv[0] === "clone") {
|
|
75
75
|
return await cloneProject(argv.slice(1), io);
|
|
76
76
|
}
|
|
77
|
-
if (argv[0] === "fetch") {
|
|
78
|
-
return await fetchProject(argv.slice(1), io);
|
|
79
|
-
}
|
|
80
77
|
if (argv[0] === "pull") {
|
|
81
78
|
return await pullProject(argv.slice(1), io);
|
|
82
79
|
}
|
|
83
80
|
if (argv[0] === "push") {
|
|
84
81
|
return await pushBenchmark(argv.slice(1), io);
|
|
85
82
|
}
|
|
86
|
-
if (argv[0] === "remote") {
|
|
87
|
-
return await runRemoteCommand(argv.slice(1), io);
|
|
88
|
-
}
|
|
89
83
|
if (argv[0] === "eval") {
|
|
90
|
-
|
|
84
|
+
const hosted = extractHostedFlag(argv.slice(1));
|
|
85
|
+
return hosted.enabled
|
|
86
|
+
? await startHostedWorkflow("eval", hosted.argv, io)
|
|
87
|
+
: await localEvaluateCandidate(hosted.argv, io, runtimeOptions);
|
|
88
|
+
}
|
|
89
|
+
if (argv[0] === "retry") {
|
|
90
|
+
const hosted = extractHostedFlag(argv.slice(1));
|
|
91
|
+
return hosted.enabled
|
|
92
|
+
? await retryHostedWorkflow(hosted.argv, io)
|
|
93
|
+
: await localRetry(hosted.argv, io, runtimeOptions);
|
|
91
94
|
}
|
|
92
95
|
if (argv[0] === "improve") {
|
|
93
|
-
|
|
96
|
+
const hosted = extractHostedFlag(argv.slice(1));
|
|
97
|
+
return hosted.enabled
|
|
98
|
+
? await startHostedWorkflow("improve", hosted.argv, io)
|
|
99
|
+
: await localRun(hosted.argv, io, runtimeOptions);
|
|
94
100
|
}
|
|
95
101
|
if (argv[0] === "restore") {
|
|
96
102
|
return await localRestore(argv.slice(1), io);
|
|
97
103
|
}
|
|
98
104
|
if (argv[0] === "open") {
|
|
99
|
-
|
|
105
|
+
const hosted = extractHostedFlag(argv.slice(1));
|
|
106
|
+
return hosted.enabled
|
|
107
|
+
? await openWorkbench(hosted.argv, io)
|
|
108
|
+
: await localDevOpen(hosted.argv, io);
|
|
100
109
|
}
|
|
101
110
|
if (argv[0] === "auth") {
|
|
102
111
|
return await runAuthCommand(argv.slice(1), io);
|
|
@@ -107,9 +116,6 @@ export async function runCli(argv, io = {
|
|
|
107
116
|
if (argv[0] === "traces") {
|
|
108
117
|
return await runTracesCommand(argv.slice(1), io);
|
|
109
118
|
}
|
|
110
|
-
if (argv[0] === "cloud") {
|
|
111
|
-
return await runCloudCommand(argv.slice(1), io);
|
|
112
|
-
}
|
|
113
119
|
const commandPath = argv.slice(0, 2).join(" ");
|
|
114
120
|
const rest = argv.slice(2);
|
|
115
121
|
switch (commandPath) {
|
|
@@ -117,14 +123,14 @@ export async function runCli(argv, io = {
|
|
|
117
123
|
return await localRunList(rest, io);
|
|
118
124
|
case "runs show":
|
|
119
125
|
return await localRunShow(rest, io);
|
|
120
|
-
case "
|
|
121
|
-
return await
|
|
122
|
-
case "
|
|
123
|
-
return await
|
|
124
|
-
case "
|
|
125
|
-
return await
|
|
126
|
-
case "
|
|
127
|
-
return await
|
|
126
|
+
case "candidates list":
|
|
127
|
+
return await localCandidateList(rest, io);
|
|
128
|
+
case "candidates show":
|
|
129
|
+
return await localCandidateShow(rest, io);
|
|
130
|
+
case "candidates files":
|
|
131
|
+
return await localCandidateFiles(rest, io);
|
|
132
|
+
case "candidates preview":
|
|
133
|
+
return await localCandidatePreview(rest, io);
|
|
128
134
|
default:
|
|
129
135
|
break;
|
|
130
136
|
}
|
|
@@ -145,9 +151,6 @@ export async function runCli(argv, io = {
|
|
|
145
151
|
}
|
|
146
152
|
function commandPathForHelp(argv) {
|
|
147
153
|
const positionals = argv.filter((arg) => arg !== "--help" && arg !== "-h" && !arg.startsWith("--"));
|
|
148
|
-
if (positionals[0] === "cloud") {
|
|
149
|
-
return positionals.slice(0, 3).join(" ");
|
|
150
|
-
}
|
|
151
154
|
if (positionals[0] === "adapters" &&
|
|
152
155
|
["create", "list", "inspect", "test"].includes(positionals[1] ?? "")) {
|
|
153
156
|
return positionals.slice(0, 2).join(" ");
|
|
@@ -156,76 +159,31 @@ function commandPathForHelp(argv) {
|
|
|
156
159
|
["collect", "list", "show"].includes(positionals[1] ?? "")) {
|
|
157
160
|
return positionals.slice(0, 2).join(" ");
|
|
158
161
|
}
|
|
159
|
-
if (positionals[0] === "auth"
|
|
162
|
+
if (positionals[0] === "auth") {
|
|
160
163
|
return positionals.slice(0, 2).join(" ");
|
|
161
164
|
}
|
|
162
165
|
if (positionals[0] === "runs" &&
|
|
163
166
|
["list", "show"].includes(positionals[1] ?? "")) {
|
|
164
167
|
return positionals.slice(0, 2).join(" ");
|
|
165
168
|
}
|
|
166
|
-
if (positionals[0] === "
|
|
169
|
+
if (positionals[0] === "candidates" &&
|
|
167
170
|
["list", "show", "files", "preview"].includes(positionals[1] ?? "")) {
|
|
168
171
|
return positionals.slice(0, 2).join(" ");
|
|
169
172
|
}
|
|
170
173
|
return positionals[0] ?? "";
|
|
171
174
|
}
|
|
172
|
-
|
|
173
|
-
|
|
174
|
-
const
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
case "watch":
|
|
183
|
-
return await runWatch(rest, io);
|
|
184
|
-
case "logs":
|
|
185
|
-
return await runLogs(rest, io);
|
|
186
|
-
case "star":
|
|
187
|
-
return await starProject(rest, io, true);
|
|
188
|
-
case "unstar":
|
|
189
|
-
return await starProject(rest, io, false);
|
|
190
|
-
default:
|
|
191
|
-
break;
|
|
192
|
-
}
|
|
193
|
-
const commandPath = argv.slice(0, 2).join(" ");
|
|
194
|
-
const subRest = argv.slice(2);
|
|
195
|
-
switch (commandPath) {
|
|
196
|
-
case "benchmarks list":
|
|
197
|
-
return await benchmarkList(subRest, io);
|
|
198
|
-
case "benchmarks show":
|
|
199
|
-
return await benchmarkShow(subRest, io);
|
|
200
|
-
case "benchmarks versions":
|
|
201
|
-
return await benchmarkVersions(subRest, io);
|
|
202
|
-
case "benchmarks starred":
|
|
203
|
-
return await benchmarkStarred(subRest, io);
|
|
204
|
-
case "benchmarks delete":
|
|
205
|
-
return await benchmarkDelete(subRest, io);
|
|
206
|
-
case "runs list":
|
|
207
|
-
return await runList(subRest, io);
|
|
208
|
-
case "runs show":
|
|
209
|
-
return await runShow(subRest, io);
|
|
210
|
-
case "runs cancel":
|
|
211
|
-
return await runCancel(subRest, io);
|
|
212
|
-
case "subjects list":
|
|
213
|
-
return await subjectList(subRest, io);
|
|
214
|
-
case "subjects show":
|
|
215
|
-
return await subjectShow(subRest, io);
|
|
216
|
-
case "subjects files":
|
|
217
|
-
return await subjectFiles(subRest, io);
|
|
218
|
-
case "subjects preview":
|
|
219
|
-
return await subjectPreview(subRest, io);
|
|
220
|
-
case "subjects pull":
|
|
221
|
-
return await subjectExport(subRest, io);
|
|
222
|
-
case "subjects publish":
|
|
223
|
-
return await subjectVisibility(subRest, io, "public");
|
|
224
|
-
case "subjects unpublish":
|
|
225
|
-
return await subjectVisibility(subRest, io, "private");
|
|
226
|
-
default:
|
|
227
|
-
throw new UsageError(`Unknown command: cloud ${argv.join(" ")}`);
|
|
175
|
+
/**
 * Separates the `--hosted` switch from the rest of a CLI argument list.
 *
 * @param {string[]} argv - Arguments that follow the command name.
 * @returns {{ enabled: boolean, argv: string[] }} `enabled` is true when at
 *   least one argument was exactly `--hosted`; `argv` contains every other
 *   argument in its original order.
 */
function extractHostedFlag(argv) {
    const remaining = argv.filter((entry) => entry !== "--hosted");
    // Anything that was dropped by the filter must have been the switch.
    return { enabled: remaining.length !== argv.length, argv: remaining };
}
|
|
230
188
|
async function localDevOpen(argv, io) {
|
|
231
189
|
const parsed = parseArgs(argv);
|
|
@@ -313,7 +271,7 @@ async function localInit(argv, io) {
|
|
|
313
271
|
specPath,
|
|
314
272
|
kind: scaffold.kind,
|
|
315
273
|
name: scaffold.name,
|
|
316
|
-
|
|
274
|
+
candidateRoot: scaffold.candidateRoot,
|
|
317
275
|
}, parsed, io, () => `Initialized ${scaffold.kind} Workbench source directory at ${workspace}`);
|
|
318
276
|
return 0;
|
|
319
277
|
}
|
|
@@ -358,20 +316,20 @@ function buildWorkbenchCheckPlan(source) {
|
|
|
358
316
|
files: sourceFileCount(source),
|
|
359
317
|
yaml: [
|
|
360
318
|
path.relative(source.dir, source.benchmarkPath) || "benchmark.yaml",
|
|
361
|
-
path.relative(source.dir, source.
|
|
362
|
-
...(source.optimizerSource !== undefined
|
|
363
|
-
? [path.relative(source.dir, source.optimizerPath ?? "") || "optimizer YAML"]
|
|
364
|
-
: []),
|
|
319
|
+
path.relative(source.dir, source.candidateSpecPath) || "candidate YAML",
|
|
365
320
|
],
|
|
366
321
|
dockerfile: source.dockerfilePath,
|
|
367
322
|
},
|
|
368
|
-
|
|
369
|
-
|
|
370
|
-
|
|
323
|
+
candidate: {
|
|
324
|
+
name: source.spec.candidate.name,
|
|
325
|
+
selectedRunId: source.spec.candidate.selectedRunId,
|
|
326
|
+
runCount: Object.keys(source.spec.candidate.runs).length,
|
|
327
|
+
filesPath: source.spec.candidate.files.path,
|
|
328
|
+
files: source.candidateFiles.length,
|
|
371
329
|
},
|
|
372
|
-
|
|
330
|
+
improve: source.spec.candidate.improve
|
|
373
331
|
? {
|
|
374
|
-
edits: [...source.spec.
|
|
332
|
+
edits: [...source.spec.candidate.improve.edits],
|
|
375
333
|
}
|
|
376
334
|
: null,
|
|
377
335
|
engine: {
|
|
@@ -394,8 +352,8 @@ function buildWorkbenchCheckPlan(source) {
|
|
|
394
352
|
};
|
|
395
353
|
}
|
|
396
354
|
function formatWorkbenchCheckPlan(plan, warningSuffix) {
|
|
397
|
-
const edits = plan.
|
|
398
|
-
? plan.
|
|
355
|
+
const edits = plan.improve?.edits.length
|
|
356
|
+
? plan.improve.edits.join(", ")
|
|
399
357
|
: "-";
|
|
400
358
|
const network = plan.environment.network.egress;
|
|
401
359
|
const resources = plan.environment.resources;
|
|
@@ -404,11 +362,12 @@ function formatWorkbenchCheckPlan(plan, warningSuffix) {
|
|
|
404
362
|
`Benchmark: ${plan.benchmarkName}`,
|
|
405
363
|
`Description: ${plan.benchmarkDescription}`,
|
|
406
364
|
`Source: ${plan.source.files} file(s) (${plan.source.yaml.join(", ")}, ${plan.source.dockerfile})`,
|
|
407
|
-
`
|
|
408
|
-
`
|
|
365
|
+
`Candidate: ${plan.candidate.name} (${plan.candidate.runCount} run(s), selected ${plan.candidate.selectedRunId})`,
|
|
366
|
+
`Candidate files: ${plan.candidate.filesPath} (${plan.candidate.files} file(s))`,
|
|
367
|
+
`Improve edits: ${edits}`,
|
|
409
368
|
`Engine cases: ${plan.engine.cases} case(s) from ${formatAdapterSummary(plan.engine.resolver)} at ${plan.engine.path} (${plan.engine.files} file(s))`,
|
|
410
369
|
`Environment: ${plan.environment.dockerfile}, network ${network}, ${resources.cpu} CPU, ${resources.memoryGb}GB RAM, ${resources.timeoutMinutes}m timeout`,
|
|
411
|
-
`Execution: improve ${plan.adapters.improve ? formatAdapterSummary(plan.adapters.improve) : "not configured"},
|
|
370
|
+
`Execution: improve ${plan.adapters.improve ? formatAdapterSummary(plan.adapters.improve) : "not configured"}, candidate run ${formatAdapterSummary(plan.adapters.run)}, engine ${formatAdapterSummary(plan.adapters.engine)}`,
|
|
412
371
|
...adapterSourceLines(plan.adapters.sources),
|
|
413
372
|
].join("\n");
|
|
414
373
|
}
|
|
@@ -493,18 +452,206 @@ function splitWorkspaceError(error) {
|
|
|
493
452
|
const message = error instanceof Error ? error.message : String(error);
|
|
494
453
|
return message.split(/\n+/u).map((entry) => entry.trim()).filter(Boolean);
|
|
495
454
|
}
|
|
455
|
+
/**
 * Implements the local `workbench retry TARGET_ID` command.
 *
 * Resolves TARGET_ID to a retry target, re-invokes either the local eval or
 * the local improve command with `--json` while capturing its stdout, calls
 * `preserveLocalActiveCandidate` with the active id recorded on the target,
 * and finally writes a condensed retry result.
 *
 * @param {string[]} argv - Arguments after `retry` (accepts `--dir`, `--json`
 *   and one positional TARGET_ID).
 * @param {{ stdin: unknown, stdout: unknown, stderr: unknown }} io - CLI streams.
 * @param {unknown} runtimeOptions - Forwarded unchanged to the re-invoked command.
 * @returns {Promise<number>} Exit code of the re-invoked command.
 * @throws {UsageError} When TARGET_ID is missing.
 */
async function localRetry(argv, io, runtimeOptions) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set(["dir", "json"]));
    rejectUnexpectedPositionals(parsed, "workbench retry", 1);
    const targetId = parsed.positionals[0];
    if (!targetId) {
        throw new UsageError("Missing required TARGET_ID.");
    }
    const workspace = resolveDir(parsed);
    const target = await resolveLocalRetryTarget(workspace, targetId);
    const captured = createCapturingIo(io);
    const retriesEvaluation = target.workflow === "eval";
    // The nested command always runs with --json so its stdout can be parsed below.
    const nestedArgv = retriesEvaluation
        ? [
            "--dir", workspace,
            "--candidate", target.candidateId,
            "--runs", target.candidateRunId,
            "--samples", String(target.samples),
            "--json",
        ]
        : [
            "--dir", workspace,
            "--from", target.candidateId,
            "--runs", target.candidateRunId,
            "--budget", String(target.budget ?? 1),
            "--samples", String(target.samples),
            "--json",
        ];
    const nestedCommand = retriesEvaluation ? localEvaluateCandidate : localRun;
    const code = await nestedCommand(nestedArgv, captured.io, runtimeOptions);
    const commandOutput = parseCapturedJson(captured.stdoutText());
    await preserveLocalActiveCandidate(workspace, target.preserveActiveId);
    const outputRecord = readRecord(commandOutput) ?? {};
    const result = {
        ok: code === 0 && outputRecord.ok !== false,
        retried: {
            id: target.sourceId,
            kind: target.sourceKind,
            workflow: target.workflow,
        },
    };
    // Copied in this fixed order so the emitted object keeps a stable key order.
    for (const key of ["runId", "evaluationId", "candidateId", "activeCandidateId"]) {
        assignRetryResultString(result, key, outputRecord[key]);
    }
    const localView = localRetryViewHint(outputRecord.localView);
    if (localView) {
        result.localView = localView;
    }
    const failedJobCount = numberValue(outputRecord.failedJobCount);
    if (failedJobCount !== null) {
        result.failedJobCount = failedJobCount;
    }
    const error = stringValue(outputRecord.error);
    if (error) {
        result.error = error;
    }
    writeOutput(result, parsed, io, formatRetryCommandResult);
    return code;
}
|
|
521
|
+
/**
 * Looks up TARGET_ID in the local archive and describes how to retry it.
 *
 * The id is matched against evaluations first and runs second. An evaluation
 * (or an `eval` run with exactly one evaluation) is delegated to
 * `localEvaluationRetryTarget`; any other run yields an `improve` target.
 *
 * @param {string} workspace - Workspace directory passed to `loadLocalArchive`.
 * @param {string} targetId - Run id or evaluation id supplied by the user.
 * @returns {Promise<object>} Retry target with `sourceId`, `sourceKind`,
 *   `workflow`, `candidateId`, `candidateRunId`, `samples`, `preserveActiveId`
 *   and, for improve runs, `budget`.
 * @throws {UsageError} When the id is unknown, the run has not finished, the
 *   run did not fail, an eval run has zero or several evaluations, or the run
 *   lacks the candidate metadata needed for a retry.
 */
async function resolveLocalRetryTarget(workspace, targetId) {
    const snapshot = await loadLocalArchive(workspace);
    const evaluation = snapshot.evaluations.find((entry) => entry.id === targetId);
    if (evaluation) {
        const run = snapshot.runs.find((entry) => entry.id === evaluation.runId) ?? null;
        return localEvaluationRetryTarget(snapshot, evaluation, run, "evaluation", targetId);
    }
    const run = snapshot.runs.find((entry) => entry.id === targetId);
    if (!run) {
        throw new UsageError(`Run or evaluation not found: ${targetId}`);
    }
    if (run.status !== "finished") {
        throw new UsageError(`Run ${run.id} is ${run.status}; wait for it to finish before retrying.`);
    }
    if (!runSummaryFailed(run)) {
        throw new UsageError(`Run ${run.id} did not fail; use workbench ${run.workflow} to intentionally run it again.`);
    }
    if (run.workflow === "eval") {
        const evaluations = snapshot.evaluations.filter((entry) => entry.runId === run.id);
        if (evaluations.length !== 1) {
            throw new UsageError(evaluations.length === 0
                ? `Run ${run.id} has no evaluation record to retry.`
                : `Run ${run.id} has multiple evaluations; retry a specific evaluation id instead.`);
        }
        return localEvaluationRetryTarget(snapshot, evaluations[0], run, "run", targetId);
    }
    const candidateRunId = run.candidateRunId;
    if (!run.candidateId || !candidateRunId) {
        throw new UsageError(`Run ${run.id} is missing retry metadata; use workbench improve --from with an explicit candidate id.`);
    }
    return {
        sourceId: targetId,
        sourceKind: "run",
        workflow: "improve",
        candidateId: run.candidateId,
        candidateRunId,
        // Fall back to one sample when the run record carries no sample count,
        // matching the evaluation retry target; otherwise the caller would
        // stringify `undefined` into the `--samples` flag.
        samples: run.samples || 1,
        budget: run.budget,
        preserveActiveId: snapshot.activeId,
    };
}
|
|
562
|
+
/**
 * Builds the retry target for a failed evaluation.
 *
 * @param {object} snapshot - Local archive snapshot (reads `candidates` and `activeId`).
 * @param {object} evaluation - Evaluation record being retried.
 * @param {object|null} run - Run that owns the evaluation, if one was found.
 * @param {string} sourceKind - How the user addressed the target ("evaluation" or "run").
 * @param {string} sourceId - The id the user supplied.
 * @returns {object} Retry target whose `workflow` is "eval".
 * @throws {UsageError} When the evaluation did not fail, its candidate is not
 *   in the snapshot, or no candidate run id can be determined.
 */
function localEvaluationRetryTarget(snapshot, evaluation, run, sourceKind, sourceId) {
    const failed = evaluationScorecardFailed(evaluation, run);
    if (!failed) {
        throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench eval to intentionally run it again.`);
    }
    const { candidateId } = evaluation;
    const candidateKnown = snapshot.candidates.some((entry) => entry.id === candidateId);
    if (!candidateKnown) {
        throw new UsageError(`Candidate not found for evaluation ${evaluation.id}: ${evaluation.candidateId}`);
    }
    // Prefer the id stored on the evaluation; fall back to the owning run.
    const candidateRunId = evaluation.candidateRunId ?? run?.candidateRunId;
    if (!candidateRunId) {
        throw new UsageError(`Evaluation ${evaluation.id} is missing its candidate run configuration.`);
    }
    // A zero or missing sample count falls through to the run's value, then to 1.
    const samples = evaluation.sampleCount || run?.samples || 1;
    return {
        sourceId,
        sourceKind,
        workflow: "eval",
        candidateId,
        candidateRunId,
        samples,
        preserveActiveId: snapshot.activeId,
    };
}
|
|
583
|
+
/**
 * Re-applies a previously recorded active candidate id to the local archive.
 *
 * Nothing is written when the archive already has `activeId` as its active
 * id, or when `activeId` is truthy but names no candidate in the archive.
 *
 * @param {string} workspace - Workspace directory passed to the archive helpers.
 * @param {string|null|undefined} activeId - Active id to put back.
 * @returns {Promise<void>}
 */
async function preserveLocalActiveCandidate(workspace, activeId) {
    const snapshot = await loadLocalArchive(workspace);
    const targetMissing = activeId && !snapshot.candidates.some((candidate) => candidate.id === activeId);
    if (targetMissing || snapshot.activeId === activeId) {
        return;
    }
    await saveLocalArchive(workspace, setLocalActive(snapshot, activeId));
}
|
|
594
|
+
/**
 * Reports whether an evaluation counts as failed for retry purposes.
 *
 * @param {{ errorSampleCount: number, status: string }} evaluation - Evaluation record.
 * @param {object|null|undefined} run - Owning run, checked only when the
 *   evaluation itself shows no failure.
 * @returns {boolean} True when the evaluation has error samples, its status is
 *   not "completed", or the run summary reports a failure.
 */
function evaluationScorecardFailed(evaluation, run) {
    if (evaluation.errorSampleCount > 0) {
        return true;
    }
    if (evaluation.status !== "completed") {
        return true;
    }
    return runSummaryFailed(run);
}
|
|
599
|
+
/**
 * Reports whether a run summary ended in a failure outcome.
 *
 * @param {{ outcome?: string }|null|undefined} run - Run summary; may be absent.
 * @returns {boolean} True when `run.outcome` is "error" or "cancelled".
 */
function runSummaryFailed(run) {
    return ["error", "cancelled"].includes(run?.outcome);
}
|
|
602
|
+
/**
 * Wraps a CLI io object so that everything written to stdout is captured in
 * memory instead of being emitted, while stdin and stderr pass through.
 *
 * Captured chunks are kept as raw bytes and decoded as UTF-8 only when the
 * text is requested, so a multi-byte character that arrives split across two
 * writes is still decoded correctly.
 *
 * @param {{ stdin: unknown, stderr: unknown }} io - The real CLI streams.
 * @returns {{ io: { stdin: unknown, stdout: Writable, stderr: unknown }, stdoutText: () => string }}
 *   `io` is the replacement io object; `stdoutText()` returns everything
 *   written to the capturing stdout so far.
 */
function createCapturingIo(io) {
    const captured = [];
    const stdout = new Writable({
        write(chunk, _encoding, callback) {
            // Copy buffers so later mutation by the writer cannot alter what was captured.
            captured.push(Buffer.isBuffer(chunk) ? Buffer.from(chunk) : Buffer.from(String(chunk), "utf8"));
            callback();
        },
    });
    return {
        io: {
            stdin: io.stdin,
            stdout,
            stderr: io.stderr,
        },
        stdoutText: () => Buffer.concat(captured).toString("utf8"),
    };
}
|
|
619
|
+
/**
 * Interprets captured command output as JSON.
 *
 * @param {string} value - Raw captured stdout text.
 * @returns {unknown} An empty object for blank input, the parsed JSON value
 *   when the trimmed text is valid JSON, otherwise `{ output }` holding the
 *   trimmed text.
 */
function parseCapturedJson(value) {
    const text = value.trim();
    if (text === "") {
        return {};
    }
    let parsedValue;
    try {
        parsedValue = JSON.parse(text);
    }
    catch {
        // Not JSON: hand the raw text back under `output` rather than failing.
        return { output: text };
    }
    return parsedValue;
}
|
|
631
|
+
/**
 * Extracts a `{ command, note }` view hint from a nested command's output.
 *
 * @param {unknown} value - The `localView` field of the nested command output.
 * @returns {{ command: string, note: string }|undefined} The hint when both
 *   fields pass `stringValue` with a truthy result, otherwise `undefined`.
 */
function localRetryViewHint(value) {
    const hint = readRecord(value);
    const command = stringValue(hint?.command);
    const note = stringValue(hint?.note);
    if (!command || !note) {
        return undefined;
    }
    return { command, note };
}
|
|
637
|
+
/**
 * Copies a value onto the retry result when `stringValue` yields a truthy
 * result for it; otherwise leaves the result untouched.
 *
 * @param {object} result - Retry result object, mutated in place.
 * @param {string} key - Property name to set on `result`.
 * @param {unknown} value - Candidate value taken from the nested command output.
 * @returns {void}
 */
function assignRetryResultString(result, key, value) {
    const text = stringValue(value);
    if (!text) {
        return;
    }
    result[key] = text;
}
|
|
496
643
|
async function localRun(argv, io, runtimeOptions) {
|
|
497
644
|
const parsed = parseArgs(argv);
|
|
498
|
-
rejectUnknownFlags(parsed, new Set(["dir", "
|
|
645
|
+
rejectUnknownFlags(parsed, new Set(["dir", "runs", "from", "budget", "samples", "rerun", "json"]));
|
|
499
646
|
const budget = parsePositiveInt(parsed.flags.budget, 1, "budget");
|
|
500
647
|
const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
|
|
501
648
|
const sourceArg = resolveSourceDir(parsed);
|
|
502
649
|
const projectSource = await readLocalProjectSource(sourceArg, {
|
|
503
|
-
|
|
650
|
+
runId: singleRequestedRunId(asOptionalString(parsed.flags.runs), "workbench improve"),
|
|
504
651
|
});
|
|
505
652
|
const workspace = projectSource.dir;
|
|
506
|
-
if (!projectSource.spec.
|
|
507
|
-
throw new UsageError("
|
|
653
|
+
if (!projectSource.spec.improve || !projectSource.spec.candidate.improve) {
|
|
654
|
+
throw new UsageError("Candidate improve configuration is required for workbench improve.");
|
|
508
655
|
}
|
|
509
656
|
const executionProject = await resolveLocalProjectForExecution(workspace, projectSource.specSource);
|
|
510
657
|
const { spec, adapterManifests } = executionProject;
|
|
@@ -514,18 +661,32 @@ async function localRun(argv, io, runtimeOptions) {
|
|
|
514
661
|
if (caseIds.length === 0) {
|
|
515
662
|
throw new UsageError("Engine resolver must emit at least one case.");
|
|
516
663
|
}
|
|
664
|
+
const optimizeSelector = workbenchImproveOptimizeSelector(spec);
|
|
665
|
+
const selectionPolicy = workbenchImproveSelectionPolicy(spec);
|
|
666
|
+
const optimizeCaseIds = workbenchEngineCaseIdsForSelector(engineCases, optimizeSelector);
|
|
667
|
+
if (optimizeCaseIds.length === 0) {
|
|
668
|
+
throw new UsageError(`Improve optimizeOn selector matched no cases: ${formatWorkbenchCaseSelector(optimizeSelector)}.`);
|
|
669
|
+
}
|
|
670
|
+
const selectionCaseIds = workbenchEngineCaseIdsForSelector(engineCases, selectionPolicy.selector);
|
|
671
|
+
if (selectionCaseIds.length === 0) {
|
|
672
|
+
throw new UsageError(`Improve selectBy selector matched no cases: ${formatWorkbenchCaseSelector(selectionPolicy.selector)}.`);
|
|
673
|
+
}
|
|
674
|
+
const selectionScoreCaseIds = workbenchCaseSelectorUsesAllCases(selectionPolicy.selector)
|
|
675
|
+
? undefined
|
|
676
|
+
: selectionCaseIds;
|
|
677
|
+
const evaluationCaseIds = workbenchEngineCaseIdsForImproveEvaluation({ spec, engineCases });
|
|
517
678
|
requireValidRunEnvelope({
|
|
518
679
|
workflow: "improve",
|
|
519
680
|
budget,
|
|
520
681
|
samples,
|
|
521
|
-
caseCount:
|
|
682
|
+
caseCount: evaluationCaseIds.length,
|
|
522
683
|
});
|
|
684
|
+
const optimizeOnLabel = formatWorkbenchCaseSelector(optimizeSelector);
|
|
685
|
+
const selectByLabel = formatWorkbenchSelectionPolicy(selectionPolicy);
|
|
523
686
|
const environmentRefs = await ensureLocalDockerfileEnvironments(workspace, spec, engineCases);
|
|
524
687
|
const benchmarkFingerprint = await readLocalBenchmarkFingerprint(workspace);
|
|
525
|
-
const
|
|
526
|
-
const
|
|
527
|
-
let snapshot = await loadLocalArchive(workspace);
|
|
528
|
-
const baseSubject = await ensureLocalImproveBaseSubject({
|
|
688
|
+
const executionFingerprint = localRunExecutionFingerprint(projectSource);
|
|
689
|
+
const baseCandidate = await ensureLocalImproveBaseCandidate({
|
|
529
690
|
parsed,
|
|
530
691
|
sourceArg,
|
|
531
692
|
workspace,
|
|
@@ -534,242 +695,370 @@ async function localRun(argv, io, runtimeOptions) {
|
|
|
534
695
|
io,
|
|
535
696
|
runtimeOptions,
|
|
536
697
|
});
|
|
537
|
-
let
|
|
698
|
+
let snapshot = await loadLocalArchive(workspace);
|
|
699
|
+
if (parsed.flags.rerun !== true) {
|
|
700
|
+
const reusableRun = findReusableLocalImproveRun(snapshot.runs, {
|
|
701
|
+
benchmarkFingerprint,
|
|
702
|
+
candidateId: baseCandidate.id,
|
|
703
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
704
|
+
executionFingerprint,
|
|
705
|
+
budget,
|
|
706
|
+
samples,
|
|
707
|
+
});
|
|
708
|
+
if (reusableRun) {
|
|
709
|
+
const evaluation = snapshot.evaluations.find((entry) => entry.runId === reusableRun.id) ?? null;
|
|
710
|
+
const outputCandidateId = reusableRun.outputCandidateId ?? reusableRun.candidateId ?? baseCandidate.id;
|
|
711
|
+
const outputCandidate = readLocalCandidate(snapshot, outputCandidateId);
|
|
712
|
+
const activeCandidate = snapshot.activeId
|
|
713
|
+
? readLocalCandidate(snapshot, snapshot.activeId)
|
|
714
|
+
: null;
|
|
715
|
+
const result = {
|
|
716
|
+
ok: true,
|
|
717
|
+
reused: true,
|
|
718
|
+
runId: reusableRun.id,
|
|
719
|
+
evaluationId: evaluation?.id ?? null,
|
|
720
|
+
outputCandidateId,
|
|
721
|
+
outputCandidate,
|
|
722
|
+
activeCandidateId: snapshot.activeId,
|
|
723
|
+
activeCandidate,
|
|
724
|
+
completedJobCount: 0,
|
|
725
|
+
failedJobCount: 0,
|
|
726
|
+
localView: localDevViewHint(workspace, reusableRun.id),
|
|
727
|
+
};
|
|
728
|
+
writeOutput(result, parsed, io, () => `Reused improve run ${reusableRun.id}. Use --rerun to intentionally run it again.`);
|
|
729
|
+
return 0;
|
|
730
|
+
}
|
|
731
|
+
}
|
|
732
|
+
const runId = `run_local_${Date.now().toString(36)}`;
|
|
733
|
+
const startedAt = new Date().toISOString();
|
|
734
|
+
let currentBaseId = baseCandidate.id;
|
|
735
|
+
let outputCandidateId = null;
|
|
538
736
|
let completedJobCount = 0;
|
|
539
737
|
let failedJobCount = 0;
|
|
738
|
+
let attemptsExecuted = 0;
|
|
540
739
|
const failedJobs = [];
|
|
541
740
|
const events = [
|
|
542
741
|
createLocalEvent("run_started", startedAt, {
|
|
543
742
|
runId,
|
|
544
|
-
detail: { budget, samples, strategy: "greedy" },
|
|
743
|
+
detail: { budget, samples, strategy: "greedy", optimizeOn: optimizeOnLabel, selectBy: selectByLabel },
|
|
545
744
|
}),
|
|
546
745
|
];
|
|
547
|
-
const
|
|
548
|
-
|
|
549
|
-
|
|
550
|
-
|
|
551
|
-
|
|
552
|
-
|
|
553
|
-
|
|
554
|
-
|
|
555
|
-
|
|
556
|
-
|
|
557
|
-
|
|
558
|
-
|
|
559
|
-
|
|
560
|
-
|
|
561
|
-
|
|
562
|
-
|
|
563
|
-
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
|
|
567
|
-
|
|
568
|
-
|
|
569
|
-
|
|
570
|
-
|
|
571
|
-
|
|
572
|
-
|
|
573
|
-
|
|
574
|
-
|
|
575
|
-
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
baseFiles,
|
|
580
|
-
traceFiles: subjectRevisionTraceFiles,
|
|
581
|
-
...(environmentRefs.defaultRef ? { environmentRef: environmentRefs.defaultRef } : {}),
|
|
582
|
-
baseId: activeSubject.id,
|
|
583
|
-
})[0];
|
|
584
|
-
const subjectRevisionJobs = await executeLocalDevelopmentDag({
|
|
585
|
-
jobs: [plannedSubjectRevision],
|
|
586
|
-
spec,
|
|
587
|
-
adapterManifests,
|
|
588
|
-
adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
|
|
589
|
-
baseFiles,
|
|
590
|
-
engineResolveFiles,
|
|
591
|
-
engineCases,
|
|
592
|
-
traceFiles: subjectRevisionTraceFiles,
|
|
593
|
-
capacity: devCapacity,
|
|
746
|
+
const runningRun = {
|
|
747
|
+
id: runId,
|
|
748
|
+
workflow: "improve",
|
|
749
|
+
benchmarkFingerprint,
|
|
750
|
+
status: "running",
|
|
751
|
+
candidateId: baseCandidate.id,
|
|
752
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
753
|
+
candidateRunName: projectSource.spec.candidate.selectedRunName,
|
|
754
|
+
startedAt,
|
|
755
|
+
improver: formatSpecImprover(spec),
|
|
756
|
+
engineRun: spec.engineRun.use,
|
|
757
|
+
strategy: "greedy",
|
|
758
|
+
optimizeOn: optimizeOnLabel,
|
|
759
|
+
selectBy: selectByLabel,
|
|
760
|
+
budget,
|
|
761
|
+
repairBudget: 0,
|
|
762
|
+
attemptsRequested: budget,
|
|
763
|
+
attemptsExecuted: 0,
|
|
764
|
+
samples,
|
|
765
|
+
executionFingerprint,
|
|
766
|
+
activeCandidateId: snapshot.activeId,
|
|
767
|
+
outputCandidateId: null,
|
|
768
|
+
};
|
|
769
|
+
snapshot = upsertLocalRun(snapshot, runningRun, events);
|
|
770
|
+
await saveLocalArchive(workspace, snapshot);
|
|
771
|
+
try {
|
|
772
|
+
const devCapacity = await localDevelopmentCapacity(workspace);
|
|
773
|
+
const baselineTraceJobs = selectLocalOptimizerBaselineTraceJobs(snapshot, await readLocalJobs(workspace), {
|
|
774
|
+
benchmarkFingerprint,
|
|
775
|
+
candidateId: baseCandidate.id,
|
|
776
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
777
|
+
executionFingerprint,
|
|
594
778
|
});
|
|
595
|
-
const
|
|
596
|
-
const
|
|
597
|
-
|
|
598
|
-
|
|
599
|
-
|
|
600
|
-
|
|
601
|
-
|
|
779
|
+
const runTraceJobs = [];
|
|
780
|
+
const attempts = budget;
|
|
781
|
+
for (let attemptIndex = 0; attemptIndex < attempts; attemptIndex += 1) {
|
|
782
|
+
snapshot = await loadLocalArchive(workspace);
|
|
783
|
+
const activeCandidate = readLocalCandidate(snapshot, currentBaseId);
|
|
784
|
+
const baseFiles = filterCandidateSourceFiles(readLocalCandidateFiles(snapshot, activeCandidate.id));
|
|
785
|
+
if (baseFiles.length === 0) {
|
|
786
|
+
throw new UsageError("Candidate snapshot must include at least one file.");
|
|
787
|
+
}
|
|
788
|
+
const candidateRevisionTraceFiles = createOptimizerTraceInputFiles({
|
|
789
|
+
jobs: filterOptimizerTraceJobsForCaseIds([...baselineTraceJobs, ...runTraceJobs], optimizeCaseIds),
|
|
790
|
+
});
|
|
791
|
+
const candidateId = `candidate_${runId.replace(/^run_/u, "")}_${String(attemptIndex + 1).padStart(3, "0")}`;
|
|
792
|
+
const plannedCandidateRevision = planWorkbenchExecutionJobsForPurpose({
|
|
602
793
|
ownerUserId: "local",
|
|
603
794
|
projectId: "local",
|
|
604
795
|
runId,
|
|
605
|
-
|
|
796
|
+
candidateId,
|
|
606
797
|
attemptIndex,
|
|
607
798
|
samples,
|
|
608
|
-
|
|
609
|
-
caseIds,
|
|
799
|
+
caseIds: optimizeCaseIds,
|
|
610
800
|
engineCases,
|
|
611
801
|
spec,
|
|
612
|
-
environmentRefsByCase: environmentRefs.byCase,
|
|
613
802
|
workflow: "improve",
|
|
614
|
-
purpose: "
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
|
|
803
|
+
purpose: "improve",
|
|
804
|
+
now: new Date().toISOString(),
|
|
805
|
+
baseFiles,
|
|
806
|
+
traceFiles: candidateRevisionTraceFiles,
|
|
807
|
+
...(environmentRefs.defaultRef ? { environmentRef: environmentRefs.defaultRef } : {}),
|
|
808
|
+
baseId: activeCandidate.id,
|
|
809
|
+
})[0];
|
|
810
|
+
const candidateRevisionJobs = await executeLocalDevelopmentDag({
|
|
811
|
+
jobs: [plannedCandidateRevision],
|
|
618
812
|
spec,
|
|
619
813
|
adapterManifests,
|
|
620
814
|
adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
|
|
621
|
-
baseFiles
|
|
815
|
+
baseFiles,
|
|
622
816
|
engineResolveFiles,
|
|
623
817
|
engineCases,
|
|
818
|
+
traceFiles: candidateRevisionTraceFiles,
|
|
624
819
|
capacity: devCapacity,
|
|
625
820
|
});
|
|
626
|
-
|
|
627
|
-
|
|
628
|
-
|
|
629
|
-
|
|
630
|
-
|
|
631
|
-
|
|
632
|
-
|
|
633
|
-
|
|
634
|
-
|
|
635
|
-
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
|
|
639
|
-
|
|
640
|
-
|
|
641
|
-
|
|
642
|
-
|
|
821
|
+
const candidateRevision = candidateRevisionJobs[0];
|
|
822
|
+
const completedJobs = [candidateRevision];
|
|
823
|
+
if (candidateRevision.status === "succeeded") {
|
|
824
|
+
const candidateRevisionFiles = completedJobOutputFiles(candidateRevision).length > 0
|
|
825
|
+
? normalizeSurfaceFiles(completedJobOutputFiles(candidateRevision).filter((file) => !file.path.startsWith(".workbench/")))
|
|
826
|
+
: baseFiles;
|
|
827
|
+
const attemptJobs = planWorkbenchExecutionJobsForPurpose({
|
|
828
|
+
ownerUserId: "local",
|
|
829
|
+
projectId: "local",
|
|
830
|
+
runId,
|
|
831
|
+
candidateId,
|
|
832
|
+
attemptIndex,
|
|
833
|
+
samples,
|
|
834
|
+
now: new Date().toISOString(),
|
|
835
|
+
caseIds: evaluationCaseIds,
|
|
836
|
+
engineCases,
|
|
837
|
+
spec,
|
|
838
|
+
environmentRefsByCase: environmentRefs.byCase,
|
|
839
|
+
workflow: "improve",
|
|
840
|
+
purpose: "attempt",
|
|
841
|
+
});
|
|
842
|
+
const dagJobs = await executeLocalDevelopmentDag({
|
|
843
|
+
jobs: [candidateRevision, ...attemptJobs],
|
|
844
|
+
spec,
|
|
845
|
+
adapterManifests,
|
|
846
|
+
adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
|
|
847
|
+
baseFiles: candidateRevisionFiles,
|
|
848
|
+
engineResolveFiles,
|
|
849
|
+
engineCases,
|
|
850
|
+
capacity: devCapacity,
|
|
851
|
+
});
|
|
852
|
+
completedJobs.splice(0, completedJobs.length, ...dagJobs);
|
|
853
|
+
}
|
|
854
|
+
runTraceJobs.push(...completedJobs);
|
|
855
|
+
const materialized = materializeWorkbenchRunResult({
|
|
643
856
|
runId,
|
|
644
|
-
|
|
645
|
-
|
|
646
|
-
|
|
647
|
-
|
|
857
|
+
benchmarkFingerprint,
|
|
858
|
+
sourceYaml: projectSource.specSource,
|
|
859
|
+
benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
|
|
860
|
+
startedAt,
|
|
861
|
+
spec,
|
|
862
|
+
jobs: completedJobs,
|
|
863
|
+
previousCandidate: activeCandidate,
|
|
864
|
+
existingCandidateCount: snapshot.candidates.length,
|
|
865
|
+
selection: {
|
|
866
|
+
metric: selectionPolicy.metric,
|
|
867
|
+
...(selectionScoreCaseIds ? { caseIds: selectionScoreCaseIds } : {}),
|
|
868
|
+
label: selectByLabel,
|
|
869
|
+
},
|
|
870
|
+
});
|
|
871
|
+
for (const candidate of materialized.candidates) {
|
|
872
|
+
outputCandidateId = candidate.id;
|
|
873
|
+
snapshot = upsertLocalCandidate(snapshot, candidate, materialized.candidateFiles[candidate.id] ?? []);
|
|
874
|
+
events.push(createLocalEvent("candidate_created", candidate.createdAt, {
|
|
875
|
+
runId,
|
|
876
|
+
candidateId: candidate.id,
|
|
877
|
+
baseId: candidate.baseId,
|
|
878
|
+
status: candidate.status,
|
|
879
|
+
metrics: evaluationMeanMetrics(candidate.eval),
|
|
880
|
+
}));
|
|
881
|
+
}
|
|
882
|
+
for (const evaluation of materialized.evaluations) {
|
|
883
|
+
snapshot = upsertLocalEvaluation(snapshot, evaluation);
|
|
884
|
+
}
|
|
885
|
+
snapshot = setLocalActive(snapshot, materialized.activeCandidateId);
|
|
886
|
+
currentBaseId = materialized.activeCandidateId ?? currentBaseId;
|
|
887
|
+
completedJobCount += materialized.completedJobCount;
|
|
888
|
+
failedJobCount += materialized.failedJobCount;
|
|
889
|
+
failedJobs.push(...completedJobs
|
|
890
|
+
.filter((job) => job.status === "failed")
|
|
891
|
+
.map((job) => ({
|
|
892
|
+
id: job.id,
|
|
893
|
+
purpose: workbenchExecutionPurpose(job),
|
|
894
|
+
error: job.error ?? "Job failed without an error message.",
|
|
895
|
+
})));
|
|
896
|
+
events.push(createLocalEvent("active_changed", new Date().toISOString(), {
|
|
897
|
+
runId,
|
|
898
|
+
candidateId: materialized.activeCandidateId ?? undefined,
|
|
899
|
+
activeId: materialized.activeCandidateId ?? undefined,
|
|
900
|
+
status: materialized.selectedCandidate?.status,
|
|
901
|
+
metrics: evaluationMeanMetrics(materialized.selectedCandidate?.eval),
|
|
648
902
|
}));
|
|
903
|
+
await saveLocalJobs(workspace, completedJobs);
|
|
904
|
+
await saveLocalArchive(workspace, snapshot);
|
|
905
|
+
attemptsExecuted += 1;
|
|
649
906
|
}
|
|
650
|
-
|
|
651
|
-
|
|
652
|
-
|
|
653
|
-
|
|
654
|
-
|
|
655
|
-
|
|
656
|
-
|
|
657
|
-
|
|
658
|
-
|
|
659
|
-
.
|
|
660
|
-
|
|
661
|
-
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
|
|
907
|
+
snapshot = await loadLocalArchive(workspace);
|
|
908
|
+
const finishedAt = new Date().toISOString();
|
|
909
|
+
const run = {
|
|
910
|
+
id: runId,
|
|
911
|
+
workflow: "improve",
|
|
912
|
+
benchmarkFingerprint,
|
|
913
|
+
status: "finished",
|
|
914
|
+
candidateId: baseCandidate.id,
|
|
915
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
916
|
+
candidateRunName: projectSource.spec.candidate.selectedRunName,
|
|
917
|
+
startedAt,
|
|
918
|
+
finishedAt,
|
|
919
|
+
durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
|
|
920
|
+
improver: formatSpecImprover(spec),
|
|
921
|
+
engineRun: spec.engineRun.use,
|
|
922
|
+
strategy: "greedy",
|
|
923
|
+
optimizeOn: optimizeOnLabel,
|
|
924
|
+
selectBy: selectByLabel,
|
|
925
|
+
budget,
|
|
926
|
+
repairBudget: 0,
|
|
927
|
+
attemptsRequested: budget,
|
|
928
|
+
attemptsExecuted,
|
|
929
|
+
samples,
|
|
930
|
+
executionFingerprint,
|
|
931
|
+
stoppedReason: "budget_exhausted",
|
|
932
|
+
outcome: failedJobCount > 0 ? "error" : "ok",
|
|
933
|
+
activeCandidateId: snapshot.activeId,
|
|
934
|
+
outputCandidateId: outputCandidateId ?? snapshot.activeId,
|
|
935
|
+
};
|
|
936
|
+
events.push(createLocalEvent("run_finished", finishedAt, {
|
|
665
937
|
runId,
|
|
666
|
-
|
|
667
|
-
|
|
668
|
-
|
|
669
|
-
|
|
938
|
+
detail: {
|
|
939
|
+
outcome: run.outcome ?? null,
|
|
940
|
+
attemptsExecuted: run.attemptsExecuted,
|
|
941
|
+
durationMs: run.durationMs ?? null,
|
|
942
|
+
},
|
|
670
943
|
}));
|
|
671
|
-
|
|
944
|
+
snapshot = upsertLocalRun(snapshot, run, events.slice(1));
|
|
672
945
|
await saveLocalArchive(workspace, snapshot);
|
|
946
|
+
const outputCandidate = run.outputCandidateId
|
|
947
|
+
? readLocalCandidate(snapshot, run.outputCandidateId)
|
|
948
|
+
: null;
|
|
949
|
+
const activeCandidate = snapshot.activeId
|
|
950
|
+
? readLocalCandidate(snapshot, snapshot.activeId)
|
|
951
|
+
: null;
|
|
952
|
+
const result = {
|
|
953
|
+
ok: failedJobCount === 0,
|
|
954
|
+
runId,
|
|
955
|
+
outputCandidateId: run.outputCandidateId,
|
|
956
|
+
outputCandidate,
|
|
957
|
+
activeCandidateId: snapshot.activeId,
|
|
958
|
+
activeCandidate,
|
|
959
|
+
completedJobCount,
|
|
960
|
+
failedJobCount,
|
|
961
|
+
failedJobs,
|
|
962
|
+
localView: localDevViewHint(workspace, runId),
|
|
963
|
+
};
|
|
964
|
+
writeOutput(result, parsed, io, () => {
|
|
965
|
+
const outputMetricValue = outputCandidate ? formatCandidateEvaluationScore(outputCandidate) : "n/a";
|
|
966
|
+
const activeMetricValue = activeCandidate ? formatCandidateEvaluationScore(activeCandidate) : "n/a";
|
|
967
|
+
const firstFailure = result.failedJobs[0];
|
|
968
|
+
const failureDetail = firstFailure
|
|
969
|
+
? `\nFirst failed job ${firstFailure.id}${firstFailure.purpose ? ` (${firstFailure.purpose})` : ""}: ${firstFailure.error}`
|
|
970
|
+
: "";
|
|
971
|
+
const viewDetail = failedJobCount === 0
|
|
972
|
+
? `\nOpen local view: ${result.localView.command}\n${result.localView.note}`
|
|
973
|
+
: "";
|
|
974
|
+
return `Run ${runId} finished. Output candidate: ${formatLocalCandidateLabel(outputCandidate)} (score: ${outputMetricValue}). Active candidate: ${formatLocalCandidateLabel(activeCandidate)} (score: ${activeMetricValue}).${failureDetail}${viewDetail}`;
|
|
975
|
+
});
|
|
976
|
+
return failedJobCount === 0 ? 0 : 1;
|
|
977
|
+
}
|
|
978
|
+
catch (error) {
|
|
979
|
+
await markLocalRunFailed({
|
|
980
|
+
workspace,
|
|
981
|
+
run: {
|
|
982
|
+
...runningRun,
|
|
983
|
+
attemptsExecuted,
|
|
984
|
+
outputCandidateId,
|
|
985
|
+
},
|
|
986
|
+
startedAt,
|
|
987
|
+
error,
|
|
988
|
+
}).catch(() => undefined);
|
|
989
|
+
throw error;
|
|
673
990
|
}
|
|
674
|
-
snapshot = await loadLocalArchive(workspace);
|
|
675
|
-
const finishedAt = new Date().toISOString();
|
|
676
|
-
const run = {
|
|
677
|
-
id: runId,
|
|
678
|
-
workflow: "improve",
|
|
679
|
-
benchmarkFingerprint,
|
|
680
|
-
status: "finished",
|
|
681
|
-
startedAt,
|
|
682
|
-
finishedAt,
|
|
683
|
-
durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
|
|
684
|
-
optimizer: formatSpecOptimizer(spec),
|
|
685
|
-
engineRun: spec.engineRun.use,
|
|
686
|
-
strategy: "greedy",
|
|
687
|
-
budget,
|
|
688
|
-
repairBudget: 0,
|
|
689
|
-
attemptsRequested: budget,
|
|
690
|
-
attemptsExecuted: budget,
|
|
691
|
-
samples,
|
|
692
|
-
stoppedReason: "budget_exhausted",
|
|
693
|
-
outcome: failedJobCount > 0 ? "error" : "ok",
|
|
694
|
-
};
|
|
695
|
-
events.push(createLocalEvent("run_finished", finishedAt, {
|
|
696
|
-
runId,
|
|
697
|
-
detail: {
|
|
698
|
-
outcome: run.outcome ?? null,
|
|
699
|
-
attemptsExecuted: run.attemptsExecuted,
|
|
700
|
-
durationMs: run.durationMs ?? null,
|
|
701
|
-
},
|
|
702
|
-
}));
|
|
703
|
-
snapshot = appendLocalRun(snapshot, run, events);
|
|
704
|
-
await saveLocalArchive(workspace, snapshot);
|
|
705
|
-
const selected = snapshot.activeId
|
|
706
|
-
? readLocalSubject(snapshot, snapshot.activeId)
|
|
707
|
-
: null;
|
|
708
|
-
const result = {
|
|
709
|
-
ok: failedJobCount === 0,
|
|
710
|
-
runId,
|
|
711
|
-
activeSubjectId: snapshot.activeId,
|
|
712
|
-
selectedSubject: selected,
|
|
713
|
-
completedJobCount,
|
|
714
|
-
failedJobCount,
|
|
715
|
-
failedJobs,
|
|
716
|
-
localView: localDevViewHint(workspace, runId),
|
|
717
|
-
};
|
|
718
|
-
writeOutput(result, parsed, io, () => {
|
|
719
|
-
const metricValue = selected?.metrics?.score ?? "n/a";
|
|
720
|
-
const firstFailure = result.failedJobs[0];
|
|
721
|
-
const failureDetail = firstFailure
|
|
722
|
-
? `\nFirst failed job ${firstFailure.id}${firstFailure.purpose ? ` (${firstFailure.purpose})` : ""}: ${firstFailure.error}`
|
|
723
|
-
: "";
|
|
724
|
-
const viewDetail = failedJobCount === 0
|
|
725
|
-
? `\nOpen local view: ${result.localView.command}\n${result.localView.note}`
|
|
726
|
-
: "";
|
|
727
|
-
return `Run ${runId} finished. Active subject: ${snapshot.activeId ?? "none"} (score: ${metricValue}).${failureDetail}${viewDetail}`;
|
|
728
|
-
});
|
|
729
|
-
return failedJobCount === 0 ? 0 : 1;
|
|
730
991
|
}
|
|
731
|
-
async function
|
|
992
|
+
async function ensureLocalImproveBaseCandidate(args) {
|
|
732
993
|
let snapshot = await loadLocalArchive(args.workspace);
|
|
733
994
|
const explicitBase = asOptionalString(args.parsed.flags.from);
|
|
734
995
|
const benchmarkFingerprint = await readLocalBenchmarkFingerprint(args.workspace);
|
|
735
996
|
if (explicitBase) {
|
|
736
|
-
let
|
|
737
|
-
if (
|
|
738
|
-
throw new UsageError(`Base
|
|
997
|
+
let candidate = readLocalCandidate(snapshot, explicitBase);
|
|
998
|
+
if (candidate.benchmarkFingerprint !== benchmarkFingerprint) {
|
|
999
|
+
throw new UsageError(`Base candidate ${explicitBase} belongs to benchmark ${candidate.benchmarkFingerprint}, not ${benchmarkFingerprint}.`);
|
|
739
1000
|
}
|
|
740
|
-
if (!
|
|
741
|
-
throw new UsageError(`Base
|
|
1001
|
+
if (!candidate.candidateFingerprint) {
|
|
1002
|
+
throw new UsageError(`Base candidate ${explicitBase} is missing a candidate fingerprint.`);
|
|
742
1003
|
}
|
|
743
|
-
if (
|
|
744
|
-
const code = await
|
|
1004
|
+
if (candidate.status !== "evaluated" && !candidate.eval) {
|
|
1005
|
+
const code = await localEvaluateCandidate([
|
|
1006
|
+
"--dir",
|
|
1007
|
+
args.workspace,
|
|
1008
|
+
"--candidate",
|
|
1009
|
+
explicitBase,
|
|
1010
|
+
"--runs",
|
|
1011
|
+
args.projectSource.spec.candidate.selectedRunId,
|
|
1012
|
+
"--samples",
|
|
1013
|
+
String(args.samples),
|
|
1014
|
+
...(args.parsed.flags.rerun === true ? ["--rerun"] : []),
|
|
1015
|
+
"--json",
|
|
1016
|
+
], createSilentIo(args.io), args.runtimeOptions);
|
|
745
1017
|
if (code !== 0) {
|
|
746
|
-
throw new UsageError(`Base
|
|
1018
|
+
throw new UsageError(`Base candidate ${explicitBase} eval failed; improve was not started.`);
|
|
747
1019
|
}
|
|
748
1020
|
snapshot = await loadLocalArchive(args.workspace);
|
|
749
|
-
|
|
1021
|
+
candidate = readLocalCandidate(snapshot, explicitBase);
|
|
750
1022
|
}
|
|
751
|
-
return
|
|
1023
|
+
return candidate;
|
|
752
1024
|
}
|
|
753
|
-
const
|
|
754
|
-
const existing = snapshot.
|
|
755
|
-
|
|
756
|
-
(
|
|
1025
|
+
const candidateFingerprint = localCandidateFingerprint(args.projectSource);
|
|
1026
|
+
const existing = snapshot.candidates.find((candidate) => candidate.benchmarkFingerprint === benchmarkFingerprint &&
|
|
1027
|
+
candidate.candidateFingerprint === candidateFingerprint &&
|
|
1028
|
+
(candidate.status === "evaluated" || Boolean(candidate.eval)));
|
|
757
1029
|
if (existing) {
|
|
758
1030
|
return existing;
|
|
759
1031
|
}
|
|
760
1032
|
const evalArgs = args.parsed.positionals.length > 0
|
|
761
|
-
? [
|
|
762
|
-
|
|
763
|
-
|
|
1033
|
+
? [
|
|
1034
|
+
args.sourceArg,
|
|
1035
|
+
"--runs",
|
|
1036
|
+
args.projectSource.spec.candidate.selectedRunId,
|
|
1037
|
+
"--samples",
|
|
1038
|
+
String(args.samples),
|
|
1039
|
+
...(args.parsed.flags.rerun === true ? ["--rerun"] : []),
|
|
1040
|
+
"--json",
|
|
1041
|
+
]
|
|
1042
|
+
: [
|
|
1043
|
+
"--dir",
|
|
1044
|
+
args.workspace,
|
|
1045
|
+
"--runs",
|
|
1046
|
+
args.projectSource.spec.candidate.selectedRunId,
|
|
1047
|
+
"--samples",
|
|
1048
|
+
String(args.samples),
|
|
1049
|
+
...(args.parsed.flags.rerun === true ? ["--rerun"] : []),
|
|
1050
|
+
"--json",
|
|
1051
|
+
];
|
|
1052
|
+
const code = await localEvaluateCandidate(evalArgs, createSilentIo(args.io), args.runtimeOptions);
|
|
764
1053
|
if (code !== 0) {
|
|
765
|
-
throw new UsageError("Parent
|
|
1054
|
+
throw new UsageError("Parent candidate eval failed; improve was not started.");
|
|
766
1055
|
}
|
|
767
1056
|
snapshot = await loadLocalArchive(args.workspace);
|
|
768
|
-
const evaluated = snapshot.
|
|
769
|
-
|
|
770
|
-
(
|
|
1057
|
+
const evaluated = snapshot.candidates.find((candidate) => candidate.benchmarkFingerprint === benchmarkFingerprint &&
|
|
1058
|
+
candidate.candidateFingerprint === candidateFingerprint &&
|
|
1059
|
+
(candidate.status === "evaluated" || Boolean(candidate.eval)));
|
|
771
1060
|
if (!evaluated) {
|
|
772
|
-
throw new UsageError("Parent
|
|
1061
|
+
throw new UsageError("Parent candidate eval did not produce an evaluated candidate.");
|
|
773
1062
|
}
|
|
774
1063
|
return evaluated;
|
|
775
1064
|
}
|
|
@@ -785,13 +1074,62 @@ function createSilentIo(io) {
|
|
|
785
1074
|
stderr: io.stderr,
|
|
786
1075
|
};
|
|
787
1076
|
}
|
|
788
|
-
|
|
1077
|
+
function selectLocalOptimizerBaselineTraceJobs(snapshot, jobs, target) {
|
|
1078
|
+
const runById = new Map(snapshot.runs.map((run) => [run.id, run]));
|
|
1079
|
+
const evaluation = snapshot.evaluations
|
|
1080
|
+
.filter((entry) => {
|
|
1081
|
+
const run = runById.get(entry.runId);
|
|
1082
|
+
return entry.benchmarkFingerprint === target.benchmarkFingerprint &&
|
|
1083
|
+
entry.candidateId === target.candidateId &&
|
|
1084
|
+
entry.candidateRunId === target.candidateRunId &&
|
|
1085
|
+
run?.executionFingerprint === target.executionFingerprint;
|
|
1086
|
+
})
|
|
1087
|
+
.sort((left, right) => right.updatedAt.localeCompare(left.updatedAt) ||
|
|
1088
|
+
right.runId.localeCompare(left.runId))[0] ?? null;
|
|
1089
|
+
if (!evaluation) {
|
|
1090
|
+
return [];
|
|
1091
|
+
}
|
|
1092
|
+
return jobs.filter((job) => job.runId === evaluation.runId);
|
|
1093
|
+
}
|
|
1094
|
+
async function localEvaluateCandidate(argv, io, runtimeOptions) {
|
|
789
1095
|
void runtimeOptions;
|
|
790
1096
|
const parsed = parseArgs(argv);
|
|
791
|
-
rejectUnknownFlags(parsed, new Set(["dir", "
|
|
1097
|
+
rejectUnknownFlags(parsed, new Set(["dir", "candidate", "runs", "samples", "rerun", "json"]));
|
|
792
1098
|
const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
|
|
793
1099
|
const sourceArg = resolveSourceDir(parsed);
|
|
794
|
-
const
|
|
1100
|
+
const runsFlag = asOptionalString(parsed.flags.runs);
|
|
1101
|
+
const defaultProjectSource = await readLocalProjectSource(sourceArg);
|
|
1102
|
+
const selectedRunIds = resolveCandidateRunSelection(defaultProjectSource, runsFlag);
|
|
1103
|
+
if (selectedRunIds.length > 1) {
|
|
1104
|
+
let failed = 0;
|
|
1105
|
+
for (const runId of selectedRunIds) {
|
|
1106
|
+
const args = [
|
|
1107
|
+
"--dir",
|
|
1108
|
+
defaultProjectSource.dir,
|
|
1109
|
+
"--runs",
|
|
1110
|
+
runId,
|
|
1111
|
+
"--samples",
|
|
1112
|
+
String(samples),
|
|
1113
|
+
...(readOptionalCandidateFlag(parsed) ? ["--candidate", readOptionalCandidateFlag(parsed)] : []),
|
|
1114
|
+
...(parsed.flags.rerun === true ? ["--rerun"] : []),
|
|
1115
|
+
"--json",
|
|
1116
|
+
];
|
|
1117
|
+
const code = await localEvaluateCandidate(args, createSilentIo(io), runtimeOptions);
|
|
1118
|
+
if (code !== 0) {
|
|
1119
|
+
failed += 1;
|
|
1120
|
+
}
|
|
1121
|
+
}
|
|
1122
|
+
writeOutput({
|
|
1123
|
+
ok: failed === 0,
|
|
1124
|
+
candidateId: defaultProjectSource.candidateName,
|
|
1125
|
+
candidateRunIds: selectedRunIds,
|
|
1126
|
+
failedRunCount: failed,
|
|
1127
|
+
}, parsed, io, () => `Evaluated ${selectedRunIds.length} candidate run(s); ${failed} failed.`);
|
|
1128
|
+
return failed === 0 ? 0 : 1;
|
|
1129
|
+
}
|
|
1130
|
+
const projectSource = selectedRunIds[0] === defaultProjectSource.candidateRunId
|
|
1131
|
+
? defaultProjectSource
|
|
1132
|
+
: await readLocalProjectSource(sourceArg, { runId: selectedRunIds[0] });
|
|
795
1133
|
const workspace = projectSource.dir;
|
|
796
1134
|
const executionProject = await resolveLocalProjectForExecution(workspace, projectSource.specSource);
|
|
797
1135
|
const { spec, adapterManifests } = executionProject;
|
|
@@ -810,114 +1148,367 @@ async function localEvaluateSubject(argv, io, runtimeOptions) {
|
|
|
810
1148
|
const environmentRefs = await ensureLocalDockerfileEnvironments(workspace, spec, engineCases);
|
|
811
1149
|
let snapshot = await loadLocalArchive(workspace);
|
|
812
1150
|
const benchmarkFingerprint = await readLocalBenchmarkFingerprint(workspace);
|
|
813
|
-
const
|
|
814
|
-
const
|
|
815
|
-
const
|
|
816
|
-
|
|
817
|
-
|
|
818
|
-
const
|
|
819
|
-
const
|
|
820
|
-
|
|
821
|
-
|
|
1151
|
+
const executionFingerprint = localRunExecutionFingerprint(projectSource);
|
|
1152
|
+
const sourceCandidateFingerprint = localCandidateFingerprint(projectSource);
|
|
1153
|
+
const explicitCandidateId = readOptionalCandidateFlag(parsed);
|
|
1154
|
+
const existingSourceCandidate = snapshot.candidates.find((candidate) => candidate.benchmarkFingerprint === benchmarkFingerprint &&
|
|
1155
|
+
candidate.candidateFingerprint === sourceCandidateFingerprint);
|
|
1156
|
+
const candidateId = explicitCandidateId ?? existingSourceCandidate?.id ?? `candidate_${sourceCandidateFingerprint.slice(0, 12)}`;
|
|
1157
|
+
const existingCandidate = snapshot.candidates.find((candidate) => candidate.id === candidateId);
|
|
1158
|
+
const activeCandidateIdBeforeEval = snapshot.activeId;
|
|
1159
|
+
const selectedCandidateRunId = projectSource.spec.candidate.selectedRunId;
|
|
1160
|
+
const files = filterCandidateSourceFiles(existingCandidate
|
|
1161
|
+
? readLocalCandidateFiles(snapshot, candidateId)
|
|
1162
|
+
: normalizeSurfaceFiles(projectSource.candidateFiles));
|
|
1163
|
+
const evaluationWork = parsed.flags.rerun !== true
|
|
1164
|
+
? await resolveLocalEvaluationWork(workspace, snapshot, {
|
|
1165
|
+
benchmarkFingerprint,
|
|
1166
|
+
candidateId,
|
|
1167
|
+
candidateFingerprint: existingCandidate?.candidateFingerprint ?? sourceCandidateFingerprint,
|
|
1168
|
+
candidateRunId: selectedCandidateRunId,
|
|
1169
|
+
executionFingerprint,
|
|
1170
|
+
samples,
|
|
1171
|
+
caseIds,
|
|
1172
|
+
})
|
|
1173
|
+
: null;
|
|
1174
|
+
const reusableEvaluation = evaluationWork?.reusableEvaluation ?? null;
|
|
1175
|
+
if (reusableEvaluation) {
|
|
1176
|
+
const result = {
|
|
1177
|
+
ok: true,
|
|
1178
|
+
reused: true,
|
|
1179
|
+
runId: reusableEvaluation.runId,
|
|
1180
|
+
evaluation: reusableEvaluation,
|
|
1181
|
+
evaluationId: reusableEvaluation.id,
|
|
1182
|
+
candidateId,
|
|
1183
|
+
completedJobCount: 0,
|
|
1184
|
+
failedJobCount: 0,
|
|
1185
|
+
localView: localDevViewHint(workspace, reusableEvaluation.runId),
|
|
1186
|
+
};
|
|
1187
|
+
writeOutput(result, parsed, io, () => `Reused evaluation ${reusableEvaluation.id}. Use --rerun to intentionally run it again.`);
|
|
1188
|
+
return 0;
|
|
1189
|
+
}
|
|
1190
|
+
const selectedPairs = evaluationWork?.missingPairs.length
|
|
1191
|
+
? evaluationWork.missingPairs
|
|
1192
|
+
: allCaseSamplePairs(caseIds, samples);
|
|
822
1193
|
const runId = `eval_local_${Date.now().toString(36)}`;
|
|
823
|
-
const
|
|
1194
|
+
const evaluatedCandidateId = candidateId;
|
|
824
1195
|
const startedAt = new Date().toISOString();
|
|
825
|
-
const
|
|
826
|
-
ownerUserId: "local",
|
|
827
|
-
projectId: "local",
|
|
828
|
-
runId,
|
|
829
|
-
subjectId: evaluatedSubjectId,
|
|
830
|
-
attemptIndex: 0,
|
|
831
|
-
files,
|
|
832
|
-
now: startedAt,
|
|
833
|
-
baseId: null,
|
|
834
|
-
});
|
|
835
|
-
const completedJobs = [baseline];
|
|
836
|
-
const attemptJobs = planWorkbenchExecutionJobsForPurpose({
|
|
837
|
-
ownerUserId: "local",
|
|
838
|
-
projectId: "local",
|
|
839
|
-
runId,
|
|
840
|
-
subjectId: evaluatedSubjectId,
|
|
841
|
-
attemptIndex: 0,
|
|
842
|
-
samples,
|
|
843
|
-
now: startedAt,
|
|
844
|
-
caseIds,
|
|
845
|
-
engineCases,
|
|
846
|
-
spec,
|
|
847
|
-
environmentRefsByCase: environmentRefs.byCase,
|
|
848
|
-
workflow: "eval",
|
|
849
|
-
purpose: "attempt",
|
|
850
|
-
});
|
|
851
|
-
const dagJobs = await executeLocalDevelopmentDag({
|
|
852
|
-
jobs: [baseline, ...attemptJobs],
|
|
853
|
-
spec,
|
|
854
|
-
adapterManifests,
|
|
855
|
-
adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
|
|
856
|
-
baseFiles: files,
|
|
857
|
-
engineResolveFiles,
|
|
858
|
-
engineCases,
|
|
859
|
-
capacity: await localDevelopmentCapacity(workspace),
|
|
860
|
-
});
|
|
861
|
-
completedJobs.splice(0, completedJobs.length, ...dagJobs);
|
|
862
|
-
const materialized = materializeWorkbenchRunResult({
|
|
1196
|
+
const runStartedEvent = createLocalEvent("run_started", startedAt, {
|
|
863
1197
|
runId,
|
|
864
|
-
|
|
865
|
-
|
|
866
|
-
benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
|
|
867
|
-
subjectFingerprint: existingSubject?.subjectFingerprint ?? sourceSubjectFingerprint,
|
|
868
|
-
...(!existingSubject || existingSubject.subjectFingerprint === sourceSubjectFingerprint
|
|
869
|
-
? { subjectSourceFiles: authoredSubjectSourceFiles(projectSource) }
|
|
870
|
-
: {}),
|
|
871
|
-
startedAt,
|
|
872
|
-
spec,
|
|
873
|
-
jobs: completedJobs,
|
|
874
|
-
previousSubject: null,
|
|
875
|
-
existingSubjectCount: snapshot.subjects.length,
|
|
1198
|
+
candidateId: evaluatedCandidateId,
|
|
1199
|
+
detail: { samples, strategy: "direct" },
|
|
876
1200
|
});
|
|
877
|
-
|
|
878
|
-
snapshot = upsertLocalSubject(snapshot, subjectRecord, materialized.subjectFiles[subjectRecord.id] ?? []);
|
|
879
|
-
}
|
|
880
|
-
if (materialized.activeSubjectId) {
|
|
881
|
-
snapshot = setLocalActive(snapshot, materialized.activeSubjectId);
|
|
882
|
-
}
|
|
883
|
-
for (const evaluation of materialized.evaluations) {
|
|
884
|
-
snapshot = upsertLocalEvaluation(snapshot, evaluation);
|
|
885
|
-
}
|
|
886
|
-
const finishedAt = new Date().toISOString();
|
|
887
|
-
snapshot = appendLocalRun(snapshot, {
|
|
1201
|
+
const runningRun = {
|
|
888
1202
|
id: runId,
|
|
889
1203
|
workflow: "eval",
|
|
890
1204
|
benchmarkFingerprint,
|
|
891
|
-
status: "
|
|
1205
|
+
status: "running",
|
|
1206
|
+
candidateId: evaluatedCandidateId,
|
|
1207
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
1208
|
+
candidateRunName: projectSource.spec.candidate.selectedRunName,
|
|
892
1209
|
startedAt,
|
|
893
|
-
|
|
894
|
-
durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
|
|
895
|
-
optimizer: "none",
|
|
1210
|
+
improver: "none",
|
|
896
1211
|
engineRun: spec.engineRun.use,
|
|
897
1212
|
strategy: "direct",
|
|
898
1213
|
budget: 1,
|
|
899
1214
|
repairBudget: 0,
|
|
900
1215
|
attemptsRequested: 1,
|
|
901
|
-
attemptsExecuted:
|
|
1216
|
+
attemptsExecuted: 0,
|
|
902
1217
|
samples,
|
|
903
|
-
|
|
904
|
-
|
|
905
|
-
|
|
906
|
-
|
|
1218
|
+
executionFingerprint,
|
|
1219
|
+
activeCandidateId: activeCandidateIdBeforeEval,
|
|
1220
|
+
outputCandidateId: evaluatedCandidateId,
|
|
1221
|
+
};
|
|
1222
|
+
snapshot = upsertLocalRun(snapshot, runningRun, [runStartedEvent]);
|
|
907
1223
|
await saveLocalArchive(workspace, snapshot);
|
|
908
|
-
|
|
909
|
-
|
|
910
|
-
|
|
911
|
-
|
|
912
|
-
|
|
913
|
-
|
|
914
|
-
|
|
915
|
-
|
|
916
|
-
|
|
917
|
-
|
|
1224
|
+
try {
|
|
1225
|
+
const baseline = createRuntimeBaselineCandidateJob({
|
|
1226
|
+
ownerUserId: "local",
|
|
1227
|
+
projectId: "local",
|
|
1228
|
+
runId,
|
|
1229
|
+
candidateId: evaluatedCandidateId,
|
|
1230
|
+
attemptIndex: 0,
|
|
1231
|
+
files,
|
|
1232
|
+
now: startedAt,
|
|
1233
|
+
baseId: null,
|
|
1234
|
+
});
|
|
1235
|
+
const attemptJobs = planWorkbenchExecutionJobsForPurpose({
|
|
1236
|
+
ownerUserId: "local",
|
|
1237
|
+
projectId: "local",
|
|
1238
|
+
runId,
|
|
1239
|
+
candidateId: evaluatedCandidateId,
|
|
1240
|
+
attemptIndex: 0,
|
|
1241
|
+
samples,
|
|
1242
|
+
now: startedAt,
|
|
1243
|
+
caseIds: orderedCaseIdsForPairs(caseIds, selectedPairs),
|
|
1244
|
+
sampleIndexesByCase: sampleIndexesByCase(selectedPairs),
|
|
1245
|
+
engineCases,
|
|
1246
|
+
spec,
|
|
1247
|
+
environmentRefsByCase: environmentRefs.byCase,
|
|
1248
|
+
workflow: "eval",
|
|
1249
|
+
purpose: "attempt",
|
|
1250
|
+
});
|
|
1251
|
+
const dagJobs = await executeLocalDevelopmentDag({
|
|
1252
|
+
jobs: [baseline, ...attemptJobs],
|
|
1253
|
+
spec,
|
|
1254
|
+
adapterManifests,
|
|
1255
|
+
adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
|
|
1256
|
+
baseFiles: files,
|
|
1257
|
+
engineResolveFiles,
|
|
1258
|
+
engineCases,
|
|
1259
|
+
capacity: await localDevelopmentCapacity(workspace),
|
|
1260
|
+
});
|
|
1261
|
+
const materializationJobs = [
|
|
1262
|
+
...(evaluationWork?.priorAttemptJobs ?? []),
|
|
1263
|
+
...dagJobs,
|
|
1264
|
+
];
|
|
1265
|
+
const currentRunJobs = dagJobs.filter((job) => job.runId === runId);
|
|
1266
|
+
const currentRunCompletedJobCount = currentRunJobs.filter((job) => job.status === "succeeded").length;
|
|
1267
|
+
const currentRunFailedJobCount = currentRunJobs.filter((job) => job.status === "failed").length;
|
|
1268
|
+
const materialized = materializeWorkbenchRunResult({
|
|
1269
|
+
runId,
|
|
1270
|
+
benchmarkFingerprint,
|
|
1271
|
+
sourceYaml: projectSource.specSource,
|
|
1272
|
+
benchmarkSourceFiles: authoredBenchmarkSourceFiles(projectSource),
|
|
1273
|
+
candidateFingerprint: existingCandidate?.candidateFingerprint ?? sourceCandidateFingerprint,
|
|
1274
|
+
...(!existingCandidate || existingCandidate.candidateFingerprint === sourceCandidateFingerprint
|
|
1275
|
+
? { candidateSourceFiles: authoredCandidateSourceFiles(projectSource) }
|
|
1276
|
+
: {}),
|
|
1277
|
+
startedAt,
|
|
1278
|
+
spec,
|
|
1279
|
+
jobs: materializationJobs,
|
|
1280
|
+
previousCandidate: existingCandidate ?? null,
|
|
1281
|
+
existingCandidateCount: snapshot.candidates.length,
|
|
1282
|
+
});
|
|
1283
|
+
for (const candidateRecord of materialized.candidates) {
|
|
1284
|
+
snapshot = upsertLocalCandidate(snapshot, candidateRecord, materialized.candidateFiles[candidateRecord.id] ?? []);
|
|
1285
|
+
}
|
|
1286
|
+
if (materialized.activeCandidateId) {
|
|
1287
|
+
snapshot = setLocalActive(snapshot, materialized.activeCandidateId);
|
|
1288
|
+
}
|
|
1289
|
+
for (const evaluation of materialized.evaluations) {
|
|
1290
|
+
snapshot = upsertLocalEvaluation(snapshot, evaluation);
|
|
1291
|
+
}
|
|
1292
|
+
const activeCandidateId = activeCandidateIdBeforeEval ?? materialized.activeCandidateId ?? null;
|
|
1293
|
+
const finishedAt = new Date().toISOString();
|
|
1294
|
+
if (activeCandidateId) {
|
|
1295
|
+
snapshot = setLocalActive(snapshot, activeCandidateId);
|
|
1296
|
+
}
|
|
1297
|
+
const runFinishedEvent = createLocalEvent("run_finished", finishedAt, {
|
|
1298
|
+
runId,
|
|
1299
|
+
candidateId: evaluatedCandidateId,
|
|
1300
|
+
detail: {
|
|
1301
|
+
outcome: currentRunFailedJobCount > 0 ? "error" : "ok",
|
|
1302
|
+
attemptsExecuted: 1,
|
|
1303
|
+
durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
|
|
1304
|
+
},
|
|
1305
|
+
});
|
|
1306
|
+
snapshot = upsertLocalRun(snapshot, {
|
|
1307
|
+
id: runId,
|
|
1308
|
+
workflow: "eval",
|
|
1309
|
+
benchmarkFingerprint,
|
|
1310
|
+
status: "finished",
|
|
1311
|
+
candidateId: evaluatedCandidateId,
|
|
1312
|
+
candidateRunId: projectSource.spec.candidate.selectedRunId,
|
|
1313
|
+
candidateRunName: projectSource.spec.candidate.selectedRunName,
|
|
1314
|
+
startedAt,
|
|
1315
|
+
finishedAt,
|
|
1316
|
+
durationMs: Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt)),
|
|
1317
|
+
improver: "none",
|
|
1318
|
+
engineRun: spec.engineRun.use,
|
|
1319
|
+
strategy: "direct",
|
|
1320
|
+
budget: 1,
|
|
1321
|
+
repairBudget: 0,
|
|
1322
|
+
attemptsRequested: 1,
|
|
1323
|
+
attemptsExecuted: 1,
|
|
1324
|
+
samples,
|
|
1325
|
+
executionFingerprint,
|
|
1326
|
+
stoppedReason: "completed",
|
|
1327
|
+
outcome: currentRunFailedJobCount > 0 ? "error" : "ok",
|
|
1328
|
+
activeCandidateId,
|
|
1329
|
+
outputCandidateId: evaluatedCandidateId,
|
|
1330
|
+
}, [runFinishedEvent]);
|
|
1331
|
+
await saveLocalJobs(workspace, currentRunJobs);
|
|
1332
|
+
await saveLocalArchive(workspace, snapshot);
|
|
1333
|
+
const evaluation = materialized.evaluations[0] ?? null;
|
|
1334
|
+
const result = {
|
|
1335
|
+
ok: currentRunFailedJobCount === 0,
|
|
1336
|
+
runId,
|
|
1337
|
+
evaluation,
|
|
1338
|
+
evaluationId: evaluation?.id ?? null,
|
|
1339
|
+
candidateId: evaluatedCandidateId,
|
|
1340
|
+
activeCandidateId,
|
|
1341
|
+
completedJobCount: currentRunCompletedJobCount,
|
|
1342
|
+
failedJobCount: currentRunFailedJobCount,
|
|
1343
|
+
localView: localDevViewHint(workspace, runId),
|
|
1344
|
+
};
|
|
1345
|
+
writeOutput(result, parsed, io, ({ evaluationId, candidateId }) => `Evaluation ${evaluationId ?? runId} finished for candidate ${candidateId}.\nOpen local view: ${result.localView.command}\n${result.localView.note}`);
|
|
1346
|
+
return currentRunFailedJobCount === 0 ? 0 : 1;
|
|
1347
|
+
}
|
|
1348
|
+
catch (error) {
|
|
1349
|
+
await markLocalRunFailed({
|
|
1350
|
+
workspace,
|
|
1351
|
+
run: runningRun,
|
|
1352
|
+
startedAt,
|
|
1353
|
+
error,
|
|
1354
|
+
}).catch(() => undefined);
|
|
1355
|
+
throw error;
|
|
1356
|
+
}
|
|
1357
|
+
}
|
|
1358
|
+
/**
 * Works out how much of a requested evaluation is already covered by the local archive.
 *
 * @param workspace Workspace handle passed through to `readLocalJobs`.
 * @param snapshot Archive snapshot; its `runs` and `evaluations` arrays are read.
 * @param target Requested evaluation: `benchmarkFingerprint`, `candidateId`,
 *   `candidateFingerprint`, `candidateRunId`, `executionFingerprint`, `samples`, `caseIds`.
 * @returns `null` when no evaluation matches the target or when no matching prior job covers
 *   any requested (case, sample) pair; otherwise `{ reusableEvaluation, missingPairs, priorAttemptJobs }`.
 */
async function resolveLocalEvaluationWork(workspace, snapshot, target) {
    const runsById = new Map();
    for (const run of snapshot.runs) {
        runsById.set(run.id, run);
    }
    // An evaluation matches only if its own identity fields AND its run's execution fingerprint agree with the target.
    const matchesTarget = (evaluation) => {
        const owningRun = runsById.get(evaluation.runId);
        return evaluation.benchmarkFingerprint === target.benchmarkFingerprint &&
            evaluation.candidateId === target.candidateId &&
            evaluation.candidateFingerprint === target.candidateFingerprint &&
            evaluation.candidateRunId === target.candidateRunId &&
            owningRun?.executionFingerprint === target.executionFingerprint;
    };
    const isFullyCompleted = (evaluation) => evaluation.status === "completed" &&
        evaluation.errorSampleCount === 0 &&
        evaluation.completedSampleCount >= target.samples;
    // Most recently updated first; ties are broken by descending id.
    const newestFirst = (left, right) => right.updatedAt.localeCompare(left.updatedAt) ||
        right.id.localeCompare(left.id);
    const matchingEvaluations = snapshot.evaluations.filter((evaluation) => matchesTarget(evaluation));
    const completedCandidates = matchingEvaluations.filter((evaluation) => isFullyCompleted(evaluation));
    const reusableEvaluation = completedCandidates.sort(newestFirst)[0] ?? null;
    if (reusableEvaluation) {
        return { reusableEvaluation, missingPairs: [], priorAttemptJobs: [] };
    }
    const matchingRunIds = new Set(matchingEvaluations.map((evaluation) => evaluation.runId));
    if (matchingRunIds.size === 0) {
        return null;
    }
    const allPairs = allCaseSamplePairs(target.caseIds, target.samples);
    const desiredKeys = new Set(allPairs.map(caseSamplePairKey));
    const previousJobs = await readLocalJobs(workspace);
    // Only jobs from matching runs and for the same candidate can stand in for requested pairs.
    const relevantJobs = previousJobs.filter((job) => matchingRunIds.has(job.runId) &&
        job.candidateId === target.candidateId);
    const priorAttemptJobsByPair = latestCompletedAttemptJobsByPair(relevantJobs, desiredKeys);
    const missingPairs = allPairs.filter((pair) => !priorAttemptJobsByPair.has(caseSamplePairKey(pair)));
    if (missingPairs.length === allPairs.length) {
        // Nothing reusable was found, so the caller gets the same answer as "no match".
        return null;
    }
    return {
        reusableEvaluation: null,
        missingPairs,
        priorAttemptJobs: [...priorAttemptJobsByPair.values()],
    };
}
|
|
1400
|
+
/**
 * Records a run as finished with outcome "error" in the local archive.
 *
 * The archive is re-read first; if the stored run is already "finished" nothing is written.
 *
 * @param args `{ workspace, run, startedAt, error }` — `run` is spread into the stored record.
 * @returns A promise that resolves once the archive has been saved (or immediately when skipped).
 */
async function markLocalRunFailed(args) {
    const { workspace, run, startedAt, error } = args;
    const latest = await loadLocalArchive(workspace);
    const stored = latest.runs.find((entry) => entry.id === run.id);
    if (stored?.status === "finished") {
        return;
    }
    const finishedAt = new Date().toISOString();
    const message = errorMessage(error);
    const elapsedMs = Math.max(0, Date.parse(finishedAt) - Date.parse(startedAt));
    const failedRun = {
        ...run,
        status: "finished",
        finishedAt,
        durationMs: elapsedMs,
        outcome: "error",
        error: message,
    };
    const finishedEvent = createLocalEvent("run_finished", finishedAt, {
        runId: run.id,
        candidateId: run.candidateId ?? undefined,
        detail: {
            outcome: "error",
            error: message,
            attemptsExecuted: failedRun.attemptsExecuted,
            durationMs: failedRun.durationMs ?? null,
        },
    });
    const nextArchive = upsertLocalRun(latest, failedRun, [finishedEvent]);
    await saveLocalArchive(workspace, nextArchive);
}
|
|
1429
|
+
/**
 * Turns any thrown value into a display string.
 *
 * @param error The caught value.
 * @returns `error.message` for `Error` instances, otherwise `String(error)`.
 */
function errorMessage(error) {
    if (error instanceof Error) {
        return error.message;
    }
    return String(error);
}
|
|
1432
|
+
/**
 * Expands case ids into every (caseId, sampleIndex) combination.
 *
 * @param caseIds Array of case ids; output keeps this order.
 * @param samples Number of sample indexes (0 .. samples-1) generated per case.
 * @returns Flat array of `{ caseId, sampleIndex }`, grouped by case.
 */
function allCaseSamplePairs(caseIds, samples) {
    const pairsForCase = (caseId) => Array.from({ length: samples }, (_unused, sampleIndex) => {
        return { caseId, sampleIndex };
    });
    return caseIds.flatMap((caseId) => pairsForCase(caseId));
}
|
|
1438
|
+
/**
 * Keeps the case ids that appear in `pairs`, in the order given by `caseIds`.
 *
 * @param caseIds Array of case ids that defines the output order.
 * @param pairs Array of objects with a `caseId` property.
 * @returns The entries of `caseIds` that occur as a `caseId` in `pairs`.
 */
function orderedCaseIdsForPairs(caseIds, pairs) {
    const referencedCaseIds = pairs.map(({ caseId }) => caseId);
    const membership = new Set(referencedCaseIds);
    return caseIds.filter((candidate) => membership.has(candidate));
}
|
|
1442
|
+
/**
 * Groups sample indexes by case id.
 *
 * @param pairs Iterable of `{ caseId, sampleIndex }`.
 * @returns A `Map` from case id to its distinct sample indexes in ascending numeric order;
 *   keys appear in order of first occurrence in `pairs`.
 */
function sampleIndexesByCase(pairs) {
    const distinctByCase = new Map();
    for (const pair of pairs) {
        const seen = distinctByCase.get(pair.caseId);
        if (seen === undefined) {
            distinctByCase.set(pair.caseId, new Set([pair.sampleIndex]));
        }
        else {
            seen.add(pair.sampleIndex);
        }
    }
    const ascending = (left, right) => left - right;
    return new Map(Array.from(distinctByCase, ([caseId, seen]) => [caseId, [...seen].sort(ascending)]));
}
|
|
1452
|
+
/**
 * Picks, for each wanted (case, sample) key, the most recent succeeded "attempt" job.
 *
 * @param jobs Iterable of job records (`status`, `input`, plus the fields the recency comparison reads).
 * @param desiredKeys `Set` of keys as produced by `caseSamplePairKey`; other pairs are ignored.
 * @returns `Map` from pair key to the winning job.
 */
function latestCompletedAttemptJobsByPair(jobs, desiredKeys) {
    const newestByKey = new Map();
    for (const job of jobs) {
        const isSucceededAttempt = job.status === "succeeded" &&
            executionPurposeFromJobInput(job.input) === "attempt";
        if (!isSucceededAttempt) {
            continue;
        }
        const pair = caseSamplePairFromJob(job);
        if (!pair) {
            continue;
        }
        const key = caseSamplePairKey(pair);
        if (!desiredKeys.has(key)) {
            continue;
        }
        const incumbent = newestByKey.get(key);
        // Replace only when strictly more recent, so the first-seen job wins exact ties.
        const shouldReplace = !incumbent || compareJobRecency(job, incumbent) > 0;
        if (shouldReplace) {
            newestByKey.set(key, job);
        }
    }
    return newestByKey;
}
|
|
1473
|
+
/**
 * Extracts the (caseId, sampleIndex) pair a job refers to.
 *
 * Values are looked up on `job.input` first and then on `job.input.execution.metadata`.
 *
 * @param job Job record with an `input` payload.
 * @returns `{ caseId, sampleIndex }`, or `null` when the case id is falsy or the sample index is `null`.
 */
function caseSamplePairFromJob(job) {
    const input = readRecord(job.input);
    const metadata = readRecord(readRecord(input?.execution)?.metadata);
    const caseId = stringValue(input?.caseId) ?? stringValue(metadata?.caseId);
    // NOTE(review): the check below is `=== null`; this assumes integerValue reports "absent" as null — confirm.
    const sampleIndex = integerValue(input?.sampleIndex) ?? integerValue(metadata?.sampleIndex);
    if (!caseId || sampleIndex === null) {
        return null;
    }
    return { caseId, sampleIndex };
}
|
|
1483
|
+
/**
 * Reads `execution.purpose` from a job input payload.
 *
 * @param inputValue Raw job input.
 * @returns Whatever `stringValue` yields for `inputValue.execution.purpose`.
 */
function executionPurposeFromJobInput(inputValue) {
    const execution = readRecord(readRecord(inputValue)?.execution);
    const purpose = stringValue(execution?.purpose);
    return purpose;
}
|
|
1488
|
+
/**
 * Builds the lookup key for a (case, sample) pair.
 *
 * @param pair Object with `caseId` and `sampleIndex`.
 * @returns The two values joined by a NUL character.
 */
function caseSamplePairKey(pair) {
    const { caseId, sampleIndex } = pair;
    return `${caseId}\0${sampleIndex}`;
}
|
|
1491
|
+
/**
 * Comparator over jobs: orders by recency timestamp, then by id.
 *
 * @param left First job.
 * @param right Second job.
 * @returns Result of `localeCompare` on the timestamps, or on the ids when the timestamps compare equal.
 */
function compareJobRecency(left, right) {
    const byTimestamp = jobRecencyTimestamp(left).localeCompare(jobRecencyTimestamp(right));
    if (byTimestamp !== 0) {
        return byTimestamp;
    }
    return left.id.localeCompare(right.id);
}
|
|
1495
|
+
/**
 * Chooses the timestamp used to rank a job by recency.
 *
 * @param job Job record.
 * @returns The first non-nullish of `finishedAt`, `updatedAt`, `startedAt`, `createdAt`; `""` when all are nullish.
 */
function jobRecencyTimestamp(job) {
    const fieldsByPriority = ["finishedAt", "updatedAt", "startedAt", "createdAt"];
    for (const field of fieldsByPriority) {
        const value = job[field];
        if (value !== null && value !== undefined) {
            return value;
        }
    }
    return "";
}
|
|
1498
|
+
/**
 * Finds the most recent successful "improve" run that matches the requested target.
 *
 * @param runs Array of run records.
 * @param target `{ benchmarkFingerprint, candidateId, candidateRunId, executionFingerprint, budget, samples }`.
 * @returns The matching finished run with outcome "ok" and a truthy `outputCandidateId` that sorts first
 *   (latest `finishedAt ?? startedAt`, ties broken by descending id), or `null` when none matches.
 */
function findReusableLocalImproveRun(runs, target) {
    const isReusable = (run) => run.workflow === "improve" &&
        run.benchmarkFingerprint === target.benchmarkFingerprint &&
        run.candidateId === target.candidateId &&
        run.candidateRunId === target.candidateRunId &&
        run.executionFingerprint === target.executionFingerprint &&
        run.budget === target.budget &&
        run.samples === target.samples &&
        run.status === "finished" &&
        run.outcome === "ok" &&
        Boolean(run.outputCandidateId);
    const newestFirst = (left, right) => {
        const byTime = (right.finishedAt ?? right.startedAt).localeCompare(left.finishedAt ?? left.startedAt);
        return byTime || right.id.localeCompare(left.id);
    };
    const reusable = runs.filter((run) => isReusable(run));
    reusable.sort(newestFirst);
    return reusable[0] ?? null;
}
|
|
922
1513
|
function localDevViewHint(workspace, runId) {
|
|
923
1514
|
const runFlag = runId ? ` --run ${shellQuote(runId)}` : "";
|
|
@@ -935,20 +1526,26 @@ function localDevOpenUrl(baseUrl, snapshot, runId) {
|
|
|
935
1526
|
.reverse()
|
|
936
1527
|
.find((entry) => entry.runId === runId);
|
|
937
1528
|
if (!evaluation) {
|
|
938
|
-
return new URL("
|
|
1529
|
+
return new URL("candidates", baseUrl).toString();
|
|
939
1530
|
}
|
|
940
1531
|
const params = new URLSearchParams({ evaluation: evaluation.id });
|
|
941
|
-
return new URL(`
|
|
1532
|
+
return new URL(`candidates/${encodeURIComponent(evaluation.candidateId)}?${params.toString()}`, baseUrl).toString();
|
|
942
1533
|
}
|
|
943
1534
|
/**
 * Loads the project source for a workspace and returns its benchmark fingerprint.
 *
 * @param workspace Workspace passed to `readLocalProjectSource`.
 * @returns Promise of the value produced by `localBenchmarkFingerprint`.
 */
async function readLocalBenchmarkFingerprint(workspace) {
    const projectSource = await readLocalProjectSource(workspace);
    return localBenchmarkFingerprint(projectSource);
}
|
|
946
|
-
function
|
|
1537
|
+
/**
 * Computes the run execution fingerprint for a local project source.
 *
 * @param projectSource Object providing `specSource` and `adapterFiles`.
 * @returns Result of `workbenchRunExecutionFingerprint` for the spec source and the normalized adapter files.
 */
function localRunExecutionFingerprint(projectSource) {
    const fingerprintInput = {
        sourceYaml: projectSource.specSource,
        adapterFiles: normalizeSurfaceFiles(projectSource.adapterFiles),
    };
    return workbenchRunExecutionFingerprint(fingerprintInput);
}
|
|
1543
|
+
/**
 * Describes the authored candidate spec as a single text file entry.
 *
 * @param projectSource Object providing `dir`, `candidateSpecPath` and `candidateSource`.
 * @returns A one-element array whose `path` is the spec path relative to `dir`, written with "/" separators.
 */
function authoredCandidateSourceFiles(projectSource) {
    const relativeSpecPath = path.relative(projectSource.dir, projectSource.candidateSpecPath);
    // Swap the platform separator for "/" so the stored path looks the same on every OS.
    const portableSpecPath = relativeSpecPath.split(path.sep).join("/");
    const specFile = {
        path: portableSpecPath,
        kind: "text",
        encoding: "utf8",
        content: projectSource.candidateSource,
        executable: false,
    };
    return [specFile];
}
|
|
@@ -1155,72 +1752,72 @@ function requireValidRunEnvelope(args) {
|
|
|
1155
1752
|
}
|
|
1156
1753
|
/**
 * `restore` command: writes an archived candidate's files back into the project's candidate root
 * and marks that candidate active in the local archive.
 *
 * @param argv Command arguments; accepted flags are dir, candidate, dry-run, yes, json.
 * @param io Output sink forwarded to `writeOutput`.
 * @returns Exit code 0.
 * @throws UsageError when the directory holds no valid project spec, or when neither
 *   `--dry-run` nor `--yes` is given.
 */
async function localRestore(argv, io) {
    const parsed = parseArgs(argv);
    const allowedFlags = new Set(["dir", "candidate", "dry-run", "yes", "json"]);
    rejectUnknownFlags(parsed, allowedFlags);
    const workspace = resolveDir(parsed);
    const spec = await readLocalSpecIfValid(workspace);
    if (!spec) {
        throw new UsageError("restore requires a valid Workbench project.");
    }
    const candidateRoot = spec.candidate.files.path;
    const snapshot = await loadLocalArchive(workspace);
    const candidateId = readCandidateIdFlag(parsed, snapshot);
    const files = readLocalCandidateFiles(snapshot, candidateId);
    const isDryRun = parsed.flags["dry-run"] === true;
    if (isDryRun) {
        // Preview only: report what would be written without touching the source directory.
        const preview = { ok: true, candidateId, fileCount: files.length };
        writeOutput(preview, parsed, io, () => `Restore would write ${files.length} file(s) from ${candidateId}.`);
        return 0;
    }
    const isConfirmed = parsed.flags.yes === true;
    if (!isConfirmed) {
        throw new UsageError("restore requires --dry-run to preview or --yes to apply source directory changes.");
    }
    const changedPaths = await materializeCandidateRoot(workspace, candidateRoot, files);
    const next = setLocalActive(snapshot, candidateId);
    await saveLocalArchive(workspace, next);
    const summary = { ok: true, activeCandidateId: candidateId, changedPaths };
    writeOutput(summary, parsed, io, () => `Restored ${candidateId} to ${candidateRoot}.`);
    return 0;
}
|
|
1181
|
-
async function
|
|
1778
|
+
/**
 * `candidate list` command: prints every candidate in the local archive.
 *
 * @param argv Command arguments; accepted flags are dir, json.
 * @param io Output sink forwarded to `writeOutput`.
 * @returns Exit code 0.
 */
async function localCandidateList(argv, io) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set(["dir", "json"]));
    const workspace = resolveDir(parsed);
    const snapshot = await loadLocalArchive(workspace);
    // One tab-separated row per candidate; the archive's active candidate gets a trailing "active" column.
    const describeCandidate = (candidate) => `${candidate.id}\t${candidate.status}\tevaluation ${formatCandidateEvaluationScore(candidate)}${snapshot.activeId === candidate.id ? "\tactive" : ""}`;
    const renderText = (candidates) => {
        const rows = candidates.map((candidate) => describeCandidate(candidate));
        return rows.join("\n") || "No candidates.";
    };
    writeOutput(snapshot.candidates, parsed, io, renderText);
    return 0;
}
|
|
1190
|
-
async function
|
|
1787
|
+
/**
 * `candidate show` command: prints the record of one archived candidate.
 *
 * The text form lists id and status, benchmark fingerprint, candidate fingerprint,
 * evaluation summary and, when the record has one, the base candidate id.
 *
 * @param argv Command arguments; accepted flags are dir, candidate, json.
 * @param io Output sink forwarded to `writeOutput`.
 * @returns Exit code 0.
 */
async function localCandidateShow(argv, io) {
    const parsed = parseArgs(argv);
    rejectUnknownFlags(parsed, new Set(["dir", "candidate", "json"]));
    const snapshot = await loadLocalArchive(resolveDir(parsed));
    const candidateId = readCandidateIdFlag(parsed, snapshot);
    const candidate = readLocalCandidate(snapshot, candidateId);
    writeOutput(candidate, parsed, io, (record) => [
        `${record.id}\t${record.status}`,
        `benchmark\t${record.benchmarkFingerprint}`,
        // A single read suffices: falling back from a field to the very same field cannot change the result.
        `candidate\t${record.candidateFingerprint}`,
        `evaluation\t${formatCandidateEvaluationSummary(record)}`,
        ...(record.baseId ? [`base\t${record.baseId}`] : []),
    ].join("\n"));
    return 0;
}
|
|
1205
|
-
async function
|
|
1802
|
+
/**
 * `candidate files` command: lists the files of one archived candidate.
 *
 * @param argv Command arguments; accepted flags are dir, candidate, json.
 * @param io Output sink forwarded to `writeOutput`.
 * @returns Exit code 0.
 */
async function localCandidateFiles(argv, io) {
    const parsed = parseArgs(argv);
    const allowedFlags = new Set(["dir", "candidate", "json"]);
    rejectUnknownFlags(parsed, allowedFlags);
    const workspace = resolveDir(parsed);
    const snapshot = await loadLocalArchive(workspace);
    const candidateId = readCandidateIdFlag(parsed, snapshot);
    const candidate = readLocalCandidate(snapshot, candidateId);
    const archivedFiles = readLocalCandidateFiles(snapshot, candidateId);
    const files = summarizeCandidateFiles(archivedFiles, candidate.fileChanges);
    // Text form: one tab-separated row (path, status, preview kind) per file.
    const renderText = (records) => {
        const rows = records.map((file) => `${file.path}\t${file.status}\t${file.preview_kind}`);
        return rows.join("\n") || "No files.";
    };
    writeOutput(files, parsed, io, renderText);
    return 0;
}
|
|
1217
|
-
async function
|
|
1814
|
+
async function localCandidatePreview(argv, io) {
|
|
1218
1815
|
const parsed = parseArgs(argv);
|
|
1219
|
-
rejectUnknownFlags(parsed, new Set(["dir", "
|
|
1816
|
+
rejectUnknownFlags(parsed, new Set(["dir", "candidate", "path", "output", "view", "json"]));
|
|
1220
1817
|
const snapshot = await loadLocalArchive(resolveDir(parsed));
|
|
1221
|
-
const
|
|
1222
|
-
const preview =
|
|
1223
|
-
files:
|
|
1818
|
+
const candidateId = readCandidateIdFlag(parsed, snapshot);
|
|
1819
|
+
const preview = createCandidateFilePreview({
|
|
1820
|
+
files: readLocalCandidateFiles(snapshot, candidateId),
|
|
1224
1821
|
path: requireFlag(parsed, "path"),
|
|
1225
1822
|
view: readPreviewMode(parsed),
|
|
1226
1823
|
});
|
|
@@ -1755,7 +2352,7 @@ function createAdapterScaffoldFiles(id) {
|
|
|
1755
2352
|
"setup:",
|
|
1756
2353
|
" - npm install --global .",
|
|
1757
2354
|
"operations:",
|
|
1758
|
-
"
|
|
2355
|
+
" candidate.run: {}",
|
|
1759
2356
|
"",
|
|
1760
2357
|
].join("\n");
|
|
1761
2358
|
const packageJson = `${JSON.stringify({
|
|
@@ -1777,11 +2374,11 @@ const request = requestPath && fs.existsSync(requestPath)
|
|
|
1777
2374
|
? JSON.parse(fs.readFileSync(requestPath, "utf8"))
|
|
1778
2375
|
: {};
|
|
1779
2376
|
fs.mkdirSync(outputRoot, { recursive: true });
|
|
1780
|
-
const operation = request.operation || "
|
|
2377
|
+
const operation = request.operation || "candidate.run";
|
|
1781
2378
|
const resultPath = process.env.WORKBENCH_RESULT || request.paths?.result || path.join(outputRoot, "workbench-result.json");
|
|
1782
2379
|
|
|
1783
2380
|
let value;
|
|
1784
|
-
if (operation === "
|
|
2381
|
+
if (operation === "candidate.run") {
|
|
1785
2382
|
const task = request.context?.case?.prompt || "No case prompt was provided.";
|
|
1786
2383
|
fs.writeFileSync(path.join(outputRoot, "adapter-output.txt"), [
|
|
1787
2384
|
"adapter: ${id}",
|
|
@@ -1790,7 +2387,7 @@ if (operation === "subject.run") {
|
|
|
1790
2387
|
"",
|
|
1791
2388
|
].join("\\n"));
|
|
1792
2389
|
} else {
|
|
1793
|
-
console.error("${id} only implements
|
|
2390
|
+
console.error("${id} only implements candidate.run.");
|
|
1794
2391
|
process.exit(2);
|
|
1795
2392
|
}
|
|
1796
2393
|
|
|
@@ -2065,7 +2662,7 @@ async function resolveAdapterForAuthTarget(dir, targetRaw) {
|
|
|
2065
2662
|
const adapters = await resolveWorkbenchAdaptersForProject(dir, spec);
|
|
2066
2663
|
const adapter = adapters.find((entry) => entry.manifest.id === target.adapterId);
|
|
2067
2664
|
if (!adapter) {
|
|
2068
|
-
throw new UsageError(`Adapter ${target.adapterId} is not used by this benchmark source. Add it to the benchmark
|
|
2665
|
+
throw new UsageError(`Adapter ${target.adapterId} is not used by this benchmark source. Add it to the benchmark or candidate YAML before connecting auth.`);
|
|
2069
2666
|
}
|
|
2070
2667
|
if (!adapter.manifest.auth) {
|
|
2071
2668
|
throw new UsageError(`Adapter ${target.adapterId} does not declare auth.`);
|
|
@@ -2313,13 +2910,21 @@ function adapterAuthRecord(value) {
|
|
|
2313
2910
|
}
|
|
2314
2911
|
async function pushBenchmark(argv, io) {
|
|
2315
2912
|
const parsed = parseArgs(argv);
|
|
2316
|
-
rejectUnknownFlags(parsed, new Set(["dir", "
|
|
2913
|
+
rejectUnknownFlags(parsed, new Set(["dir", "visibility", "dry-run", "json"]));
|
|
2317
2914
|
const dir = resolveSourceDir(parsed);
|
|
2318
2915
|
const source = await readLocalProjectSource(dir);
|
|
2319
2916
|
const origin = await readWorkbenchOrigin(dir);
|
|
2320
2917
|
const baseUrl = await effectiveBaseUrl(origin?.baseUrl);
|
|
2321
|
-
const visibility =
|
|
2918
|
+
const visibility = readOptionalBenchmarkVisibility(parsed.flags.visibility);
|
|
2919
|
+
const createVisibility = visibility ?? "public";
|
|
2322
2920
|
const dryRun = parsed.flags["dry-run"] === true;
|
|
2921
|
+
const runtime = await exportLocalRuntimeBundle(dir);
|
|
2922
|
+
const state = localProjectState({
|
|
2923
|
+
source,
|
|
2924
|
+
runtime,
|
|
2925
|
+
origin,
|
|
2926
|
+
visibility: createVisibility,
|
|
2927
|
+
});
|
|
2323
2928
|
if (!origin) {
|
|
2324
2929
|
if (dryRun) {
|
|
2325
2930
|
writeOutput({
|
|
@@ -2329,35 +2934,36 @@ async function pushBenchmark(argv, io) {
|
|
|
2329
2934
|
dir,
|
|
2330
2935
|
baseUrl,
|
|
2331
2936
|
benchmarkName: source.spec.name,
|
|
2332
|
-
|
|
2333
|
-
visibility,
|
|
2937
|
+
visibility: createVisibility,
|
|
2334
2938
|
sourceFileCount: sourceFileCount(source),
|
|
2939
|
+
runtime: runtimeBundleStats(runtime),
|
|
2940
|
+
sourceFingerprint: state.source.fingerprint,
|
|
2941
|
+
runtimeFingerprint: state.base.runtimeFingerprint,
|
|
2335
2942
|
}, parsed, io, () => `Would push benchmark ${source.spec.name}.`);
|
|
2336
2943
|
return 0;
|
|
2337
2944
|
}
|
|
2338
|
-
const { project,
|
|
2945
|
+
const { project, origin: nextOrigin, result } = await createHostedBenchmarkFromState({
|
|
2339
2946
|
baseUrl,
|
|
2340
2947
|
dir,
|
|
2341
|
-
|
|
2342
|
-
visibility,
|
|
2948
|
+
state,
|
|
2343
2949
|
});
|
|
2344
2950
|
writeOutput({
|
|
2345
2951
|
ok: true,
|
|
2346
2952
|
action: "create",
|
|
2347
|
-
benchmark:
|
|
2348
|
-
|
|
2349
|
-
visibility,
|
|
2953
|
+
benchmark: project,
|
|
2954
|
+
visibility: project.visibility ?? createVisibility,
|
|
2350
2955
|
origin: nextOrigin,
|
|
2956
|
+
source: result.source,
|
|
2957
|
+
runtime: result.runtime.stats,
|
|
2351
2958
|
urls: buildWorkbenchResourceUrls({
|
|
2352
2959
|
baseUrl,
|
|
2353
|
-
projectId:
|
|
2354
|
-
|
|
2355
|
-
projectName: nextOrigin.project,
|
|
2960
|
+
projectId: project.id,
|
|
2961
|
+
...originRemoteUrlParts(nextOrigin),
|
|
2356
2962
|
}),
|
|
2357
2963
|
}, parsed, io, (record) => {
|
|
2358
2964
|
const value = record;
|
|
2359
2965
|
return [
|
|
2360
|
-
`Pushed ${value.origin.
|
|
2966
|
+
`Pushed ${value.origin.remote} (${value.origin.projectId}).`,
|
|
2361
2967
|
`Open benchmark: ${value.urls.benchmark}`,
|
|
2362
2968
|
].join("\n");
|
|
2363
2969
|
});
|
|
@@ -2367,57 +2973,6 @@ async function pushBenchmark(argv, io) {
|
|
|
2367
2973
|
if (!projectId) {
|
|
2368
2974
|
throw new UsageError("Missing hosted benchmark. Run workbench push from a source directory.");
|
|
2369
2975
|
}
|
|
2370
|
-
if (!origin.writable) {
|
|
2371
|
-
const signedInUsername = dryRun ? null : await readAuthenticatedWorkbenchUsername(baseUrl);
|
|
2372
|
-
if (signedInUsername !== origin.owner) {
|
|
2373
|
-
const upstream = upstreamFromOrigin(origin);
|
|
2374
|
-
if (dryRun) {
|
|
2375
|
-
writeOutput({
|
|
2376
|
-
ok: true,
|
|
2377
|
-
dryRun: true,
|
|
2378
|
-
action: "create",
|
|
2379
|
-
dir,
|
|
2380
|
-
baseUrl,
|
|
2381
|
-
benchmarkName: source.spec.name,
|
|
2382
|
-
tag: asOptionalString(parsed.flags.tag) ?? null,
|
|
2383
|
-
visibility,
|
|
2384
|
-
sourceFileCount: sourceFileCount(source),
|
|
2385
|
-
upstream: upstream ?? null,
|
|
2386
|
-
}, parsed, io, () => `Would create a writable benchmark from read-only origin ${origin.owner}/${origin.project}.`);
|
|
2387
|
-
return 0;
|
|
2388
|
-
}
|
|
2389
|
-
const { project, publishedProject, origin: nextOrigin } = await createHostedBenchmarkFromSource({
|
|
2390
|
-
baseUrl,
|
|
2391
|
-
dir,
|
|
2392
|
-
source,
|
|
2393
|
-
visibility,
|
|
2394
|
-
upstream,
|
|
2395
|
-
});
|
|
2396
|
-
writeOutput({
|
|
2397
|
-
ok: true,
|
|
2398
|
-
action: "create",
|
|
2399
|
-
benchmark: publishedProject,
|
|
2400
|
-
tag: asOptionalString(parsed.flags.tag) ?? null,
|
|
2401
|
-
visibility,
|
|
2402
|
-
origin: nextOrigin,
|
|
2403
|
-
upstream: upstream ?? null,
|
|
2404
|
-
urls: buildWorkbenchResourceUrls({
|
|
2405
|
-
baseUrl,
|
|
2406
|
-
projectId: publishedProject.id ?? project.id,
|
|
2407
|
-
owner: nextOrigin.owner,
|
|
2408
|
-
projectName: nextOrigin.project,
|
|
2409
|
-
}),
|
|
2410
|
-
}, parsed, io, (record) => {
|
|
2411
|
-
const value = record;
|
|
2412
|
-
return [
|
|
2413
|
-
`Pushed ${value.origin.owner}/${value.origin.project} (${value.origin.projectId}).`,
|
|
2414
|
-
...(value.upstream ? [`Upstream: ${value.upstream.owner}/${value.upstream.project}`] : []),
|
|
2415
|
-
`Open benchmark: ${value.urls.benchmark}`,
|
|
2416
|
-
].join("\n");
|
|
2417
|
-
});
|
|
2418
|
-
return 0;
|
|
2419
|
-
}
|
|
2420
|
-
}
|
|
2421
2976
|
if (dryRun) {
|
|
2422
2977
|
writeOutput({
|
|
2423
2978
|
ok: true,
|
|
@@ -2426,92 +2981,82 @@ async function pushBenchmark(argv, io) {
|
|
|
2426
2981
|
dir,
|
|
2427
2982
|
baseUrl,
|
|
2428
2983
|
benchmarkId: projectId,
|
|
2429
|
-
|
|
2430
|
-
|
|
2984
|
+
remote: origin.remote,
|
|
2985
|
+
benchmarkName: source.spec.name,
|
|
2986
|
+
visibility: visibility ?? "unchanged",
|
|
2431
2987
|
sourceFileCount: sourceFileCount(source),
|
|
2432
|
-
|
|
2988
|
+
runtime: runtimeBundleStats(runtime),
|
|
2989
|
+
sourceFingerprint: state.source.fingerprint,
|
|
2990
|
+
runtimeFingerprint: state.base.runtimeFingerprint,
|
|
2991
|
+
}, parsed, io, () => `Would push ${sourceFileCount(source)} source file(s) and runtime history to ${origin.remote}.`);
|
|
2433
2992
|
return 0;
|
|
2434
2993
|
}
|
|
2435
|
-
const response = await apiRequest(projectApiPath(projectId, "/
|
|
2994
|
+
const response = await apiRequest(projectApiPath(projectId, "/state"), {
|
|
2436
2995
|
method: "PUT",
|
|
2437
|
-
body:
|
|
2996
|
+
body: state,
|
|
2438
2997
|
}, baseUrl);
|
|
2439
|
-
const
|
|
2440
|
-
|
|
2441
|
-
: response.benchmark;
|
|
2442
|
-
const nextOrigin = await writeWorkbenchOrigin(dir, {
|
|
2998
|
+
const responseProject = hostedProjectSummaryFromState(response.state);
|
|
2999
|
+
const publishedProject = await applyRequestedProjectVisibility({
|
|
2443
3000
|
baseUrl,
|
|
2444
|
-
|
|
2445
|
-
|
|
2446
|
-
|
|
2447
|
-
|
|
2448
|
-
|
|
2449
|
-
|
|
2450
|
-
|
|
3001
|
+
projectId: responseProject.id,
|
|
3002
|
+
responseProject,
|
|
3003
|
+
visibility,
|
|
3004
|
+
});
|
|
3005
|
+
const nextOrigin = await writeWorkbenchOriginFromState(dir, {
|
|
3006
|
+
baseUrl,
|
|
3007
|
+
state: response.state,
|
|
3008
|
+
project: publishedProject,
|
|
3009
|
+
sourceFingerprint: state.source.fingerprint,
|
|
2451
3010
|
});
|
|
2452
3011
|
writeOutput({
|
|
2453
3012
|
ok: true,
|
|
2454
3013
|
action: "update",
|
|
2455
3014
|
changed: response.changed === true,
|
|
2456
3015
|
benchmark: publishedProject,
|
|
2457
|
-
|
|
2458
|
-
visibility,
|
|
3016
|
+
visibility: visibility ?? "unchanged",
|
|
2459
3017
|
origin: nextOrigin,
|
|
3018
|
+
source: response.source,
|
|
3019
|
+
runtime: response.runtime.stats,
|
|
2460
3020
|
urls: buildWorkbenchResourceUrls({
|
|
2461
3021
|
baseUrl,
|
|
2462
|
-
projectId: publishedProject.id ??
|
|
2463
|
-
|
|
2464
|
-
projectName: nextOrigin.project,
|
|
3022
|
+
projectId: publishedProject.id ?? responseProject.id,
|
|
3023
|
+
...originRemoteUrlParts(nextOrigin),
|
|
2465
3024
|
}),
|
|
2466
3025
|
}, parsed, io, (record) => {
|
|
2467
3026
|
const value = record;
|
|
2468
3027
|
return [
|
|
2469
|
-
`${value.changed ? "Pushed" : "Already up to date"} ${value.origin.
|
|
3028
|
+
`${value.changed ? "Pushed" : "Already up to date"} ${value.origin.remote} (${value.origin.projectId}).`,
|
|
2470
3029
|
`Open benchmark: ${value.urls.benchmark}`,
|
|
2471
3030
|
].join("\n");
|
|
2472
3031
|
});
|
|
2473
3032
|
return 0;
|
|
2474
3033
|
}
|
|
2475
|
-
async function
|
|
2476
|
-
const
|
|
3034
|
+
/**
 * Creates a hosted benchmark by POSTing a local project state, then records the resulting origin locally.
 *
 * @param args `{ baseUrl, dir, state }` — `state` is sent as the request body.
 * @returns `{ project, origin, result }`: the project summary derived from the response state,
 *   the origin written for `dir`, and the raw API response.
 */
async function createHostedBenchmarkFromState(args) {
    const request = {
        method: "POST",
        body: args.state,
    };
    const result = await apiRequest("/api/workbench/benchmarks/state", request, args.baseUrl);
    const project = hostedProjectSummaryFromState(result.state);
    const originInput = {
        baseUrl: args.baseUrl,
        state: result.state,
        project,
        // The fingerprint stored with the origin comes from the state that was sent, not from the response.
        sourceFingerprint: args.state.source.fingerprint,
    };
    const origin = await writeWorkbenchOriginFromState(args.dir, originInput);
    return { project, origin, result };
}
|
|
2496
|
-
async function
|
|
2497
|
-
|
|
2498
|
-
|
|
2499
|
-
return status.authenticated ? status.profile?.username ?? null : null;
|
|
2500
|
-
}
|
|
2501
|
-
function upstreamFromOrigin(origin) {
|
|
2502
|
-
if (!origin.owner || !origin.project || !origin.projectId || !origin.sourceRevisionId) {
|
|
2503
|
-
return undefined;
|
|
3048
|
+
/**
 * Applies an explicitly requested visibility to a hosted project.
 *
 * "public" issues PUT on the project's `/publish` path, "private" issues DELETE on it;
 * any other value makes no request.
 *
 * @param args `{ baseUrl, projectId, responseProject, visibility }`.
 * @returns The `benchmark` field of the publish response, or `args.responseProject` when no request is made.
 */
async function applyRequestedProjectVisibility(args) {
    let method;
    switch (args.visibility) {
        case "public":
            method = "PUT";
            break;
        case "private":
            method = "DELETE";
            break;
        default:
            // No visibility requested: keep the project exactly as the server reported it.
            return args.responseProject;
    }
    const response = await apiRequest(projectApiPath(args.projectId, "/publish"), { method }, args.baseUrl);
    return response.benchmark;
}
|
|
2512
|
-
function
|
|
3057
|
+
function readOptionalBenchmarkVisibility(value) {
|
|
2513
3058
|
if (value === undefined) {
|
|
2514
|
-
return
|
|
3059
|
+
return undefined;
|
|
2515
3060
|
}
|
|
2516
3061
|
if (value === "private" || value === "public") {
|
|
2517
3062
|
return value;
|
|
@@ -2524,41 +3069,37 @@ async function cloneProject(argv, io) {
|
|
|
2524
3069
|
const ref = readRequiredBenchmarkRef(parsed);
|
|
2525
3070
|
const outputDir = parsed.positionals[1] ?? ref.project;
|
|
2526
3071
|
if (parsed.positionals.length > 2) {
|
|
2527
|
-
throw new UsageError("workbench clone accepts OWNER/BENCHMARK
|
|
3072
|
+
throw new UsageError("workbench clone accepts OWNER/BENCHMARK and an optional output directory.");
|
|
2528
3073
|
}
|
|
2529
3074
|
const baseUrl = await effectiveBaseUrl();
|
|
2530
|
-
const
|
|
2531
|
-
const filesResponse = await apiRequest(publicProjectSourceApiPath(ref), {}, baseUrl);
|
|
3075
|
+
const state = await apiRequest(publicProjectStateApiPath(ref), {}, baseUrl);
|
|
2532
3076
|
if (parsed.flags["dry-run"] === true) {
|
|
2533
3077
|
writeOutput({
|
|
2534
3078
|
ok: true,
|
|
2535
3079
|
dryRun: true,
|
|
2536
3080
|
ref,
|
|
2537
3081
|
outputDir,
|
|
2538
|
-
fileCount:
|
|
3082
|
+
fileCount: state.source.files.length,
|
|
3083
|
+
runtime: runtimeBundleStats(state.runtime),
|
|
3084
|
+
sourceFingerprint: state.source.fingerprint ?? state.base.sourceFingerprint ?? null,
|
|
3085
|
+
runtimeFingerprint: state.base.runtimeFingerprint ?? null,
|
|
2539
3086
|
}, parsed, io, () => `Would clone ${formatBenchmarkRef(ref)} to ${outputDir}.`);
|
|
2540
3087
|
return 0;
|
|
2541
3088
|
}
|
|
2542
|
-
await
|
|
2543
|
-
|
|
2544
|
-
const sourceProject = filesResponse.benchmark;
|
|
2545
|
-
const origin = await writeWorkbenchOrigin(outputDir, {
|
|
3089
|
+
const applied = await applyProjectStateToLocal({
|
|
3090
|
+
dir: outputDir,
|
|
2546
3091
|
baseUrl,
|
|
2547
|
-
|
|
2548
|
-
project: sourceProject?.name ?? project.name,
|
|
2549
|
-
projectId: sourceProject?.id ?? project.id,
|
|
2550
|
-
writable: false,
|
|
2551
|
-
sourceRevisionId: sourceProject?.currentSpecVersionId ?? project.currentSpecVersionId,
|
|
2552
|
-
sourceFingerprint: sourceProject?.sourceFingerprint ?? project.sourceFingerprint,
|
|
3092
|
+
state,
|
|
2553
3093
|
});
|
|
2554
3094
|
writeOutput({
|
|
2555
3095
|
ok: true,
|
|
2556
|
-
origin,
|
|
3096
|
+
origin: applied.origin,
|
|
2557
3097
|
outputDir,
|
|
2558
|
-
files:
|
|
3098
|
+
files: applied.files,
|
|
3099
|
+
runtime: applied.runtime,
|
|
2559
3100
|
}, parsed, io, (record) => {
|
|
2560
3101
|
const value = record;
|
|
2561
|
-
return `Cloned ${value.origin.
|
|
3102
|
+
return `Cloned ${value.origin.remote} to ${value.outputDir} (${value.files} file(s)).`;
|
|
2562
3103
|
});
|
|
2563
3104
|
return 0;
|
|
2564
3105
|
}
|
|
@@ -2566,167 +3107,273 @@ async function pullProject(argv, io) {
|
|
|
2566
3107
|
const parsed = parseArgs(argv);
|
|
2567
3108
|
rejectUnknownFlags(parsed, new Set(["dir", "dry-run", "json"]));
|
|
2568
3109
|
if (parsed.positionals.length > 0) {
|
|
2569
|
-
throw new UsageError("workbench pull updates the current origin; use workbench clone OWNER/BENCHMARK
|
|
3110
|
+
throw new UsageError("workbench pull updates the current origin; use workbench clone OWNER/BENCHMARK DIR for a new directory.");
|
|
2570
3111
|
}
|
|
2571
3112
|
const dir = resolveDir(parsed);
|
|
2572
3113
|
const origin = await requireWorkbenchOrigin(dir);
|
|
2573
|
-
const
|
|
2574
|
-
|
|
2575
|
-
|
|
3114
|
+
const baseUrl = await effectiveBaseUrl(origin.baseUrl);
|
|
3115
|
+
const remoteRef = parseOriginRemote(origin);
|
|
3116
|
+
const state = await apiRequest(publicProjectStateApiPath(remoteRef), {}, baseUrl);
|
|
2576
3117
|
if (parsed.flags["dry-run"] === true) {
|
|
2577
3118
|
writeOutput({
|
|
2578
3119
|
ok: true,
|
|
2579
3120
|
dryRun: true,
|
|
2580
3121
|
dir,
|
|
2581
|
-
fileCount:
|
|
2582
|
-
|
|
3122
|
+
fileCount: state.source.files.length,
|
|
3123
|
+
runtime: runtimeBundleStats(state.runtime),
|
|
3124
|
+
sourceFingerprint: state.source.fingerprint ?? state.base.sourceFingerprint ?? null,
|
|
3125
|
+
runtimeFingerprint: state.base.runtimeFingerprint ?? null,
|
|
3126
|
+
}, parsed, io, () => `Would pull ${state.source.files.length} source file(s) and runtime history into ${dir}.`);
|
|
2583
3127
|
return 0;
|
|
2584
3128
|
}
|
|
2585
|
-
await
|
|
2586
|
-
|
|
2587
|
-
|
|
2588
|
-
|
|
2589
|
-
|
|
2590
|
-
|
|
2591
|
-
...(sourceProject?.id ? { projectId: sourceProject.id } : {}),
|
|
2592
|
-
...(sourceProject?.currentSpecVersionId ? { sourceRevisionId: sourceProject.currentSpecVersionId } : {}),
|
|
2593
|
-
...(sourceProject?.sourceFingerprint ? { sourceFingerprint: sourceProject.sourceFingerprint } : {}),
|
|
3129
|
+
const applied = await applyProjectStateToLocal({
|
|
3130
|
+
dir,
|
|
3131
|
+
baseUrl,
|
|
3132
|
+
state,
|
|
3133
|
+
origin,
|
|
3134
|
+
requireCleanSource: true,
|
|
2594
3135
|
});
|
|
2595
3136
|
writeOutput({
|
|
2596
3137
|
ok: true,
|
|
2597
|
-
origin:
|
|
3138
|
+
origin: applied.origin,
|
|
2598
3139
|
dir,
|
|
2599
|
-
files:
|
|
3140
|
+
files: applied.files,
|
|
3141
|
+
runtime: applied.runtime,
|
|
2600
3142
|
}, parsed, io, (record) => {
|
|
2601
3143
|
const value = record;
|
|
2602
3144
|
return `Pulled ${value.files} source file(s) into ${value.dir}.`;
|
|
2603
3145
|
});
|
|
2604
3146
|
return 0;
|
|
2605
3147
|
}
|
|
2606
|
-
async function
|
|
2607
|
-
|
|
2608
|
-
|
|
2609
|
-
if (parsed.positionals.length > 0) {
|
|
2610
|
-
throw new UsageError("workbench fetch updates the current remote cache; use workbench clone OWNER/BENCHMARK[@REF] DIR for a new directory.");
|
|
3148
|
+
async function applyProjectStateToLocal(args) {
|
|
3149
|
+
if (args.requireCleanSource === true && args.origin) {
|
|
3150
|
+
await assertLocalSourceMatchesOrigin(args.dir, args.origin);
|
|
2611
3151
|
}
|
|
2612
|
-
|
|
2613
|
-
const
|
|
2614
|
-
const
|
|
2615
|
-
|
|
2616
|
-
|
|
2617
|
-
|
|
2618
|
-
await writeFiles(path.join(fetchRoot, "source"), filesResponse.files);
|
|
2619
|
-
const sourceProject = filesResponse.benchmark;
|
|
2620
|
-
const nextOrigin = await writeWorkbenchOrigin(dir, {
|
|
2621
|
-
...origin,
|
|
2622
|
-
...(sourceProject?.ownerUsername ? { owner: sourceProject.ownerUsername } : {}),
|
|
2623
|
-
...(sourceProject?.name ? { project: sourceProject.name } : {}),
|
|
2624
|
-
...(sourceProject?.id ? { projectId: sourceProject.id } : {}),
|
|
2625
|
-
...(sourceProject?.currentSpecVersionId ? { sourceRevisionId: sourceProject.currentSpecVersionId } : {}),
|
|
2626
|
-
...(sourceProject?.sourceFingerprint ? { sourceFingerprint: sourceProject.sourceFingerprint } : {}),
|
|
3152
|
+
await syncSourceFiles(args.dir, args.state.source.files);
|
|
3153
|
+
const runtimeImport = await importLocalRuntimeBundle(args.dir, args.state.runtime);
|
|
3154
|
+
const origin = await writeWorkbenchOriginFromState(args.dir, {
|
|
3155
|
+
baseUrl: args.baseUrl,
|
|
3156
|
+
state: args.state,
|
|
3157
|
+
sourceFingerprint: await localSourceFingerprint(args.dir),
|
|
2627
3158
|
});
|
|
2628
|
-
|
|
2629
|
-
|
|
2630
|
-
|
|
2631
|
-
|
|
2632
|
-
}
|
|
2633
|
-
|
|
3159
|
+
return {
|
|
3160
|
+
origin,
|
|
3161
|
+
files: args.state.source.files.length,
|
|
3162
|
+
runtime: runtimeImport.stats,
|
|
3163
|
+
};
|
|
3164
|
+
}
|
|
3165
|
+
async function retryHostedWorkflow(argv, io) {
|
|
3166
|
+
const parsed = parseArgs(argv);
|
|
3167
|
+
rejectUnknownFlags(parsed, new Set([
|
|
3168
|
+
"dir",
|
|
3169
|
+
"benchmark",
|
|
3170
|
+
"watch",
|
|
3171
|
+
"interval-ms",
|
|
3172
|
+
"timeout-ms",
|
|
3173
|
+
"json",
|
|
3174
|
+
]));
|
|
3175
|
+
rejectUnexpectedPositionals(parsed, "workbench retry --hosted", 1);
|
|
3176
|
+
const targetId = parsed.positionals[0];
|
|
3177
|
+
if (!targetId) {
|
|
3178
|
+
throw new UsageError("Missing required TARGET_ID.");
|
|
3179
|
+
}
|
|
3180
|
+
if (parsed.flags.watch !== true && (parsed.flags["interval-ms"] !== undefined ||
|
|
3181
|
+
parsed.flags["timeout-ms"] !== undefined)) {
|
|
3182
|
+
throw new UsageError("--interval-ms and --timeout-ms require --watch.");
|
|
3183
|
+
}
|
|
3184
|
+
const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
|
|
3185
|
+
const retryTarget = await resolveHostedRetryTarget(target, targetId);
|
|
3186
|
+
const watchIntervalMs = parsed.flags.watch === true
|
|
3187
|
+
? parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms")
|
|
3188
|
+
: undefined;
|
|
3189
|
+
const watchTimeoutMs = parsed.flags.watch === true
|
|
3190
|
+
? parseOptionalPositiveInt(parsed.flags["timeout-ms"], "timeout-ms")
|
|
3191
|
+
: undefined;
|
|
3192
|
+
const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {
|
|
3193
|
+
method: "POST",
|
|
3194
|
+
body: retryTarget.request,
|
|
3195
|
+
}, target.baseUrl);
|
|
3196
|
+
const startedRun = withRunUrls(target, response.run);
|
|
3197
|
+
if (parsed.flags.watch === true) {
|
|
3198
|
+
if (parsed.flags.json !== true) {
|
|
3199
|
+
io.stdout.write(`${formatHostedRunStarted(startedRun, retryTarget.workflow).trimEnd()}\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
|
|
3200
|
+
}
|
|
3201
|
+
const watched = await watchHostedRun({
|
|
3202
|
+
parsed,
|
|
3203
|
+
target,
|
|
3204
|
+
runId: response.run.id,
|
|
3205
|
+
intervalMs: watchIntervalMs ?? 1000,
|
|
3206
|
+
timeoutMs: watchTimeoutMs,
|
|
3207
|
+
});
|
|
3208
|
+
const outputRun = withRunUrls(target, await withHostedRunFailureSummary(target, watched));
|
|
3209
|
+
await tryImportTerminalHostedProjectState({ target, io });
|
|
3210
|
+
const result = {
|
|
3211
|
+
ok: hostedRunSucceeded(watched),
|
|
3212
|
+
retried: {
|
|
3213
|
+
id: retryTarget.sourceId,
|
|
3214
|
+
kind: retryTarget.sourceKind,
|
|
3215
|
+
workflow: retryTarget.workflow,
|
|
3216
|
+
},
|
|
3217
|
+
runId: outputRun.id,
|
|
3218
|
+
candidateId: outputRun.outputCandidateId ?? outputRun.candidateId,
|
|
3219
|
+
activeCandidateId: outputRun.activeCandidateId ?? null,
|
|
3220
|
+
run: outputRun,
|
|
3221
|
+
...(outputRun.urls ? { urls: outputRun.urls } : {}),
|
|
3222
|
+
...(outputRun.failedJobCount !== undefined ? { failedJobCount: outputRun.failedJobCount } : {}),
|
|
3223
|
+
...(outputRun.error ? { error: outputRun.error } : {}),
|
|
3224
|
+
};
|
|
3225
|
+
writeOutput(result, parsed, io, formatRetryCommandResult);
|
|
3226
|
+
return hostedRunSucceeded(watched) ? 0 : 1;
|
|
3227
|
+
}
|
|
3228
|
+
const result = {
|
|
2634
3229
|
ok: true,
|
|
2635
|
-
|
|
2636
|
-
|
|
2637
|
-
|
|
2638
|
-
|
|
2639
|
-
|
|
2640
|
-
|
|
2641
|
-
|
|
2642
|
-
|
|
3230
|
+
retried: {
|
|
3231
|
+
id: retryTarget.sourceId,
|
|
3232
|
+
kind: retryTarget.sourceKind,
|
|
3233
|
+
workflow: retryTarget.workflow,
|
|
3234
|
+
},
|
|
3235
|
+
runId: startedRun.id,
|
|
3236
|
+
candidateId: startedRun.outputCandidateId ?? startedRun.candidateId,
|
|
3237
|
+
activeCandidateId: startedRun.activeCandidateId ?? null,
|
|
3238
|
+
run: startedRun,
|
|
3239
|
+
...(startedRun.urls ? { urls: startedRun.urls } : {}),
|
|
3240
|
+
};
|
|
3241
|
+
writeOutput(result, parsed, io, formatRetryCommandResult);
|
|
2643
3242
|
return 0;
|
|
2644
3243
|
}
|
|
2645
|
-
async function
|
|
2646
|
-
|
|
2647
|
-
|
|
2648
|
-
|
|
3244
|
+
async function resolveHostedRetryTarget(target, targetId) {
|
|
3245
|
+
if (targetId.startsWith("eval_")) {
|
|
3246
|
+
return await resolveHostedEvaluationRetryTarget(target, targetId);
|
|
3247
|
+
}
|
|
3248
|
+
const detail = await readHostedRunDetail(target, targetId);
|
|
3249
|
+
const run = detail.run;
|
|
3250
|
+
if (run.status !== "finished") {
|
|
3251
|
+
throw new UsageError(`Run ${run.id} is ${run.status}; wait for it to finish before retrying.`);
|
|
3252
|
+
}
|
|
3253
|
+
if (!hostedRunRecordFailed(run)) {
|
|
3254
|
+
throw new UsageError(`Run ${run.id} did not fail; use workbench ${run.workflow ?? "eval"} --hosted to intentionally run it again.`);
|
|
3255
|
+
}
|
|
3256
|
+
if (run.workflow === "eval") {
|
|
3257
|
+
const candidateId = hostedRunEvaluationCandidateId(run, detail.jobs);
|
|
3258
|
+
if (!candidateId) {
|
|
3259
|
+
throw new UsageError(`Run ${run.id} has no candidate id to retry.`);
|
|
3260
|
+
}
|
|
3261
|
+
return {
|
|
3262
|
+
sourceId: targetId,
|
|
3263
|
+
sourceKind: "run",
|
|
3264
|
+
workflow: "eval",
|
|
3265
|
+
request: {
|
|
3266
|
+
workflow: "eval",
|
|
3267
|
+
samples: run.samples ?? 1,
|
|
3268
|
+
candidateId,
|
|
3269
|
+
sourceYaml: hostedRetrySourceYaml(run, run.id),
|
|
3270
|
+
preserveActive: true,
|
|
3271
|
+
...retrySampleSelectionFromJobs(detail.jobs),
|
|
3272
|
+
},
|
|
3273
|
+
};
|
|
3274
|
+
}
|
|
3275
|
+
if (run.workflow === "improve") {
|
|
3276
|
+
const baseCandidateId = stringValue(readRecord(run.input)?.baseCandidateId);
|
|
3277
|
+
if (!baseCandidateId) {
|
|
3278
|
+
throw new UsageError(`Run ${run.id} is missing its base candidate id.`);
|
|
3279
|
+
}
|
|
3280
|
+
return {
|
|
3281
|
+
sourceId: targetId,
|
|
3282
|
+
sourceKind: "run",
|
|
3283
|
+
workflow: "improve",
|
|
3284
|
+
request: {
|
|
3285
|
+
workflow: "improve",
|
|
3286
|
+
samples: run.samples ?? 1,
|
|
3287
|
+
budget: run.budget ?? run.attemptsRequested ?? 1,
|
|
3288
|
+
candidateId: baseCandidateId,
|
|
3289
|
+
sourceYaml: hostedRetrySourceYaml(run, run.id),
|
|
3290
|
+
preserveActive: true,
|
|
3291
|
+
},
|
|
3292
|
+
};
|
|
3293
|
+
}
|
|
3294
|
+
throw new UsageError(`Run ${run.id} has no retryable workflow.`);
|
|
2649
3295
|
}
|
|
2650
|
-
async function
|
|
2651
|
-
const
|
|
2652
|
-
|
|
2653
|
-
|
|
2654
|
-
|
|
2655
|
-
case "add":
|
|
2656
|
-
return await remoteAdd(argv.slice(1), io, "add");
|
|
2657
|
-
case "set-url":
|
|
2658
|
-
return await remoteAdd(argv.slice(1), io, "set-url");
|
|
2659
|
-
case "remove":
|
|
2660
|
-
return await remoteRemove(argv.slice(1), io);
|
|
2661
|
-
default:
|
|
2662
|
-
throw new UsageError(`Unknown command: remote ${argv.join(" ")}`);
|
|
3296
|
+
async function resolveHostedEvaluationRetryTarget(target, evaluationId) {
|
|
3297
|
+
const snapshot = await apiRequest(projectApiPath(target.projectId, "/workbench/snapshot"), {}, target.baseUrl);
|
|
3298
|
+
const evaluation = snapshot.evaluations.find((entry) => entry.id === evaluationId);
|
|
3299
|
+
if (!evaluation) {
|
|
3300
|
+
throw new UsageError(`Hosted evaluation not found: ${evaluationId}`);
|
|
2663
3301
|
}
|
|
3302
|
+
const run = snapshot.runs.find((entry) => entry.id === evaluation.runId) ?? null;
|
|
3303
|
+
if (!evaluationScorecardFailed(evaluation, run)) {
|
|
3304
|
+
throw new UsageError(`Evaluation ${evaluation.id} did not fail; use workbench eval --hosted to intentionally run it again.`);
|
|
3305
|
+
}
|
|
3306
|
+
if (!run) {
|
|
3307
|
+
throw new UsageError(`Evaluation ${evaluation.id} is missing its run record.`);
|
|
3308
|
+
}
|
|
3309
|
+
const detail = await readHostedRunDetail(target, run.id);
|
|
3310
|
+
const detailedRun = detail.run;
|
|
3311
|
+
return {
|
|
3312
|
+
sourceId: evaluationId,
|
|
3313
|
+
sourceKind: "evaluation",
|
|
3314
|
+
workflow: "eval",
|
|
3315
|
+
request: {
|
|
3316
|
+
workflow: "eval",
|
|
3317
|
+
samples: evaluation.sampleCount || detailedRun.samples || 1,
|
|
3318
|
+
candidateId: evaluation.candidateId,
|
|
3319
|
+
sourceYaml: hostedRetrySourceYaml(detailedRun, detailedRun.id),
|
|
3320
|
+
preserveActive: true,
|
|
3321
|
+
...retrySampleSelectionFromJobs(detail.jobs),
|
|
3322
|
+
},
|
|
3323
|
+
};
|
|
2664
3324
|
}
|
|
2665
|
-
|
|
2666
|
-
const
|
|
2667
|
-
|
|
2668
|
-
|
|
2669
|
-
|
|
2670
|
-
|
|
2671
|
-
|
|
2672
|
-
|
|
2673
|
-
|
|
2674
|
-
`writable\t${value.origin.writable ? "yes" : "no"}`,
|
|
2675
|
-
...(value.origin.sourceFingerprint ? [`fingerprint\t${value.origin.sourceFingerprint}`] : []),
|
|
2676
|
-
].join("\n");
|
|
2677
|
-
});
|
|
2678
|
-
return 0;
|
|
3325
|
+
function retrySampleSelectionFromJobs(jobs) {
|
|
3326
|
+
const selectedSamples = uniqueCaseSamplePairs(jobs
|
|
3327
|
+
.filter((job) => job.status !== "succeeded" &&
|
|
3328
|
+
executionPurposeFromJobInput(job.input) === "attempt")
|
|
3329
|
+
.map(caseSamplePairFromJob)
|
|
3330
|
+
.filter((pair) => pair !== null));
|
|
3331
|
+
return selectedSamples.length > 0
|
|
3332
|
+
? { selectedSamples }
|
|
3333
|
+
: {};
|
|
2679
3334
|
}
|
|
2680
|
-
|
|
2681
|
-
const
|
|
2682
|
-
|
|
2683
|
-
|
|
2684
|
-
|
|
2685
|
-
|
|
3335
|
+
function uniqueCaseSamplePairs(pairs) {
|
|
3336
|
+
const byKey = new Map();
|
|
3337
|
+
for (const pair of pairs) {
|
|
3338
|
+
byKey.set(caseSamplePairKey(pair), pair);
|
|
3339
|
+
}
|
|
3340
|
+
return [...byKey.values()].sort((left, right) => left.caseId.localeCompare(right.caseId) ||
|
|
3341
|
+
left.sampleIndex - right.sampleIndex);
|
|
3342
|
+
}
|
|
3343
|
+
async function readHostedRunDetail(target, runId) {
|
|
3344
|
+
return await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`), {}, target.baseUrl);
|
|
3345
|
+
}
|
|
3346
|
+
async function tryImportTerminalHostedProjectState(args) {
|
|
3347
|
+
const origin = args.target.origin;
|
|
3348
|
+
if (!origin || origin.projectId !== args.target.projectId) {
|
|
3349
|
+
return;
|
|
3350
|
+
}
|
|
3351
|
+
try {
|
|
3352
|
+
const state = await apiRequest(projectApiPath(args.target.projectId, "/state"), {}, args.target.baseUrl);
|
|
3353
|
+
await applyProjectStateToLocal({
|
|
3354
|
+
dir: args.target.dir,
|
|
3355
|
+
baseUrl: args.target.baseUrl,
|
|
3356
|
+
state,
|
|
3357
|
+
origin,
|
|
3358
|
+
requireCleanSource: true,
|
|
3359
|
+
});
|
|
3360
|
+
}
|
|
3361
|
+
catch (error) {
|
|
3362
|
+
args.io.stderr.write(`Hosted run finished, but local project state was not updated: ${errorMessage(error)}\n`);
|
|
2686
3363
|
}
|
|
2687
|
-
const ref = parseBenchmarkRef(refValue);
|
|
2688
|
-
const baseUrl = await effectiveBaseUrl();
|
|
2689
|
-
const project = await resolveRemoteProject(formatBenchmarkRef(ref), baseUrl);
|
|
2690
|
-
const origin = await writeWorkbenchOrigin(resolveDir(parsed), {
|
|
2691
|
-
baseUrl,
|
|
2692
|
-
owner: project.ownerUsername ?? ref.owner,
|
|
2693
|
-
project: project.name ?? ref.project,
|
|
2694
|
-
projectId: project.id,
|
|
2695
|
-
writable: false,
|
|
2696
|
-
...(project.currentSpecVersionId ? { sourceRevisionId: project.currentSpecVersionId } : {}),
|
|
2697
|
-
...(project.sourceFingerprint ? { sourceFingerprint: project.sourceFingerprint } : {}),
|
|
2698
|
-
});
|
|
2699
|
-
writeOutput({ ok: true, remote: "origin", origin }, parsed, io, () => `Set origin to ${origin.owner}/${origin.project}.`);
|
|
2700
|
-
return 0;
|
|
2701
|
-
}
|
|
2702
|
-
async function remoteRemove(argv, io) {
|
|
2703
|
-
const parsed = parseArgs(argv);
|
|
2704
|
-
rejectUnknownFlags(parsed, new Set(["dir", "json"]));
|
|
2705
|
-
const [name] = parsed.positionals;
|
|
2706
|
-
if (name !== "origin" || parsed.positionals.length !== 1) {
|
|
2707
|
-
throw new UsageError("workbench remote remove accepts: origin.");
|
|
2708
|
-
}
|
|
2709
|
-
const originPath = workbenchOriginPath(resolveDir(parsed));
|
|
2710
|
-
const existed = await fileIsReadable(originPath);
|
|
2711
|
-
await fs.rm(originPath, { force: true });
|
|
2712
|
-
writeOutput({ ok: true, remote: "origin", removed: existed, path: originPath }, parsed, io, () => existed
|
|
2713
|
-
? `Removed origin (${originPath}).`
|
|
2714
|
-
: `No origin configured (${originPath}).`);
|
|
2715
|
-
return 0;
|
|
2716
3364
|
}
|
|
2717
|
-
|
|
2718
|
-
const
|
|
2719
|
-
|
|
2720
|
-
|
|
2721
|
-
if (parsed.positionals.length > 1) {
|
|
2722
|
-
throw new UsageError(`${starred ? "workbench cloud star" : "workbench cloud unstar"} accepts exactly one OWNER/BENCHMARK ref.`);
|
|
3365
|
+
function hostedRetrySourceYaml(run, runId) {
|
|
3366
|
+
const sourceYaml = stringValue(readRecord(run.input)?.sourceYaml);
|
|
3367
|
+
if (!sourceYaml) {
|
|
3368
|
+
throw new UsageError(`Run ${runId} is missing its recorded source configuration.`);
|
|
2723
3369
|
}
|
|
2724
|
-
|
|
2725
|
-
|
|
2726
|
-
|
|
2727
|
-
|
|
2728
|
-
|
|
2729
|
-
|
|
3370
|
+
return sourceYaml;
|
|
3371
|
+
}
|
|
3372
|
+
function hostedRunRecordFailed(run) {
|
|
3373
|
+
return run.outcome === "error" ||
|
|
3374
|
+
run.outcome === "cancelled" ||
|
|
3375
|
+
(run.failedJobCount ?? 0) > 0 ||
|
|
3376
|
+
Boolean(run.error);
|
|
2730
3377
|
}
|
|
2731
3378
|
async function startHostedWorkflow(workflow, argv, io) {
|
|
2732
3379
|
const parsed = parseArgs(argv);
|
|
@@ -2734,9 +3381,10 @@ async function startHostedWorkflow(workflow, argv, io) {
|
|
|
2734
3381
|
"dir",
|
|
2735
3382
|
"benchmark",
|
|
2736
3383
|
"base",
|
|
2737
|
-
"
|
|
3384
|
+
"runs",
|
|
2738
3385
|
"budget",
|
|
2739
3386
|
"samples",
|
|
3387
|
+
"rerun",
|
|
2740
3388
|
"watch",
|
|
2741
3389
|
"dry-run",
|
|
2742
3390
|
"interval-ms",
|
|
@@ -2744,44 +3392,68 @@ async function startHostedWorkflow(workflow, argv, io) {
|
|
|
2744
3392
|
"json",
|
|
2745
3393
|
]));
|
|
2746
3394
|
if (parsed.positionals.length > 1) {
|
|
2747
|
-
throw new UsageError(`workbench
|
|
3395
|
+
throw new UsageError(`workbench ${workflow} --hosted accepts at most one source file or directory argument.`);
|
|
2748
3396
|
}
|
|
2749
|
-
const
|
|
2750
|
-
const
|
|
2751
|
-
|
|
2752
|
-
|
|
3397
|
+
const sourceArg = resolveSourceDir(parsed);
|
|
3398
|
+
const samples = parsePositiveInt(parsed.flags.samples, 1, "samples");
|
|
3399
|
+
const budget = workflow === "improve"
|
|
3400
|
+
? parsePositiveInt(parsed.flags.budget, 1, "budget")
|
|
3401
|
+
: undefined;
|
|
3402
|
+
if (parsed.flags.watch !== true && (parsed.flags["interval-ms"] !== undefined ||
|
|
3403
|
+
parsed.flags["timeout-ms"] !== undefined)) {
|
|
3404
|
+
throw new UsageError("--interval-ms and --timeout-ms require --watch.");
|
|
2753
3405
|
}
|
|
2754
|
-
const
|
|
3406
|
+
const runsFlag = asOptionalString(parsed.flags.runs);
|
|
3407
|
+
const defaultProjectSource = await readLocalProjectSource(path.resolve(sourceArg));
|
|
3408
|
+
const selectedRunIds = workflow === "eval"
|
|
3409
|
+
? resolveCandidateRunSelection(defaultProjectSource, runsFlag)
|
|
3410
|
+
: [singleRequestedRunId(runsFlag, `workbench ${workflow} --hosted`) ?? defaultProjectSource.candidateRunId];
|
|
3411
|
+
if (workflow === "eval" && selectedRunIds.length > 1) {
|
|
3412
|
+
let failed = 0;
|
|
3413
|
+
const results = [];
|
|
3414
|
+
for (const runId of selectedRunIds) {
|
|
3415
|
+
const captured = createCapturingIo(io);
|
|
3416
|
+
const code = await startHostedWorkflow(workflow, hostedWorkflowArgsForRun({
|
|
3417
|
+
parsed,
|
|
3418
|
+
sourceDir: defaultProjectSource.dir,
|
|
3419
|
+
runId,
|
|
3420
|
+
}), captured.io);
|
|
3421
|
+
if (code !== 0) {
|
|
3422
|
+
failed += 1;
|
|
3423
|
+
}
|
|
3424
|
+
results.push(parseCapturedJson(captured.stdoutText()));
|
|
3425
|
+
}
|
|
3426
|
+
writeOutput({
|
|
3427
|
+
ok: failed === 0,
|
|
3428
|
+
candidateRunIds: selectedRunIds,
|
|
3429
|
+
failedRunCount: failed,
|
|
3430
|
+
results,
|
|
3431
|
+
}, parsed, io, () => `Processed ${selectedRunIds.length} hosted candidate run(s); ${failed} failed.`);
|
|
3432
|
+
return failed === 0 ? 0 : 1;
|
|
3433
|
+
}
|
|
3434
|
+
const baseCandidateId = asOptionalString(parsed.flags.base);
|
|
2755
3435
|
const request = workflow === "improve"
|
|
2756
3436
|
? {
|
|
2757
3437
|
workflow,
|
|
2758
|
-
budget
|
|
2759
|
-
samples
|
|
2760
|
-
...(
|
|
3438
|
+
budget,
|
|
3439
|
+
samples,
|
|
3440
|
+
...(baseCandidateId ? { candidateId: baseCandidateId } : {}),
|
|
2761
3441
|
}
|
|
2762
3442
|
: {
|
|
2763
3443
|
workflow,
|
|
2764
|
-
samples
|
|
2765
|
-
...(
|
|
3444
|
+
samples,
|
|
3445
|
+
...(baseCandidateId ? { candidateId: baseCandidateId } : {}),
|
|
2766
3446
|
};
|
|
2767
|
-
|
|
2768
|
-
|
|
3447
|
+
const projectSource = selectedRunIds[0] === defaultProjectSource.candidateRunId
|
|
3448
|
+
? defaultProjectSource
|
|
3449
|
+
: await readLocalProjectSource(path.resolve(sourceArg), { runId: selectedRunIds[0] });
|
|
3450
|
+
request.sourceYaml = projectSource.specSource;
|
|
3451
|
+
request.adapterFiles = projectSource.adapterFiles;
|
|
3452
|
+
if (workflow === "eval" && !baseCandidateId) {
|
|
3453
|
+
request.candidateFiles = projectSource.candidateFiles;
|
|
2769
3454
|
}
|
|
2770
|
-
if (parsed.flags.
|
|
2771
|
-
|
|
2772
|
-
throw new UsageError("--interval-ms and --timeout-ms require --watch.");
|
|
2773
|
-
}
|
|
2774
|
-
const projectSource = await readLocalProjectSource(path.resolve(sourceArg), {
|
|
2775
|
-
optimizerPath,
|
|
2776
|
-
});
|
|
2777
|
-
if (workflow === "eval") {
|
|
2778
|
-
request.subjectSource = projectSource.subjectSource;
|
|
2779
|
-
request.subjectFiles = projectSource.subjectFiles;
|
|
2780
|
-
request.adapterFiles = projectSource.adapterFiles;
|
|
2781
|
-
}
|
|
2782
|
-
if (workflow === "improve" && projectSource.optimizerSource) {
|
|
2783
|
-
request.optimizerSource = projectSource.optimizerSource;
|
|
2784
|
-
request.adapterFiles = projectSource.adapterFiles;
|
|
3455
|
+
if (parsed.flags.rerun === true) {
|
|
3456
|
+
request.rerun = true;
|
|
2785
3457
|
}
|
|
2786
3458
|
const watchIntervalMs = parsed.flags.watch === true
|
|
2787
3459
|
? parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms")
|
|
@@ -2808,13 +3480,16 @@ async function startHostedWorkflow(workflow, argv, io) {
|
|
|
2808
3480
|
sourceDir: projectSource.dir,
|
|
2809
3481
|
});
|
|
2810
3482
|
if (workflow === "improve") {
|
|
2811
|
-
request.
|
|
3483
|
+
request.candidateId = await ensureHostedImproveBaseCandidate({
|
|
2812
3484
|
parsed,
|
|
2813
3485
|
target,
|
|
2814
3486
|
samples: request.samples,
|
|
2815
|
-
|
|
3487
|
+
candidateId: baseCandidateId,
|
|
3488
|
+
sourceYaml: projectSource.specSource,
|
|
3489
|
+
adapterFiles: projectSource.adapterFiles,
|
|
2816
3490
|
intervalMs: watchIntervalMs ?? 1000,
|
|
2817
3491
|
timeoutMs: watchTimeoutMs,
|
|
3492
|
+
io,
|
|
2818
3493
|
});
|
|
2819
3494
|
}
|
|
2820
3495
|
const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {
|
|
@@ -2822,6 +3497,20 @@ async function startHostedWorkflow(workflow, argv, io) {
|
|
|
2822
3497
|
body: request,
|
|
2823
3498
|
}, target.baseUrl);
|
|
2824
3499
|
const startedRun = withRunUrls(target, response.run);
|
|
3500
|
+
const startedRunOutput = response.reused === true
|
|
3501
|
+
? { ...startedRun, reused: true }
|
|
3502
|
+
: startedRun;
|
|
3503
|
+
if (response.reused === true && response.run.status === "finished") {
|
|
3504
|
+
await tryImportTerminalHostedProjectState({ target, io });
|
|
3505
|
+
writeOutput({
|
|
3506
|
+
ok: hostedRunSucceeded(response.run),
|
|
3507
|
+
reused: true,
|
|
3508
|
+
workflow,
|
|
3509
|
+
runId: startedRun.id,
|
|
3510
|
+
...startedRun,
|
|
3511
|
+
}, parsed, io, () => `Reused hosted ${workflow} ${startedRun.id}. Use --rerun to intentionally run it again.`);
|
|
3512
|
+
return hostedRunSucceeded(response.run) ? 0 : 1;
|
|
3513
|
+
}
|
|
2825
3514
|
if (parsed.flags.watch === true) {
|
|
2826
3515
|
if (parsed.flags.json !== true) {
|
|
2827
3516
|
io.stdout.write(`${formatHostedRunStarted(startedRun, workflow).trimEnd()}\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
|
|
@@ -2834,26 +3523,27 @@ async function startHostedWorkflow(workflow, argv, io) {
|
|
|
2834
3523
|
timeoutMs: watchTimeoutMs,
|
|
2835
3524
|
});
|
|
2836
3525
|
const outputRun = await withHostedRunFailureSummary(target, watched);
|
|
3526
|
+
await tryImportTerminalHostedProjectState({ target, io });
|
|
2837
3527
|
writeOutput(withRunUrls(target, outputRun), parsed, io, formatHostedRunResult);
|
|
2838
3528
|
return hostedRunSucceeded(watched) ? 0 : 1;
|
|
2839
3529
|
}
|
|
2840
|
-
writeOutput(
|
|
3530
|
+
writeOutput(startedRunOutput, parsed, io, (run) => formatHostedRunStarted(run, workflow).trimEnd());
|
|
2841
3531
|
return 0;
|
|
2842
3532
|
}
|
|
2843
|
-
async function
|
|
2844
|
-
if (args.
|
|
2845
|
-
const
|
|
2846
|
-
if (!
|
|
2847
|
-
throw new UsageError(`Base
|
|
3533
|
+
async function ensureHostedImproveBaseCandidate(args) {
|
|
3534
|
+
if (args.candidateId) {
|
|
3535
|
+
const candidate = await readHostedCandidateSummary(args.target, args.candidateId);
|
|
3536
|
+
if (!candidate) {
|
|
3537
|
+
throw new UsageError(`Base candidate ${args.candidateId} was not found for the current benchmark.`);
|
|
2848
3538
|
}
|
|
2849
|
-
if (
|
|
2850
|
-
return args.
|
|
3539
|
+
if (hostedCandidateIsEvaluated(candidate)) {
|
|
3540
|
+
return args.candidateId;
|
|
2851
3541
|
}
|
|
2852
3542
|
}
|
|
2853
3543
|
else {
|
|
2854
|
-
const
|
|
2855
|
-
if (
|
|
2856
|
-
return
|
|
3544
|
+
const activeCandidate = await readEvaluatedActiveHostedCandidate(args.target);
|
|
3545
|
+
if (activeCandidate) {
|
|
3546
|
+
return activeCandidate.id;
|
|
2857
3547
|
}
|
|
2858
3548
|
}
|
|
2859
3549
|
const response = await apiRequest(projectApiPath(args.target.projectId, "/runs"), {
|
|
@@ -2861,7 +3551,9 @@ async function ensureHostedImproveBaseSubject(args) {
|
|
|
2861
3551
|
body: {
|
|
2862
3552
|
workflow: "eval",
|
|
2863
3553
|
samples: args.samples,
|
|
2864
|
-
...(args.
|
|
3554
|
+
...(args.candidateId ? { candidateId: args.candidateId } : {}),
|
|
3555
|
+
sourceYaml: args.sourceYaml,
|
|
3556
|
+
...(args.adapterFiles.length > 0 ? { adapterFiles: args.adapterFiles } : {}),
|
|
2865
3557
|
},
|
|
2866
3558
|
}, args.target.baseUrl);
|
|
2867
3559
|
const watched = await watchHostedRun({
|
|
@@ -2872,333 +3564,59 @@ async function ensureHostedImproveBaseSubject(args) {
|
|
|
2872
3564
|
timeoutMs: args.timeoutMs,
|
|
2873
3565
|
});
|
|
2874
3566
|
if (!hostedRunSucceeded(watched)) {
|
|
2875
|
-
throw new UsageError(`Parent
|
|
3567
|
+
throw new UsageError(`Parent candidate eval ${watched.id} failed; improve was not started.`);
|
|
2876
3568
|
}
|
|
2877
|
-
if (!watched.
|
|
2878
|
-
throw new UsageError(`Parent
|
|
2879
|
-
}
|
|
2880
|
-
return watched.subjectId;
|
|
2881
|
-
}
|
|
2882
|
-
async function readHostedSubjectSummary(target, subjectId) {
|
|
2883
|
-
const response = await apiRequest(projectApiPath(target.projectId, "/subjects"), {}, target.baseUrl);
|
|
2884
|
-
return response.subjects.find((entry) => entry.id === subjectId) ?? null;
|
|
2885
|
-
}
|
|
2886
|
-
async function readEvaluatedActiveHostedSubject(target) {
|
|
2887
|
-
const response = await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl);
|
|
2888
|
-
const activeSubjectId = response.benchmark.activeSubjectId;
|
|
2889
|
-
if (!activeSubjectId) {
|
|
2890
|
-
return null;
|
|
3569
|
+
if (!watched.candidateId) {
|
|
3570
|
+
throw new UsageError(`Parent candidate eval ${watched.id} did not produce a candidate.`);
|
|
2891
3571
|
}
|
|
2892
|
-
|
|
2893
|
-
return
|
|
2894
|
-
}
|
|
2895
|
-
function hostedSubjectIsEvaluated(subject) {
|
|
2896
|
-
return subject.status === "evaluated" || subject.eval != null;
|
|
3572
|
+
await tryImportTerminalHostedProjectState({ target: args.target, io: args.io });
|
|
3573
|
+
return watched.candidateId;
|
|
2897
3574
|
}
|
|
2898
|
-
|
|
2899
|
-
const
|
|
2900
|
-
|
|
2901
|
-
|
|
2902
|
-
|
|
2903
|
-
|
|
2904
|
-
|
|
2905
|
-
|
|
2906
|
-
|
|
2907
|
-
|
|
2908
|
-
.map((project) => `${project.id}\t${project.name}\t${project.runCount} runs\t${project.subjectCount} subjects`)
|
|
2909
|
-
.join("\n");
|
|
2910
|
-
});
|
|
2911
|
-
return 0;
|
|
2912
|
-
}
|
|
2913
|
-
async function benchmarkShow(argv, io) {
|
|
2914
|
-
const parsed = parseArgs(argv);
|
|
2915
|
-
rejectUnknownFlags(parsed, new Set(["dir", "json"]));
|
|
2916
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud benchmarks show", 1);
|
|
2917
|
-
const dir = resolveDir(parsed);
|
|
2918
|
-
const origin = await readWorkbenchOrigin(dir);
|
|
2919
|
-
const projectRef = parsed.positionals[0] ??
|
|
2920
|
-
origin?.projectId;
|
|
2921
|
-
if (!projectRef) {
|
|
2922
|
-
throw new UsageError("Missing hosted benchmark. Pass OWNER/BENCHMARK, run workbench push, or run workbench clone.");
|
|
2923
|
-
}
|
|
2924
|
-
const response = await apiRequest(benchmarkApiPath(projectRef), {}, await effectiveBaseUrl(origin?.baseUrl));
|
|
2925
|
-
writeOutput(response.benchmark, parsed, io, (project) => {
|
|
2926
|
-
const record = project;
|
|
2927
|
-
return `${record.name} (${record.id})\n${record.runs.length} runs\n${record.subjects.length} subjects`;
|
|
2928
|
-
});
|
|
2929
|
-
return 0;
|
|
2930
|
-
}
|
|
2931
|
-
async function benchmarkDelete(argv, io) {
|
|
2932
|
-
const parsed = parseArgs(argv);
|
|
2933
|
-
rejectUnknownFlags(parsed, new Set(["dir", "dry-run", "json"]));
|
|
2934
|
-
if (parsed.positionals.length > 1) {
|
|
2935
|
-
throw new UsageError(`Unexpected argument for workbench benchmarks delete: ${parsed.positionals.slice(1).join(" ")}`);
|
|
3575
|
+
function hostedWorkflowArgsForRun(args) {
|
|
3576
|
+
const next = ["--dir", args.sourceDir, "--runs", args.runId, "--json"];
|
|
3577
|
+
appendStringFlag(next, "benchmark", asOptionalString(args.parsed.flags.benchmark));
|
|
3578
|
+
appendStringFlag(next, "base", asOptionalString(args.parsed.flags.base));
|
|
3579
|
+
appendStringFlag(next, "samples", asOptionalString(args.parsed.flags.samples));
|
|
3580
|
+
appendStringFlag(next, "budget", asOptionalString(args.parsed.flags.budget));
|
|
3581
|
+
appendStringFlag(next, "interval-ms", asOptionalString(args.parsed.flags["interval-ms"]));
|
|
3582
|
+
appendStringFlag(next, "timeout-ms", asOptionalString(args.parsed.flags["timeout-ms"]));
|
|
3583
|
+
if (args.parsed.flags.watch === true) {
|
|
3584
|
+
next.push("--watch");
|
|
2936
3585
|
}
|
|
2937
|
-
|
|
2938
|
-
|
|
2939
|
-
const projectRef = parsed.positionals[0] ??
|
|
2940
|
-
origin?.projectId;
|
|
2941
|
-
if (!projectRef) {
|
|
2942
|
-
throw new UsageError("Missing hosted benchmark. Pass OWNER/BENCHMARK, run workbench push, or run workbench clone.");
|
|
2943
|
-
}
|
|
2944
|
-
const originPath = workbenchOriginPath(dir);
|
|
2945
|
-
const baseUrl = await effectiveBaseUrl(origin?.baseUrl);
|
|
2946
|
-
if (parsed.flags["dry-run"] === true) {
|
|
2947
|
-
const originProjectDeleted = originMatchesProjectRef(origin, projectRef);
|
|
2948
|
-
writeOutput({
|
|
2949
|
-
ok: true,
|
|
2950
|
-
dryRun: true,
|
|
2951
|
-
projectRef,
|
|
2952
|
-
...(isRemoteProjectId(projectRef) ? { projectId: projectRef } : {}),
|
|
2953
|
-
...(originProjectDeleted && origin?.project ? { projectName: origin.project } : {}),
|
|
2954
|
-
baseUrl,
|
|
2955
|
-
...(originProjectDeleted ? { originPath } : {}),
|
|
2956
|
-
}, parsed, io, () => originProjectDeleted
|
|
2957
|
-
? `Would delete hosted benchmark ${projectRef} and remove local origin ${originPath}.`
|
|
2958
|
-
: `Would delete hosted benchmark ${projectRef}.`);
|
|
2959
|
-
return 0;
|
|
3586
|
+
if (args.parsed.flags["dry-run"] === true) {
|
|
3587
|
+
next.push("--dry-run");
|
|
2960
3588
|
}
|
|
2961
|
-
|
|
2962
|
-
|
|
2963
|
-
const projectName = project.name;
|
|
2964
|
-
const originProjectDeleted = origin ? origin.projectId === projectId : false;
|
|
2965
|
-
await apiRequest(projectApiPath(projectId), { method: "DELETE" }, baseUrl);
|
|
2966
|
-
if (originProjectDeleted) {
|
|
2967
|
-
await fs.rm(originPath, { force: true });
|
|
3589
|
+
if (args.parsed.flags.rerun === true) {
|
|
3590
|
+
next.push("--rerun");
|
|
2968
3591
|
}
|
|
2969
|
-
|
|
2970
|
-
ok: true,
|
|
2971
|
-
deleted: true,
|
|
2972
|
-
projectId,
|
|
2973
|
-
...(projectName ? { projectName } : {}),
|
|
2974
|
-
originRemoved: originProjectDeleted,
|
|
2975
|
-
...(originProjectDeleted ? { originPath } : {}),
|
|
2976
|
-
}, parsed, io, () => originProjectDeleted
|
|
2977
|
-
? `Deleted benchmark ${formatProjectRef(project)} and removed local origin ${originPath}.`
|
|
2978
|
-
: `Deleted benchmark ${formatProjectRef(project)}.`);
|
|
2979
|
-
return 0;
|
|
2980
|
-
}
|
|
2981
|
-
async function benchmarkVersions(argv, io) {
|
|
2982
|
-
const parsed = parseArgs(argv);
|
|
2983
|
-
rejectUnknownFlags(parsed, new Set(["dir", "json"]));
|
|
2984
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud benchmarks versions", 1);
|
|
2985
|
-
const projectRef = parsed.positionals[0];
|
|
2986
|
-
const origin = await readWorkbenchOrigin(resolveDir(parsed));
|
|
2987
|
-
if (!projectRef && !origin) {
|
|
2988
|
-
throw new UsageError("Missing benchmark ref. Pass OWNER/BENCHMARK or run from a benchmark clone.");
|
|
2989
|
-
}
|
|
2990
|
-
const response = await apiRequest(benchmarkApiPath(projectRef ?? origin.projectId), {}, await effectiveBaseUrl(origin?.baseUrl));
|
|
2991
|
-
const version = response.benchmark.sourceFingerprint ?? response.benchmark.currentSpecVersionId ?? "current";
|
|
2992
|
-
writeOutput({
|
|
2993
|
-
ok: true,
|
|
2994
|
-
benchmark: response.benchmark,
|
|
2995
|
-
versions: [{ ref: "main", digest: version, current: true }],
|
|
2996
|
-
}, parsed, io, () => `${response.benchmark.name ?? projectRef ?? origin.project}\tmain\t${shortDigest(version)}\tcurrent`);
|
|
2997
|
-
return 0;
|
|
2998
|
-
}
|
|
2999
|
-
async function benchmarkStarred(argv, io) {
|
|
3000
|
-
const parsed = parseArgs(argv);
|
|
3001
|
-
rejectUnknownFlags(parsed, new Set(["json"]));
|
|
3002
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud benchmarks starred", 0);
|
|
3003
|
-
const response = await apiRequest("/api/workbench/benchmarks");
|
|
3004
|
-
const starred = response.benchmarks.filter((project) => project.viewerHasStarred === true);
|
|
3005
|
-
writeOutput(starred, parsed, io, (benchmarks) => {
|
|
3006
|
-
if (benchmarks.length === 0) {
|
|
3007
|
-
return "No starred benchmarks.";
|
|
3008
|
-
}
|
|
3009
|
-
return benchmarks
|
|
3010
|
-
.map((benchmark) => `${benchmark.ownerUsername ?? "-"} / ${benchmark.name ?? "-"}\t${benchmark.starCount ?? 0} stars`)
|
|
3011
|
-
.join("\n");
|
|
3012
|
-
});
|
|
3013
|
-
return 0;
|
|
3014
|
-
}
|
|
3015
|
-
async function subjectList(argv, io) {
|
|
3016
|
-
const parsed = parseArgs(argv);
|
|
3017
|
-
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
|
|
3018
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud subjects list", 0);
|
|
3019
|
-
const target = await resolveHostedTarget(parsed);
|
|
3020
|
-
const response = await apiRequest(projectApiPath(target.projectId, "/subjects"), {}, target.baseUrl);
|
|
3021
|
-
writeOutput(response.subjects, parsed, io, (subjects) => {
|
|
3022
|
-
if (subjects.length === 0) {
|
|
3023
|
-
return "No subjects yet.";
|
|
3024
|
-
}
|
|
3025
|
-
return subjects
|
|
3026
|
-
.map((subject) => `${subject.id}\t${subject.status}\tmetrics ${formatMetricSummary(subject.metrics)}\t${subject.fileChanges?.length ?? 0} files`)
|
|
3027
|
-
.join("\n");
|
|
3028
|
-
});
|
|
3029
|
-
return 0;
|
|
3030
|
-
}
|
|
3031
|
-
async function subjectShow(argv, io) {
|
|
3032
|
-
const parsed = parseArgs(argv);
|
|
3033
|
-
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
|
|
3034
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud subjects show", 1);
|
|
3035
|
-
const target = await resolveHostedTarget(parsed);
|
|
3036
|
-
const subjectId = readRequiredSubjectId(parsed);
|
|
3037
|
-
const params = new URLSearchParams({ id: subjectId });
|
|
3038
|
-
const subject = await apiRequest(projectApiPath(target.projectId, `/workbench/record?${params.toString()}`), {}, target.baseUrl);
|
|
3039
|
-
writeOutput(subject, parsed, io, (record) => {
|
|
3040
|
-
const value = record;
|
|
3041
|
-
return [
|
|
3042
|
-
`${value.id ?? subjectId}\t${value.status ?? "unknown"}`,
|
|
3043
|
-
...(value.benchmarkFingerprint ? [`Benchmark version: ${shortDigest(value.benchmarkFingerprint)}`] : []),
|
|
3044
|
-
...(value.subjectFingerprint ? [`Subject digest: ${shortDigest(value.subjectFingerprint)}`] : []),
|
|
3045
|
-
].join("\n");
|
|
3046
|
-
});
|
|
3047
|
-
return 0;
|
|
3048
|
-
}
|
|
3049
|
-
async function subjectFiles(argv, io) {
|
|
3050
|
-
const parsed = parseArgs(argv);
|
|
3051
|
-
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
|
|
3052
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud subjects files", 1);
|
|
3053
|
-
const target = await resolveHostedTarget(parsed);
|
|
3054
|
-
const subjectId = readRequiredSubjectId(parsed);
|
|
3055
|
-
const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/files`), {}, target.baseUrl);
|
|
3056
|
-
writeOutput(response.files, parsed, io, (files) => files
|
|
3057
|
-
.map((file) => `${file.path}\t${file.status}\t${file.preview_kind}`)
|
|
3058
|
-
.join("\n") || "No files.");
|
|
3059
|
-
return 0;
|
|
3592
|
+
return next;
|
|
3060
3593
|
}
|
|
3061
|
-
|
|
3062
|
-
|
|
3063
|
-
|
|
3064
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud subjects preview", 1);
|
|
3065
|
-
const target = await resolveHostedTarget(parsed);
|
|
3066
|
-
const subjectId = readRequiredSubjectId(parsed);
|
|
3067
|
-
const filePath = requireFlag(parsed, "path");
|
|
3068
|
-
const params = new URLSearchParams({ path: filePath });
|
|
3069
|
-
const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/files?${params.toString()}`), {}, target.baseUrl);
|
|
3070
|
-
const content = response.preview.source?.content ??
|
|
3071
|
-
response.preview.rendered_html ??
|
|
3072
|
-
response.preview.diff ??
|
|
3073
|
-
"";
|
|
3074
|
-
const outputPath = asOptionalString(parsed.flags.output);
|
|
3075
|
-
if (outputPath && outputPath !== "-") {
|
|
3076
|
-
await fs.writeFile(outputPath, content);
|
|
3077
|
-
io.stdout.write(`Wrote preview to ${outputPath}\n`);
|
|
3594
|
+
function appendStringFlag(args, name, value) {
|
|
3595
|
+
if (value !== undefined) {
|
|
3596
|
+
args.push(`--${name}`, value);
|
|
3078
3597
|
}
|
|
3079
|
-
else if (parsed.flags.json === true) {
|
|
3080
|
-
writeJson(response.preview, io);
|
|
3081
|
-
}
|
|
3082
|
-
else {
|
|
3083
|
-
io.stdout.write(content);
|
|
3084
|
-
}
|
|
3085
|
-
return 0;
|
|
3086
|
-
}
|
|
3087
|
-
async function subjectExport(argv, io) {
|
|
3088
|
-
const parsed = parseArgs(argv);
|
|
3089
|
-
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "out", "json"]));
|
|
3090
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud subjects pull", 1);
|
|
3091
|
-
const target = await resolveHostedTarget(parsed);
|
|
3092
|
-
const subjectId = readRequiredSubjectId(parsed);
|
|
3093
|
-
const outputDir = requireOutDir(parsed);
|
|
3094
|
-
const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/export`), {}, target.baseUrl);
|
|
3095
|
-
await writeFiles(outputDir, response.files);
|
|
3096
|
-
writeOutput({ ok: true, outputDir, files: response.files.length }, parsed, io, (result) => {
|
|
3097
|
-
const record = result;
|
|
3098
|
-
return `Exported ${record.files} file(s) to ${record.outputDir}`;
|
|
3099
|
-
});
|
|
3100
|
-
return 0;
|
|
3101
|
-
}
|
|
3102
|
-
async function subjectVisibility(argv, io, visibility) {
|
|
3103
|
-
const parsed = parseArgs(argv);
|
|
3104
|
-
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
|
|
3105
|
-
rejectUnexpectedPositionals(parsed, `workbench cloud subjects ${visibility === "public" ? "publish" : "unpublish"}`, 1);
|
|
3106
|
-
const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
|
|
3107
|
-
const subjectId = readRequiredSubjectId(parsed);
|
|
3108
|
-
const response = await apiRequest(projectApiPath(target.projectId, `/subjects/${encodeURIComponent(subjectId)}/publish`), { method: visibility === "public" ? "PUT" : "DELETE" }, target.baseUrl);
|
|
3109
|
-
writeOutput({ ok: true, visibility, subject: response.subject }, parsed, io, () => `${visibility === "public" ? "Published" : "Unpublished"} subject ${subjectId}.`);
|
|
3110
|
-
return 0;
|
|
3111
|
-
}
|
|
3112
|
-
async function runList(argv, io) {
|
|
3113
|
-
const parsed = parseArgs(argv);
|
|
3114
|
-
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
|
|
3115
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud runs list", 0);
|
|
3116
|
-
const target = await resolveHostedTarget(parsed);
|
|
3117
|
-
const response = await apiRequest(projectApiPath(target.projectId, "/runs"), {}, target.baseUrl);
|
|
3118
|
-
writeOutput(response.runs, parsed, io, (runs) => runs
|
|
3119
|
-
.map((run) => `${run.id}\t${run.status}\t${run.subjectId ?? "pending"}`)
|
|
3120
|
-
.join("\n") || "No runs.");
|
|
3121
|
-
return 0;
|
|
3122
|
-
}
|
|
3123
|
-
async function runShow(argv, io) {
|
|
3124
|
-
const parsed = parseArgs(argv);
|
|
3125
|
-
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
|
|
3126
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud runs show", 1);
|
|
3127
|
-
const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
|
|
3128
|
-
const runId = readRequiredRunId(parsed);
|
|
3129
|
-
const response = await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`), {}, target.baseUrl);
|
|
3130
|
-
const detail = withRunDetailUrls(target, response);
|
|
3131
|
-
writeOutput(detail, parsed, io, formatRunDetail);
|
|
3132
|
-
return 0;
|
|
3133
|
-
}
|
|
3134
|
-
async function runCancel(argv, io) {
|
|
3135
|
-
const parsed = parseArgs(argv);
|
|
3136
|
-
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "json"]));
|
|
3137
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud runs cancel", 1);
|
|
3138
|
-
const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
|
|
3139
|
-
const runId = readRequiredRunId(parsed);
|
|
3140
|
-
const response = await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(runId)}`), { method: "DELETE" }, target.baseUrl);
|
|
3141
|
-
const run = withRunUrls(target, response.run);
|
|
3142
|
-
writeOutput(run, parsed, io, (record) => {
|
|
3143
|
-
const value = record;
|
|
3144
|
-
return [
|
|
3145
|
-
`Cancelled run ${value.id}; status ${value.status}; outcome ${value.outcome ?? "cancelled"}.`,
|
|
3146
|
-
`Open benchmark: ${value.urls?.benchmark ?? buildWorkbenchResourceUrls(target).benchmark}`,
|
|
3147
|
-
].join("\n");
|
|
3148
|
-
});
|
|
3149
|
-
return 0;
|
|
3150
3598
|
}
|
|
3151
|
-
async function
|
|
3152
|
-
const
|
|
3153
|
-
|
|
3154
|
-
rejectUnexpectedPositionals(parsed, "workbench cloud watch", 1);
|
|
3155
|
-
const target = await resolveHostedTarget(parsed, { requireProjectIdentity: true });
|
|
3156
|
-
const runId = readRequiredRunId(parsed);
|
|
3157
|
-
if (parsed.flags.json !== true) {
|
|
3158
|
-
io.stdout.write(`Watching run ${runId}.\n${HOSTED_WATCH_LIFECYCLE_NOTE}\n`);
|
|
3159
|
-
}
|
|
3160
|
-
const run = await watchHostedRun({
|
|
3161
|
-
parsed,
|
|
3162
|
-
target,
|
|
3163
|
-
runId,
|
|
3164
|
-
intervalMs: parsePositiveInt(parsed.flags["interval-ms"], 1000, "interval-ms"),
|
|
3165
|
-
timeoutMs: parseOptionalPositiveInt(parsed.flags["timeout-ms"], "timeout-ms"),
|
|
3166
|
-
});
|
|
3167
|
-
const outputRun = await withHostedRunFailureSummary(target, run);
|
|
3168
|
-
writeOutput(withRunUrls(target, outputRun), parsed, io, formatHostedRunResult);
|
|
3169
|
-
return hostedRunSucceeded(run) ? 0 : 1;
|
|
3599
|
+
async function readHostedCandidateSummary(target, candidateId) {
|
|
3600
|
+
const response = await apiRequest(projectApiPath(target.projectId, "/candidates"), {}, target.baseUrl);
|
|
3601
|
+
return response.candidates.find((entry) => entry.id === candidateId) ?? null;
|
|
3170
3602
|
}
|
|
3171
|
-
async function
|
|
3172
|
-
const
|
|
3173
|
-
|
|
3174
|
-
|
|
3175
|
-
|
|
3176
|
-
const requestedRunId = parsed.positionals[0];
|
|
3177
|
-
if (requestedRunId) {
|
|
3178
|
-
const response = await apiRequest(projectApiPath(target.projectId, `/runs/${encodeURIComponent(requestedRunId)}`), {}, target.baseUrl);
|
|
3179
|
-
writeOutput({ runId: response.run.id, jobs: response.jobs }, parsed, io, formatRunLogs);
|
|
3180
|
-
return 0;
|
|
3181
|
-
}
|
|
3182
|
-
const project = (await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl)).project;
|
|
3183
|
-
const runId = project.runs.at(-1)?.id;
|
|
3184
|
-
if (!runId) {
|
|
3185
|
-
throw new UsageError("Missing RUN_ID; the benchmark has no runs.");
|
|
3603
|
+
async function readEvaluatedActiveHostedCandidate(target) {
|
|
3604
|
+
const response = await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl);
|
|
3605
|
+
const activeCandidateId = response.benchmark.activeCandidateId;
|
|
3606
|
+
if (!activeCandidateId) {
|
|
3607
|
+
return null;
|
|
3186
3608
|
}
|
|
3187
|
-
const
|
|
3188
|
-
|
|
3189
|
-
return 0;
|
|
3609
|
+
const candidate = await readHostedCandidateSummary(target, activeCandidateId);
|
|
3610
|
+
return candidate && hostedCandidateIsEvaluated(candidate) ? candidate : null;
|
|
3190
3611
|
}
|
|
3191
|
-
function
|
|
3192
|
-
|
|
3193
|
-
return (value.jobs
|
|
3194
|
-
.map((job) => `${job.id}\t${job.kind}\t${job.status}\t${job.subjectId ?? "-"}${job.error ? `\t${job.error}` : ""}`)
|
|
3195
|
-
.join("\n") || `No jobs for ${value.runId}.`);
|
|
3612
|
+
function hostedCandidateIsEvaluated(candidate) {
|
|
3613
|
+
return candidate.status === "evaluated" || candidate.eval != null;
|
|
3196
3614
|
}
|
|
3197
3615
|
async function openWorkbench(argv, io) {
|
|
3198
3616
|
const parsed = parseArgs(argv);
|
|
3199
3617
|
rejectUnknownFlags(parsed, new Set(["dir", "benchmark", "no-open", "json"]));
|
|
3200
3618
|
if (parsed.positionals.length > 1) {
|
|
3201
|
-
throw new UsageError(`Unexpected argument for workbench open: ${parsed.positionals.slice(1).join(" ")}`);
|
|
3619
|
+
throw new UsageError(`Unexpected argument for workbench open --hosted: ${parsed.positionals.slice(1).join(" ")}`);
|
|
3202
3620
|
}
|
|
3203
3621
|
const target = await resolveOpenTarget(parsed);
|
|
3204
3622
|
const ref = target.openRef;
|
|
@@ -3226,7 +3644,7 @@ function buildWorkbenchWebUrl(target, ref) {
|
|
|
3226
3644
|
if (ref.startsWith("run_")) {
|
|
3227
3645
|
return benchmarkUrl;
|
|
3228
3646
|
}
|
|
3229
|
-
return buildWorkbenchResourceUrls(target, {
|
|
3647
|
+
return buildWorkbenchResourceUrls(target, { candidateId: ref }).candidateEvaluation;
|
|
3230
3648
|
}
|
|
3231
3649
|
async function resolveHostedTarget(parsed, options = {}) {
|
|
3232
3650
|
if (options.sourceArg !== undefined && parsed.flags.dir !== undefined) {
|
|
@@ -3253,11 +3671,12 @@ async function resolveHostedTarget(parsed, options = {}) {
|
|
|
3253
3671
|
if (!projectId) {
|
|
3254
3672
|
throw new UsageError("Missing hosted benchmark. Run workbench push, workbench clone, or pass --benchmark OWNER/BENCHMARK.");
|
|
3255
3673
|
}
|
|
3674
|
+
const originRemote = origin ? parseOriginRemote(origin) : null;
|
|
3256
3675
|
return {
|
|
3257
3676
|
projectId,
|
|
3258
|
-
...(!explicitProject &&
|
|
3259
|
-
...(!explicitProject &&
|
|
3260
|
-
? { projectName:
|
|
3677
|
+
...(!explicitProject && originRemote ? { owner: originRemote.owner } : {}),
|
|
3678
|
+
...(!explicitProject && originRemote
|
|
3679
|
+
? { projectName: originRemote.project }
|
|
3261
3680
|
: {}),
|
|
3262
3681
|
dir,
|
|
3263
3682
|
baseUrl,
|
|
@@ -3295,13 +3714,12 @@ async function resolveHostedDryRunTarget(parsed, options = {}) {
|
|
|
3295
3714
|
};
|
|
3296
3715
|
}
|
|
3297
3716
|
if (origin?.projectId) {
|
|
3717
|
+
const originRemote = parseOriginRemote(origin);
|
|
3298
3718
|
return {
|
|
3299
|
-
projectRef: origin.
|
|
3300
|
-
? `${origin.owner}/${origin.project}`
|
|
3301
|
-
: origin.projectId,
|
|
3719
|
+
projectRef: origin.remote,
|
|
3302
3720
|
projectId: origin.projectId,
|
|
3303
|
-
|
|
3304
|
-
|
|
3721
|
+
owner: originRemote.owner,
|
|
3722
|
+
projectName: originRemote.project,
|
|
3305
3723
|
dir,
|
|
3306
3724
|
baseUrl,
|
|
3307
3725
|
origin,
|
|
@@ -3313,7 +3731,7 @@ async function resolveOpenTarget(parsed) {
|
|
|
3313
3731
|
const ref = parsed.positionals[0];
|
|
3314
3732
|
if (ref &&
|
|
3315
3733
|
!ref.startsWith("run_") &&
|
|
3316
|
-
!ref.startsWith("
|
|
3734
|
+
!ref.startsWith("candidate_")) {
|
|
3317
3735
|
const baseUrl = await effectiveBaseUrl();
|
|
3318
3736
|
if (ref.includes("/")) {
|
|
3319
3737
|
const parsedRef = parseBenchmarkRef(ref);
|
|
@@ -3347,51 +3765,44 @@ function buildWorkbenchResourceUrls(target, refs = {}) {
|
|
|
3347
3765
|
const projectRef = `${encodeURIComponent(target.owner)}/${encodeURIComponent(target.projectName)}`;
|
|
3348
3766
|
const benchmark = `${target.baseUrl}/benchmarks/${projectRef}`;
|
|
3349
3767
|
const urls = { benchmark };
|
|
3350
|
-
if (refs.
|
|
3768
|
+
if (refs.candidateId) {
|
|
3351
3769
|
const evaluationId = refs.runId
|
|
3352
|
-
? evaluationScorecardId(refs.runId, refs.
|
|
3770
|
+
? evaluationScorecardId(refs.runId, refs.candidateId)
|
|
3353
3771
|
: null;
|
|
3354
|
-
urls.
|
|
3355
|
-
? `${benchmark}/
|
|
3356
|
-
: `${benchmark}/
|
|
3772
|
+
urls.candidateEvaluation = evaluationId
|
|
3773
|
+
? `${benchmark}/candidates/${encodeURIComponent(refs.candidateId)}?evaluation=${encodeURIComponent(evaluationId)}`
|
|
3774
|
+
: `${benchmark}/candidates/${encodeURIComponent(refs.candidateId)}`;
|
|
3357
3775
|
}
|
|
3358
3776
|
return urls;
|
|
3359
3777
|
}
|
|
3360
3778
|
function projectApiPath(projectRef, suffix = "") {
|
|
3361
3779
|
return `/api/workbench/benchmarks/${encodeURIComponent(projectRef)}${suffix}`;
|
|
3362
3780
|
}
|
|
3363
|
-
function benchmarkApiPath(benchmarkRef) {
|
|
3364
|
-
if (benchmarkRef.includes("/")) {
|
|
3365
|
-
return publicProjectApiPath(parseBenchmarkRef(benchmarkRef));
|
|
3366
|
-
}
|
|
3367
|
-
return projectApiPath(benchmarkRef);
|
|
3368
|
-
}
|
|
3369
3781
|
function publicProjectApiPath(ref) {
|
|
3370
3782
|
return `/api/workbench/public/benchmarks/${encodeURIComponent(ref.owner)}/${encodeURIComponent(ref.project)}`;
|
|
3371
3783
|
}
|
|
3372
|
-
function
|
|
3373
|
-
return `${publicProjectApiPath(ref)}/
|
|
3784
|
+
function publicProjectStateApiPath(ref) {
|
|
3785
|
+
return `${publicProjectApiPath(ref)}/state`;
|
|
3374
3786
|
}
|
|
3375
3787
|
function readRequiredBenchmarkRef(parsed) {
|
|
3376
3788
|
const ref = parsed.positionals[0];
|
|
3377
3789
|
if (!ref) {
|
|
3378
|
-
throw new UsageError("Missing required OWNER/BENCHMARK
|
|
3790
|
+
throw new UsageError("Missing required OWNER/BENCHMARK.");
|
|
3379
3791
|
}
|
|
3380
3792
|
return parseBenchmarkRef(ref);
|
|
3381
3793
|
}
|
|
3382
3794
|
function parseBenchmarkRef(value) {
|
|
3383
|
-
|
|
3384
|
-
|
|
3385
|
-
throw new UsageError("Benchmark refs must use OWNER/BENCHMARK[@REF].");
|
|
3795
|
+
if (value.includes("@")) {
|
|
3796
|
+
throw new UsageError("Benchmark refs must use OWNER/BENCHMARK.");
|
|
3386
3797
|
}
|
|
3387
|
-
const [owner, project, extra] =
|
|
3798
|
+
const [owner, project, extra] = value.split("/");
|
|
3388
3799
|
if (!owner || !project || extra !== undefined) {
|
|
3389
|
-
throw new UsageError("Benchmark refs must use OWNER/BENCHMARK
|
|
3800
|
+
throw new UsageError("Benchmark refs must use OWNER/BENCHMARK.");
|
|
3390
3801
|
}
|
|
3391
|
-
return { owner, project
|
|
3802
|
+
return { owner, project };
|
|
3392
3803
|
}
|
|
3393
3804
|
function formatBenchmarkRef(ref) {
|
|
3394
|
-
return `${ref.owner}/${ref.project}
|
|
3805
|
+
return `${ref.owner}/${ref.project}`;
|
|
3395
3806
|
}
|
|
3396
3807
|
async function resolveRemoteProject(projectRef, baseUrl) {
|
|
3397
3808
|
if (projectRef.includes("/")) {
|
|
@@ -3402,52 +3813,84 @@ async function resolveRemoteProject(projectRef, baseUrl) {
|
|
|
3402
3813
|
const response = await apiRequest(projectApiPath(projectRef), {}, baseUrl);
|
|
3403
3814
|
return response.benchmark;
|
|
3404
3815
|
}
|
|
3405
|
-
function formatProjectRef(project) {
|
|
3406
|
-
return project.name ? `${project.name} (${project.id})` : project.id;
|
|
3407
|
-
}
|
|
3408
|
-
function originMatchesProjectRef(origin, projectRef) {
|
|
3409
|
-
if (!origin) {
|
|
3410
|
-
return false;
|
|
3411
|
-
}
|
|
3412
|
-
if (origin.projectId === projectRef) {
|
|
3413
|
-
return true;
|
|
3414
|
-
}
|
|
3415
|
-
if (!projectRef.includes("/")) {
|
|
3416
|
-
return false;
|
|
3417
|
-
}
|
|
3418
|
-
const ref = parseBenchmarkRef(projectRef);
|
|
3419
|
-
return origin.owner === ref.owner && origin.project === ref.project;
|
|
3420
|
-
}
|
|
3421
3816
|
function withRunUrls(target, run) {
|
|
3422
3817
|
return {
|
|
3423
3818
|
...run,
|
|
3424
3819
|
urls: buildWorkbenchResourceUrls(target, {
|
|
3425
3820
|
runId: run.id,
|
|
3426
|
-
|
|
3821
|
+
candidateId: run.outputCandidateId ?? run.candidateId,
|
|
3427
3822
|
}),
|
|
3428
3823
|
};
|
|
3429
3824
|
}
|
|
3430
|
-
function
|
|
3431
|
-
|
|
3432
|
-
|
|
3433
|
-
|
|
3434
|
-
|
|
3435
|
-
|
|
3825
|
+
function hostedRunEvaluationCandidateId(run, jobs = []) {
|
|
3826
|
+
if (run.outputCandidateId) {
|
|
3827
|
+
return run.outputCandidateId;
|
|
3828
|
+
}
|
|
3829
|
+
const attemptCandidates = jobs
|
|
3830
|
+
.filter((job) => readRunJobPurpose(job) === "attempt")
|
|
3831
|
+
.map((job) => job.candidateId)
|
|
3832
|
+
.filter((candidateId) => Boolean(candidateId));
|
|
3833
|
+
return attemptCandidates.at(-1) ?? run.candidateId ?? null;
|
|
3834
|
+
}
|
|
3835
|
+
function localProjectState(args) {
|
|
3836
|
+
const stateSource = localProjectStateSource(args.source);
|
|
3837
|
+
const runtimeFingerprint = workbenchRuntimeBundleFingerprint(args.runtime);
|
|
3436
3838
|
return {
|
|
3437
|
-
|
|
3438
|
-
|
|
3439
|
-
|
|
3839
|
+
schema: "workbench.project.state.v1",
|
|
3840
|
+
project: {
|
|
3841
|
+
id: args.origin?.projectId ?? "",
|
|
3842
|
+
remote: args.origin?.remote ?? `local/${args.source.spec.name}`,
|
|
3843
|
+
ownerUsername: args.origin ? parseOriginRemote(args.origin).owner : "local",
|
|
3844
|
+
name: args.origin ? parseOriginRemote(args.origin).project : args.source.spec.name,
|
|
3845
|
+
visibility: args.visibility,
|
|
3846
|
+
},
|
|
3847
|
+
base: {
|
|
3848
|
+
...(args.origin ? { sourceRevisionId: args.origin.sourceRevisionId } : {}),
|
|
3849
|
+
...(args.origin ? { sourceFingerprint: args.origin.sourceFingerprint } : {}),
|
|
3850
|
+
runtimeFingerprint: args.origin?.runtimeFingerprint ?? runtimeFingerprint,
|
|
3851
|
+
},
|
|
3852
|
+
source: stateSource,
|
|
3853
|
+
runtime: args.runtime,
|
|
3440
3854
|
};
|
|
3441
3855
|
}
|
|
3442
|
-
function
|
|
3443
|
-
|
|
3444
|
-
|
|
3445
|
-
|
|
3446
|
-
|
|
3447
|
-
.
|
|
3448
|
-
.map(
|
|
3449
|
-
.
|
|
3450
|
-
|
|
3856
|
+
function localProjectStateSource(source) {
|
|
3857
|
+
const request = hostedProjectSourceRequest(source);
|
|
3858
|
+
const stateSource = {
|
|
3859
|
+
source: request.source,
|
|
3860
|
+
files: source.sourceFiles.map((file) => ({ ...file })),
|
|
3861
|
+
candidateFiles: request.candidateFiles.map(toSurfaceSnapshotFile),
|
|
3862
|
+
engineResolveFiles: request.engineResolveFiles.map(toSurfaceSnapshotFile),
|
|
3863
|
+
engineResolveBinding: request.engineResolveBinding,
|
|
3864
|
+
adapterFiles: request.adapterFiles.map(toSurfaceSnapshotFile),
|
|
3865
|
+
dockerfile: request.dockerfile,
|
|
3866
|
+
runtimeDockerfile: request.runtimeDockerfile,
|
|
3867
|
+
runtimeFiles: request.runtimeFiles.map(toSurfaceSnapshotFile),
|
|
3868
|
+
network: request.network,
|
|
3869
|
+
resources: { ...request.resources },
|
|
3870
|
+
};
|
|
3871
|
+
return {
|
|
3872
|
+
...stateSource,
|
|
3873
|
+
fingerprint: workbenchProjectSourceFingerprint(stateSource),
|
|
3874
|
+
};
|
|
3875
|
+
}
|
|
3876
|
+
function toSurfaceSnapshotFile(file) {
|
|
3877
|
+
return {
|
|
3878
|
+
path: file.path,
|
|
3879
|
+
kind: "kind" in file ? file.kind : file.encoding === "base64" ? "binary" : "text",
|
|
3880
|
+
encoding: file.encoding ?? "utf8",
|
|
3881
|
+
content: file.content,
|
|
3882
|
+
executable: file.executable === true,
|
|
3883
|
+
};
|
|
3884
|
+
}
|
|
3885
|
+
function hostedProjectSummaryFromState(state) {
|
|
3886
|
+
return {
|
|
3887
|
+
id: state.project.id,
|
|
3888
|
+
ownerUsername: state.project.ownerUsername,
|
|
3889
|
+
name: state.project.name,
|
|
3890
|
+
visibility: state.project.visibility,
|
|
3891
|
+
currentSpecVersionId: state.source.revisionId ?? state.base.sourceRevisionId,
|
|
3892
|
+
sourceFingerprint: state.source.fingerprint ?? state.base.sourceFingerprint,
|
|
3893
|
+
};
|
|
3451
3894
|
}
|
|
3452
3895
|
function sourceFileCount(source) {
|
|
3453
3896
|
return source.sourceFiles.length;
|
|
@@ -3456,7 +3899,7 @@ function hostedProjectSourceRequest(source) {
|
|
|
3456
3899
|
const { network, resources } = hostedEnvironmentOptions(source);
|
|
3457
3900
|
return {
|
|
3458
3901
|
source: source.specSource,
|
|
3459
|
-
|
|
3902
|
+
candidateFiles: source.candidateFiles,
|
|
3460
3903
|
engineResolveFiles: hostedEngineResolveFiles(source),
|
|
3461
3904
|
engineResolveBinding: engineResolveBindingForSpec(source.spec),
|
|
3462
3905
|
adapterFiles: source.adapterFiles,
|
|
@@ -3539,68 +3982,48 @@ async function watchHostedRun(args) {
|
|
|
3539
3982
|
}
|
|
3540
3983
|
}
|
|
3541
3984
|
function formatHostedRunResult(run) {
|
|
3542
|
-
const
|
|
3543
|
-
const activeDetail = run.
|
|
3544
|
-
? `; active ${run.
|
|
3985
|
+
const candidateId = run.outputCandidateId ?? run.candidateId;
|
|
3986
|
+
const activeDetail = run.activeCandidateId && candidateId && run.activeCandidateId !== candidateId
|
|
3987
|
+
? `; active ${run.activeCandidateId}`
|
|
3545
3988
|
: "";
|
|
3546
|
-
const summary = `Run ${run.id} reached ${run.status}; ${run.outcome ? `outcome ${run.outcome}; ` : ""}
|
|
3989
|
+
const summary = `Run ${run.id} reached ${run.status}; ${run.outcome ? `outcome ${run.outcome}; ` : ""}candidate ${candidateId ?? "pending"}${activeDetail}; ${run.completedJobCount ?? 0}/${run.jobCount ?? 0} jobs completed.`;
|
|
3547
3990
|
return [
|
|
3548
3991
|
run.error ? `${summary}\nError: ${run.error}` : summary,
|
|
3549
|
-
...(run.urls?.
|
|
3550
|
-
? [`Open evaluation: ${run.urls.
|
|
3992
|
+
...(run.urls?.candidateEvaluation
|
|
3993
|
+
? [`Open evaluation: ${run.urls.candidateEvaluation}`]
|
|
3551
3994
|
: [`Open benchmark: ${run.urls?.benchmark ?? ""}`].filter(Boolean)),
|
|
3552
3995
|
].join("\n");
|
|
3553
3996
|
}
|
|
3554
|
-
function
|
|
3555
|
-
const
|
|
3556
|
-
|
|
3557
|
-
|
|
3558
|
-
|
|
3559
|
-
|
|
3560
|
-
|
|
3561
|
-
"",
|
|
3562
|
-
].join("\n");
|
|
3563
|
-
}
|
|
3564
|
-
function formatRunDetail(record) {
|
|
3565
|
-
const detail = record;
|
|
3566
|
-
const { run, jobs, urls } = detail;
|
|
3567
|
-
const cost = sumJobCostUsd(jobs);
|
|
3568
|
-
const firstFailedJob = jobs.find((job) => job.status === "failed" && job.error);
|
|
3569
|
-
const subjectId = hostedRunEvaluationSubjectId(run, jobs);
|
|
3997
|
+
function formatRetryCommandResult(result) {
|
|
3998
|
+
const run = result.run;
|
|
3999
|
+
const runId = run?.id ?? result.runId ?? "unknown";
|
|
4000
|
+
const scope = `${result.retried.kind} ${result.retried.id}`;
|
|
4001
|
+
const verb = run
|
|
4002
|
+
? run.status === "finished" ? "finished as hosted run" : "started as hosted run"
|
|
4003
|
+
: "finished as local run";
|
|
3570
4004
|
return [
|
|
3571
|
-
`
|
|
3572
|
-
`
|
|
3573
|
-
`
|
|
3574
|
-
...(
|
|
3575
|
-
|
|
3576
|
-
|
|
3577
|
-
|
|
3578
|
-
`Attempts: ${run.attemptsExecuted ?? 0}/${run.attemptsRequested ?? run.attemptsExecuted ?? 0}`,
|
|
3579
|
-
`Jobs: ${run.completedJobCount ?? jobs.filter(isTerminalRunJob).length}/${run.jobCount ?? jobs.length} completed${run.failedJobCount ? `; ${run.failedJobCount} failed` : ""}`,
|
|
3580
|
-
...(typeof run.durationMs === "number"
|
|
3581
|
-
? [`Duration: ${formatDurationMs(run.durationMs)}`]
|
|
4005
|
+
`Retry of ${scope} ${verb} ${runId}.`,
|
|
4006
|
+
...(result.evaluationId ? [`Evaluation: ${result.evaluationId}`] : []),
|
|
4007
|
+
...(result.candidateId ? [`Candidate: ${result.candidateId}`] : []),
|
|
4008
|
+
...(result.failedJobCount ? [`Failed jobs: ${result.failedJobCount}`] : []),
|
|
4009
|
+
...(result.error ? [`Error: ${result.error}`] : []),
|
|
4010
|
+
...(result.localView
|
|
4011
|
+
? [`Open local view: ${result.localView.command}`, result.localView.note]
|
|
3582
4012
|
: []),
|
|
3583
|
-
...(
|
|
3584
|
-
|
|
3585
|
-
? [`
|
|
3586
|
-
: []),
|
|
3587
|
-
...(urls.subjectEvaluation
|
|
3588
|
-
? [`Open evaluation: ${urls.subjectEvaluation}`]
|
|
3589
|
-
: [`Open benchmark: ${urls.benchmark}`]),
|
|
3590
|
-
...(jobs.length > 0 ? ["", "Jobs:", ...jobs.map(formatRunJobLine)] : []),
|
|
4013
|
+
...(result.urls?.candidateEvaluation
|
|
4014
|
+
? [`Open evaluation: ${result.urls.candidateEvaluation}`]
|
|
4015
|
+
: result.urls?.benchmark ? [`Open benchmark: ${result.urls.benchmark}`] : []),
|
|
3591
4016
|
].join("\n");
|
|
3592
4017
|
}
|
|
3593
|
-
function
|
|
4018
|
+
function formatHostedRunStarted(run, fallbackWorkflow) {
|
|
4019
|
+
const candidateId = run.outputCandidateId ?? run.candidateId;
|
|
3594
4020
|
return [
|
|
3595
|
-
|
|
3596
|
-
|
|
3597
|
-
|
|
3598
|
-
|
|
3599
|
-
|
|
3600
|
-
].
|
|
3601
|
-
}
|
|
3602
|
-
function isTerminalRunJob(job) {
|
|
3603
|
-
return job.status === "succeeded" || job.status === "failed" || job.status === "cancelled";
|
|
4021
|
+
`Started ${run.workflow ?? fallbackWorkflow} run ${run.id}; ${candidateId ? `candidate ${candidateId}` : `${run.jobCount ?? 0} jobs queued`}.`,
|
|
4022
|
+
...(run.urls?.candidateEvaluation
|
|
4023
|
+
? [`Open evaluation: ${run.urls.candidateEvaluation}`]
|
|
4024
|
+
: run.urls?.benchmark ? [`Open benchmark: ${run.urls.benchmark}`] : []),
|
|
4025
|
+
"",
|
|
4026
|
+
].join("\n");
|
|
3604
4027
|
}
|
|
3605
4028
|
function readRunJobPurpose(job) {
|
|
3606
4029
|
const input = readRecord(job.input);
|
|
@@ -3608,49 +4031,22 @@ function readRunJobPurpose(job) {
|
|
|
3608
4031
|
const purpose = execution?.purpose;
|
|
3609
4032
|
return typeof purpose === "string" && purpose ? purpose : null;
|
|
3610
4033
|
}
|
|
3611
|
-
function sumJobCostUsd(jobs) {
|
|
3612
|
-
const sum = jobs.reduce((total, job) => total + costUsdFromUsage(readRecord(job.output)?.usage), 0);
|
|
3613
|
-
return Number.isFinite(sum) ? Math.round(sum * 1_000_000) / 1_000_000 : 0;
|
|
3614
|
-
}
|
|
3615
|
-
function costUsdFromUsage(value) {
|
|
3616
|
-
const usage = readRecord(value);
|
|
3617
|
-
if (!usage) {
|
|
3618
|
-
return 0;
|
|
3619
|
-
}
|
|
3620
|
-
const direct = readFiniteNumber(usage.costUsd);
|
|
3621
|
-
if (direct !== null) {
|
|
3622
|
-
return direct;
|
|
3623
|
-
}
|
|
3624
|
-
return ["total", "optimizer", "runner", "engine"].reduce((sum, key) => {
|
|
3625
|
-
const nested = readRecord(usage[key]);
|
|
3626
|
-
return sum + (readFiniteNumber(nested?.costUsd) ?? 0);
|
|
3627
|
-
}, 0);
|
|
3628
|
-
}
|
|
3629
4034
|
function readRecord(value) {
|
|
3630
4035
|
return value && typeof value === "object" && !Array.isArray(value)
|
|
3631
4036
|
? value
|
|
3632
4037
|
: null;
|
|
3633
4038
|
}
|
|
3634
|
-
function
|
|
3635
|
-
return typeof value === "
|
|
4039
|
+
function stringValue(value) {
|
|
4040
|
+
return typeof value === "string" && value.length > 0 ? value : null;
|
|
3636
4041
|
}
|
|
3637
|
-
function
|
|
3638
|
-
|
|
3639
|
-
return `${Math.max(0, Math.round(durationMs))}ms`;
|
|
3640
|
-
}
|
|
3641
|
-
const seconds = durationMs / 1000;
|
|
3642
|
-
if (seconds < 60) {
|
|
3643
|
-
return `${seconds.toFixed(seconds < 10 ? 1 : 0)}s`;
|
|
3644
|
-
}
|
|
3645
|
-
const minutes = Math.floor(seconds / 60);
|
|
3646
|
-
const remainingSeconds = Math.round(seconds % 60);
|
|
3647
|
-
return `${minutes}m ${remainingSeconds}s`;
|
|
4042
|
+
function numberValue(value) {
|
|
4043
|
+
return readFiniteNumber(value);
|
|
3648
4044
|
}
|
|
3649
|
-
function
|
|
3650
|
-
return
|
|
4045
|
+
function integerValue(value) {
|
|
4046
|
+
return Number.isSafeInteger(value) ? value : null;
|
|
3651
4047
|
}
|
|
3652
|
-
function
|
|
3653
|
-
return value
|
|
4048
|
+
function readFiniteNumber(value) {
|
|
4049
|
+
return typeof value === "number" && Number.isFinite(value) ? value : null;
|
|
3654
4050
|
}
|
|
3655
4051
|
async function withHostedRunFailureSummary(target, run) {
|
|
3656
4052
|
if (hostedRunSucceeded(run) || run.error || (run.failedJobCount ?? 0) <= 0) {
|
|
@@ -3681,23 +4077,44 @@ function hostedRunSucceeded(run) {
|
|
|
3681
4077
|
async function readWorkbenchOrigin(dir) {
|
|
3682
4078
|
try {
|
|
3683
4079
|
const parsed = JSON.parse(await fs.readFile(workbenchOriginPath(dir), "utf8"));
|
|
3684
|
-
if (!parsed.
|
|
3685
|
-
|
|
3686
|
-
|
|
3687
|
-
|
|
3688
|
-
|
|
4080
|
+
if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
|
|
4081
|
+
throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
|
|
4082
|
+
}
|
|
4083
|
+
const originRecord = parsed;
|
|
4084
|
+
const keys = Object.keys(originRecord).sort();
|
|
4085
|
+
const expectedKeys = [
|
|
4086
|
+
"baseUrl",
|
|
4087
|
+
"linkedAt",
|
|
4088
|
+
"projectId",
|
|
4089
|
+
"remote",
|
|
4090
|
+
"runtimeFingerprint",
|
|
4091
|
+
"sourceFingerprint",
|
|
4092
|
+
"sourceRevisionId",
|
|
4093
|
+
];
|
|
4094
|
+
if (typeof originRecord.projectId !== "string" ||
|
|
4095
|
+
typeof originRecord.baseUrl !== "string" ||
|
|
4096
|
+
typeof originRecord.remote !== "string" ||
|
|
4097
|
+
typeof originRecord.sourceRevisionId !== "string" ||
|
|
4098
|
+
typeof originRecord.sourceFingerprint !== "string" ||
|
|
4099
|
+
typeof originRecord.runtimeFingerprint !== "string" ||
|
|
4100
|
+
typeof originRecord.linkedAt !== "string" ||
|
|
4101
|
+
originRecord.projectId.length === 0 ||
|
|
4102
|
+
originRecord.sourceRevisionId.length === 0 ||
|
|
4103
|
+
originRecord.sourceFingerprint.length === 0 ||
|
|
4104
|
+
originRecord.runtimeFingerprint.length === 0) {
|
|
4105
|
+
throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
|
|
4106
|
+
}
|
|
4107
|
+
if (JSON.stringify(keys) !== JSON.stringify(expectedKeys)) {
|
|
3689
4108
|
throw new UsageError(`Workbench origin is malformed: ${workbenchOriginPath(dir)}`);
|
|
3690
4109
|
}
|
|
3691
4110
|
return {
|
|
3692
|
-
baseUrl: normalizeBaseUrl(
|
|
3693
|
-
|
|
3694
|
-
|
|
3695
|
-
|
|
3696
|
-
|
|
3697
|
-
|
|
3698
|
-
|
|
3699
|
-
...(parsed.upstream ? { upstream: parsed.upstream } : {}),
|
|
3700
|
-
linkedAt: parsed.linkedAt ?? new Date(0).toISOString(),
|
|
4111
|
+
baseUrl: normalizeBaseUrl(originRecord.baseUrl),
|
|
4112
|
+
remote: normalizeOriginRemote(originRecord.remote),
|
|
4113
|
+
projectId: originRecord.projectId,
|
|
4114
|
+
sourceRevisionId: originRecord.sourceRevisionId,
|
|
4115
|
+
sourceFingerprint: originRecord.sourceFingerprint,
|
|
4116
|
+
runtimeFingerprint: originRecord.runtimeFingerprint,
|
|
4117
|
+
linkedAt: originRecord.linkedAt,
|
|
3701
4118
|
};
|
|
3702
4119
|
}
|
|
3703
4120
|
catch (error) {
|
|
@@ -3716,8 +4133,12 @@ async function requireWorkbenchOrigin(dir) {
|
|
|
3716
4133
|
}
|
|
3717
4134
|
async function writeWorkbenchOrigin(dir, input) {
|
|
3718
4135
|
const origin = {
|
|
3719
|
-
...input,
|
|
3720
4136
|
baseUrl: normalizeBaseUrl(input.baseUrl),
|
|
4137
|
+
remote: normalizeOriginRemote(input.remote),
|
|
4138
|
+
projectId: input.projectId,
|
|
4139
|
+
sourceRevisionId: input.sourceRevisionId,
|
|
4140
|
+
sourceFingerprint: input.sourceFingerprint,
|
|
4141
|
+
runtimeFingerprint: input.runtimeFingerprint,
|
|
3721
4142
|
linkedAt: input.linkedAt ?? new Date().toISOString(),
|
|
3722
4143
|
};
|
|
3723
4144
|
const filePath = workbenchOriginPath(dir);
|
|
@@ -3725,6 +4146,56 @@ async function writeWorkbenchOrigin(dir, input) {
|
|
|
3725
4146
|
await fs.writeFile(filePath, `${JSON.stringify(origin, null, 2)}\n`);
|
|
3726
4147
|
return origin;
|
|
3727
4148
|
}
|
|
4149
|
+
async function writeWorkbenchOriginFromState(dir, args) {
|
|
4150
|
+
const owner = args.project?.ownerUsername ?? args.state.project.ownerUsername;
|
|
4151
|
+
const name = args.project?.name ?? args.state.project.name;
|
|
4152
|
+
const sourceRevisionId = args.project?.currentSpecVersionId ??
|
|
4153
|
+
args.state.source.revisionId ??
|
|
4154
|
+
args.state.base.sourceRevisionId;
|
|
4155
|
+
const sourceFingerprint = args.sourceFingerprint ??
|
|
4156
|
+
args.project?.sourceFingerprint ??
|
|
4157
|
+
args.state.source.fingerprint ??
|
|
4158
|
+
args.state.base.sourceFingerprint;
|
|
4159
|
+
const runtimeFingerprint = args.state.base.runtimeFingerprint ??
|
|
4160
|
+
workbenchRuntimeBundleFingerprint(args.state.runtime);
|
|
4161
|
+
if (!sourceRevisionId || !sourceFingerprint || !runtimeFingerprint) {
|
|
4162
|
+
throw new UsageError("Hosted project state is missing required origin metadata.");
|
|
4163
|
+
}
|
|
4164
|
+
return await writeWorkbenchOrigin(dir, {
|
|
4165
|
+
baseUrl: args.baseUrl,
|
|
4166
|
+
remote: `${owner}/${name}`,
|
|
4167
|
+
projectId: args.project?.id ?? args.state.project.id,
|
|
4168
|
+
sourceRevisionId,
|
|
4169
|
+
sourceFingerprint,
|
|
4170
|
+
runtimeFingerprint,
|
|
4171
|
+
});
|
|
4172
|
+
}
|
|
4173
|
+
async function localSourceFingerprint(dir) {
|
|
4174
|
+
const source = localProjectStateSource(await readLocalProjectSource(dir));
|
|
4175
|
+
return source.fingerprint ?? workbenchProjectSourceFingerprint(source);
|
|
4176
|
+
}
|
|
4177
|
+
function parseOriginRemote(origin) {
|
|
4178
|
+
return parseRemoteName(origin.remote);
|
|
4179
|
+
}
|
|
4180
|
+
function parseRemoteName(remote) {
|
|
4181
|
+
try {
|
|
4182
|
+
return parseBenchmarkRef(remote);
|
|
4183
|
+
}
|
|
4184
|
+
catch {
|
|
4185
|
+
throw new UsageError(`Workbench origin remote must use OWNER/BENCHMARK: ${remote}`);
|
|
4186
|
+
}
|
|
4187
|
+
}
|
|
4188
|
+
function normalizeOriginRemote(remote) {
|
|
4189
|
+
const parsed = parseRemoteName(remote.trim());
|
|
4190
|
+
return `${parsed.owner}/${parsed.project}`;
|
|
4191
|
+
}
|
|
4192
|
+
function originRemoteUrlParts(origin) {
|
|
4193
|
+
const remote = parseOriginRemote(origin);
|
|
4194
|
+
return {
|
|
4195
|
+
owner: remote.owner,
|
|
4196
|
+
projectName: remote.project,
|
|
4197
|
+
};
|
|
4198
|
+
}
|
|
3728
4199
|
function workbenchOriginPath(dir) {
|
|
3729
4200
|
return path.join(dir, ".workbench", "origin.json");
|
|
3730
4201
|
}
|
|
@@ -3763,30 +4234,6 @@ async function readWorkbenchProfileStatus(config) {
|
|
|
3763
4234
|
return { authenticated: true, profile: null };
|
|
3764
4235
|
}
|
|
3765
4236
|
}
|
|
3766
|
-
function readOptionalSubjectId(parsed) {
|
|
3767
|
-
return asOptionalString(parsed.flags.subject) ?? parsed.positionals[0];
|
|
3768
|
-
}
|
|
3769
|
-
function readRequiredSubjectId(parsed) {
|
|
3770
|
-
const subjectId = readOptionalSubjectId(parsed);
|
|
3771
|
-
if (!subjectId) {
|
|
3772
|
-
throw new UsageError("Missing required SUBJECT_ID.");
|
|
3773
|
-
}
|
|
3774
|
-
return subjectId;
|
|
3775
|
-
}
|
|
3776
|
-
function readRequiredRunId(parsed) {
|
|
3777
|
-
const runId = parsed.positionals[0];
|
|
3778
|
-
if (!runId) {
|
|
3779
|
-
throw new UsageError("Missing required RUN_ID.");
|
|
3780
|
-
}
|
|
3781
|
-
return runId;
|
|
3782
|
-
}
|
|
3783
|
-
function requireOutDir(parsed) {
|
|
3784
|
-
const output = asOptionalString(parsed.flags.out);
|
|
3785
|
-
if (!output) {
|
|
3786
|
-
throw new UsageError("Missing required --out.");
|
|
3787
|
-
}
|
|
3788
|
-
return output;
|
|
3789
|
-
}
|
|
3790
4237
|
async function apiRequest(apiPath, options = {}, baseUrlOverride) {
|
|
3791
4238
|
const config = await loadConfig();
|
|
3792
4239
|
const baseUrl = normalizeBaseUrl(baseUrlOverride ??
|
|
@@ -4002,6 +4449,38 @@ function readInitAgent(parsed, kind) {
|
|
|
4002
4449
|
function asOptionalString(value) {
|
|
4003
4450
|
return typeof value === "string" && value.length > 0 ? value : undefined;
|
|
4004
4451
|
}
|
|
4452
|
+
function singleRequestedRunId(value, command) {
|
|
4453
|
+
if (!value || value.trim() === "") {
|
|
4454
|
+
return undefined;
|
|
4455
|
+
}
|
|
4456
|
+
const trimmed = value.trim();
|
|
4457
|
+
if (trimmed === "all" || trimmed.includes(",")) {
|
|
4458
|
+
throw new UsageError(`${command} accepts one candidate run id for --runs; use workbench eval --runs all to evaluate every run.`);
|
|
4459
|
+
}
|
|
4460
|
+
return trimmed;
|
|
4461
|
+
}
|
|
4462
|
+
function resolveCandidateRunSelection(source, value) {
|
|
4463
|
+
const available = source.candidateRunIds;
|
|
4464
|
+
if (available.length === 0) {
|
|
4465
|
+
throw new UsageError("Candidate must declare at least one run.");
|
|
4466
|
+
}
|
|
4467
|
+
if (!value || value.trim() === "") {
|
|
4468
|
+
return [source.candidateRunId];
|
|
4469
|
+
}
|
|
4470
|
+
const trimmed = value.trim();
|
|
4471
|
+
if (trimmed === "all") {
|
|
4472
|
+
return available;
|
|
4473
|
+
}
|
|
4474
|
+
const requested = [...new Set(trimmed.split(",").map((entry) => entry.trim()).filter(Boolean))];
|
|
4475
|
+
if (requested.length === 0) {
|
|
4476
|
+
throw new UsageError("--runs must include at least one run id or all.");
|
|
4477
|
+
}
|
|
4478
|
+
const missing = requested.filter((runId) => !available.includes(runId));
|
|
4479
|
+
if (missing.length > 0) {
|
|
4480
|
+
throw new UsageError(`Unknown candidate run(s): ${missing.join(", ")}. Available: ${available.join(", ")}.`);
|
|
4481
|
+
}
|
|
4482
|
+
return requested;
|
|
4483
|
+
}
|
|
4005
4484
|
function readOptionalStringFlag(value, name) {
|
|
4006
4485
|
if (value == null || value === false) {
|
|
4007
4486
|
return undefined;
|
|
@@ -4226,6 +4705,27 @@ function parsePortFlag(value) {
|
|
|
4226
4705
|
}
|
|
4227
4706
|
return port;
|
|
4228
4707
|
}
|
|
4708
|
+
function formatCandidateEvaluationScore(candidate) {
|
|
4709
|
+
const score = candidate.eval?.metrics?.score?.mean;
|
|
4710
|
+
return typeof score === "number" && Number.isFinite(score)
|
|
4711
|
+
? formatMetricValue(score)
|
|
4712
|
+
: "n/a";
|
|
4713
|
+
}
|
|
4714
|
+
function formatLocalCandidateLabel(candidate) {
|
|
4715
|
+
if (!candidate) {
|
|
4716
|
+
return "none";
|
|
4717
|
+
}
|
|
4718
|
+
const name = candidate.name?.trim() || candidate.id;
|
|
4719
|
+
const displayName = candidate.version > 0
|
|
4720
|
+
? `${name} v${candidate.version}`
|
|
4721
|
+
: name;
|
|
4722
|
+
return `${displayName} (${candidate.id})`;
|
|
4723
|
+
}
|
|
4724
|
+
function formatCandidateEvaluationSummary(candidate) {
|
|
4725
|
+
return formatMetricSummary(evaluationMeanMetrics(candidate.eval), {
|
|
4726
|
+
limit: Number.POSITIVE_INFINITY,
|
|
4727
|
+
});
|
|
4728
|
+
}
|
|
4229
4729
|
function formatMetricSummary(metrics, options = {}) {
|
|
4230
4730
|
const entries = Object.entries(metrics ?? {}).filter((entry) => Number.isFinite(entry[1]));
|
|
4231
4731
|
if (entries.length === 0) {
|
|
@@ -4255,23 +4755,28 @@ function resolveSourceDir(parsed) {
|
|
|
4255
4755
|
if (parsed.positionals.length > 1) {
|
|
4256
4756
|
throw new UsageError("Expected at most one source file or directory argument.");
|
|
4257
4757
|
}
|
|
4258
|
-
|
|
4259
|
-
|
|
4758
|
+
const dir = asOptionalString(parsed.flags.dir);
|
|
4759
|
+
const source = parsed.positionals[0];
|
|
4760
|
+
if (dir && source) {
|
|
4761
|
+
return path.resolve(dir, source);
|
|
4260
4762
|
}
|
|
4261
|
-
return path.resolve(
|
|
4763
|
+
return path.resolve(dir ?? source ?? process.cwd());
|
|
4262
4764
|
}
|
|
4263
4765
|
function isWorkbenchSourceYamlPath(filePath) {
|
|
4264
4766
|
return path.basename(filePath) === WORKBENCH_BENCHMARK_FILE;
|
|
4265
4767
|
}
|
|
4266
|
-
function
|
|
4267
|
-
const explicit =
|
|
4768
|
+
function readCandidateIdFlag(parsed, snapshot) {
|
|
4769
|
+
const explicit = readOptionalCandidateFlag(parsed);
|
|
4268
4770
|
if (explicit) {
|
|
4269
4771
|
return explicit;
|
|
4270
4772
|
}
|
|
4271
4773
|
if (snapshot.activeId) {
|
|
4272
4774
|
return snapshot.activeId;
|
|
4273
4775
|
}
|
|
4274
|
-
throw new UsageError("Missing required --
|
|
4776
|
+
throw new UsageError("Missing required --candidate; no active candidate exists.");
|
|
4777
|
+
}
|
|
4778
|
+
function readOptionalCandidateFlag(parsed) {
|
|
4779
|
+
return asOptionalString(parsed.flags.candidate);
|
|
4275
4780
|
}
|
|
4276
4781
|
function readPreviewMode(parsed) {
|
|
4277
4782
|
const view = asOptionalString(parsed.flags.view) ?? "rendered";
|
|
@@ -4375,8 +4880,8 @@ async function copyInitSeedIfProvided(parsed, workspace, seed) {
|
|
|
4375
4880
|
}
|
|
4376
4881
|
});
|
|
4377
4882
|
}
|
|
4378
|
-
function
|
|
4379
|
-
return spec.improve ? `adapter:${spec.improve.use}` : "
|
|
4883
|
+
function formatSpecImprover(spec) {
|
|
4884
|
+
return spec.improve ? `adapter:${spec.improve.use}` : "improve not configured";
|
|
4380
4885
|
}
|
|
4381
4886
|
async function writeFiles(outputDir, files) {
|
|
4382
4887
|
await fs.mkdir(outputDir, { recursive: true });
|
|
@@ -4401,6 +4906,14 @@ async function syncSourceFiles(outputDir, files) {
|
|
|
4401
4906
|
}
|
|
4402
4907
|
await writeFiles(outputDir, files);
|
|
4403
4908
|
}
|
|
4909
|
+
async function assertLocalSourceMatchesOrigin(dir, origin) {
|
|
4910
|
+
const source = await readLocalProjectSource(dir);
|
|
4911
|
+
const fingerprint = localProjectStateSource(source).fingerprint;
|
|
4912
|
+
if (fingerprint === origin.sourceFingerprint) {
|
|
4913
|
+
return;
|
|
4914
|
+
}
|
|
4915
|
+
throw new UsageError("Local source changed since the last pull or push. Run `workbench push` before pulling, or restore the local source changes and try again.");
|
|
4916
|
+
}
|
|
4404
4917
|
async function readManagedSourceFilePaths(outputDir) {
|
|
4405
4918
|
try {
|
|
4406
4919
|
const source = await readLocalProjectSource(outputDir);
|