@workbench-ai/workbench 0.0.47 → 0.0.49
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/dev-open/client.css +20 -11
- package/dist/dev-open/client.js +172 -172
- package/dist/dev-open-server.d.ts +2 -0
- package/dist/dev-open-server.d.ts.map +1 -1
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +105 -45
- package/dist/init-template-pack.js +12 -0
- package/package.json +4 -4
|
@@ -22,6 +22,7 @@ export declare function localBenchmarkSnapshot(context: LocalWorkbenchRequestCon
|
|
|
22
22
|
currentBenchmarkFingerprint: string | null;
|
|
23
23
|
summaries: {
|
|
24
24
|
id: string;
|
|
25
|
+
name?: string;
|
|
25
26
|
ordinal: number;
|
|
26
27
|
benchmarkFingerprint: string;
|
|
27
28
|
subjectFingerprint: string;
|
|
@@ -42,6 +43,7 @@ export declare function localBenchmarkSnapshot(context: LocalWorkbenchRequestCon
|
|
|
42
43
|
benchmarkFingerprint: string;
|
|
43
44
|
subjectFingerprint: string;
|
|
44
45
|
subjectId: string;
|
|
46
|
+
subjectName?: string;
|
|
45
47
|
createdAt: string;
|
|
46
48
|
updatedAt: string;
|
|
47
49
|
status: import("@workbench-ai/workbench-contract").EvaluationStatus;
|
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"dev-open-server.d.ts","sourceRoot":"","sources":["../src/dev-open-server.ts"],"names":[],"mappings":"AAKA,OAAO,EAUL,KAAK,mBAAmB,EAIzB,MAAM,8BAA8B,CAAC;AAatC,OAAO,EAGL,KAAK,kBAAkB,EAExB,MAAM,qBAAqB,CAAC;AAG7B,MAAM,WAAW,uBAAuB;IACtC,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;CAC5B;AAED,MAAM,WAAW,8BAA8B;IAC7C,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAkBD,MAAM,WAAW,4BAA4B;IAC3C,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,iBAAiB,EAAE,MAAM,OAAO,CAAC,kBAAkB,CAAC,CAAC;CACtD;AAKD,wBAAsB,4BAA4B,CAChD,OAAO,EAAE,8BAA8B,GACtC,OAAO,CAAC,uBAAuB,CAAC,CAwClC;AAoOD,wBAAsB,sBAAsB,CAAC,OAAO,EAAE,4BAA4B
|
|
1
|
+
{"version":3,"file":"dev-open-server.d.ts","sourceRoot":"","sources":["../src/dev-open-server.ts"],"names":[],"mappings":"AAKA,OAAO,EAUL,KAAK,mBAAmB,EAIzB,MAAM,8BAA8B,CAAC;AAatC,OAAO,EAGL,KAAK,kBAAkB,EAExB,MAAM,qBAAqB,CAAC;AAG7B,MAAM,WAAW,uBAAuB;IACtC,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;CAC5B;AAED,MAAM,WAAW,8BAA8B;IAC7C,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAkBD,MAAM,WAAW,4BAA4B;IAC3C,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,iBAAiB,EAAE,MAAM,OAAO,CAAC,kBAAkB,CAAC,CAAC;CACtD;AAKD,wBAAsB,4BAA4B,CAChD,OAAO,EAAE,8BAA8B,GACtC,OAAO,CAAC,uBAAuB,CAAC,CAwClC;AAoOD,wBAAsB,sBAAsB,CAAC,OAAO,EAAE,4BAA4B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiBjF;AAUD,wBAAsB,iBAAiB,CACrC,OAAO,EAAE,4BAA4B,EACrC,oBAAoB,CAAC,EAAE,MAAM,GAAG,IAAI,uFAiCrC;AAwBD,wBAAsB,gBAAgB,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,mBAAmB,EAAE,CAAC,CAExF;AAED,wBAAsB,0BAA0B,CAC9C,OAAO,EAAE,4BAA4B,EACrC,oBAAoB,CAAC,EAAE,MAAM,GAAG,IAAI,GACnC,OAAO,CAAC,mBAAmB,EAAE,CAAC,CAchC"}
|
package/dist/index.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAsHA,UAAU,KAAK;IACb,KAAK,EAAE,MAAM,CAAC,cAAc,CAAC;IAC7B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;CAC/B;AA6BD,UAAU,iBAAiB;CAAG;
|
|
1
|
+
{"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAsHA,UAAU,KAAK;IACb,KAAK,EAAE,MAAM,CAAC,cAAc,CAAC;IAC7B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;CAC/B;AA6BD,UAAU,iBAAiB;CAAG;AAqK9B,wBAAsB,MAAM,CAC1B,IAAI,EAAE,SAAS,MAAM,EAAE,EACvB,EAAE,GAAE,KAIH,EACD,cAAc,GAAE,iBAAsB,GACrC,OAAO,CAAC,MAAM,CAAC,CA8GjB"}
|
package/dist/index.js
CHANGED
|
@@ -22,6 +22,16 @@ function getCliVersion() {
|
|
|
22
22
|
const manifest = require("../package.json");
|
|
23
23
|
return typeof manifest.version === "string" ? manifest.version : "unknown";
|
|
24
24
|
}
|
|
25
|
+
class WorkbenchApiRequestError extends Error {
|
|
26
|
+
status;
|
|
27
|
+
body;
|
|
28
|
+
constructor(status, message, body) {
|
|
29
|
+
super(message);
|
|
30
|
+
this.name = "WorkbenchApiRequestError";
|
|
31
|
+
this.status = status;
|
|
32
|
+
this.body = body;
|
|
33
|
+
}
|
|
34
|
+
}
|
|
25
35
|
const DEFAULT_BASE_URL = "https://v2.workbench.ai";
|
|
26
36
|
export async function runCli(argv, io = {
|
|
27
37
|
stdin: process.stdin,
|
|
@@ -2358,52 +2368,55 @@ async function pushBenchmark(argv, io) {
|
|
|
2358
2368
|
throw new UsageError("Missing hosted benchmark. Run workbench push from a source directory.");
|
|
2359
2369
|
}
|
|
2360
2370
|
if (!origin.writable) {
|
|
2361
|
-
const
|
|
2362
|
-
if (
|
|
2371
|
+
const signedInUsername = dryRun ? null : await readAuthenticatedWorkbenchUsername(baseUrl);
|
|
2372
|
+
if (signedInUsername !== origin.owner) {
|
|
2373
|
+
const upstream = upstreamFromOrigin(origin);
|
|
2374
|
+
if (dryRun) {
|
|
2375
|
+
writeOutput({
|
|
2376
|
+
ok: true,
|
|
2377
|
+
dryRun: true,
|
|
2378
|
+
action: "create",
|
|
2379
|
+
dir,
|
|
2380
|
+
baseUrl,
|
|
2381
|
+
benchmarkName: source.spec.name,
|
|
2382
|
+
tag: asOptionalString(parsed.flags.tag) ?? null,
|
|
2383
|
+
visibility,
|
|
2384
|
+
sourceFileCount: sourceFileCount(source),
|
|
2385
|
+
upstream: upstream ?? null,
|
|
2386
|
+
}, parsed, io, () => `Would create a writable benchmark from read-only origin ${origin.owner}/${origin.project}.`);
|
|
2387
|
+
return 0;
|
|
2388
|
+
}
|
|
2389
|
+
const { project, publishedProject, origin: nextOrigin } = await createHostedBenchmarkFromSource({
|
|
2390
|
+
baseUrl,
|
|
2391
|
+
dir,
|
|
2392
|
+
source,
|
|
2393
|
+
visibility,
|
|
2394
|
+
upstream,
|
|
2395
|
+
});
|
|
2363
2396
|
writeOutput({
|
|
2364
2397
|
ok: true,
|
|
2365
|
-
dryRun: true,
|
|
2366
2398
|
action: "create",
|
|
2367
|
-
|
|
2368
|
-
baseUrl,
|
|
2369
|
-
benchmarkName: source.spec.name,
|
|
2399
|
+
benchmark: publishedProject,
|
|
2370
2400
|
tag: asOptionalString(parsed.flags.tag) ?? null,
|
|
2371
2401
|
visibility,
|
|
2372
|
-
|
|
2402
|
+
origin: nextOrigin,
|
|
2373
2403
|
upstream: upstream ?? null,
|
|
2374
|
-
|
|
2404
|
+
urls: buildWorkbenchResourceUrls({
|
|
2405
|
+
baseUrl,
|
|
2406
|
+
projectId: publishedProject.id ?? project.id,
|
|
2407
|
+
owner: nextOrigin.owner,
|
|
2408
|
+
projectName: nextOrigin.project,
|
|
2409
|
+
}),
|
|
2410
|
+
}, parsed, io, (record) => {
|
|
2411
|
+
const value = record;
|
|
2412
|
+
return [
|
|
2413
|
+
`Pushed ${value.origin.owner}/${value.origin.project} (${value.origin.projectId}).`,
|
|
2414
|
+
...(value.upstream ? [`Upstream: ${value.upstream.owner}/${value.upstream.project}`] : []),
|
|
2415
|
+
`Open benchmark: ${value.urls.benchmark}`,
|
|
2416
|
+
].join("\n");
|
|
2417
|
+
});
|
|
2375
2418
|
return 0;
|
|
2376
2419
|
}
|
|
2377
|
-
const { project, publishedProject, origin: nextOrigin } = await createHostedBenchmarkFromSource({
|
|
2378
|
-
baseUrl,
|
|
2379
|
-
dir,
|
|
2380
|
-
source,
|
|
2381
|
-
visibility,
|
|
2382
|
-
upstream,
|
|
2383
|
-
});
|
|
2384
|
-
writeOutput({
|
|
2385
|
-
ok: true,
|
|
2386
|
-
action: "create",
|
|
2387
|
-
benchmark: publishedProject,
|
|
2388
|
-
tag: asOptionalString(parsed.flags.tag) ?? null,
|
|
2389
|
-
visibility,
|
|
2390
|
-
origin: nextOrigin,
|
|
2391
|
-
upstream: upstream ?? null,
|
|
2392
|
-
urls: buildWorkbenchResourceUrls({
|
|
2393
|
-
baseUrl,
|
|
2394
|
-
projectId: publishedProject.id ?? project.id,
|
|
2395
|
-
owner: nextOrigin.owner,
|
|
2396
|
-
projectName: nextOrigin.project,
|
|
2397
|
-
}),
|
|
2398
|
-
}, parsed, io, (record) => {
|
|
2399
|
-
const value = record;
|
|
2400
|
-
return [
|
|
2401
|
-
`Pushed ${value.origin.owner}/${value.origin.project} (${value.origin.projectId}).`,
|
|
2402
|
-
...(value.upstream ? [`Upstream: ${value.upstream.owner}/${value.upstream.project}`] : []),
|
|
2403
|
-
`Open benchmark: ${value.urls.benchmark}`,
|
|
2404
|
-
].join("\n");
|
|
2405
|
-
});
|
|
2406
|
-
return 0;
|
|
2407
2420
|
}
|
|
2408
2421
|
if (dryRun) {
|
|
2409
2422
|
writeOutput({
|
|
@@ -2480,6 +2493,11 @@ async function createHostedBenchmarkFromSource(args) {
|
|
|
2480
2493
|
});
|
|
2481
2494
|
return { project, publishedProject, origin };
|
|
2482
2495
|
}
|
|
2496
|
+
async function readAuthenticatedWorkbenchUsername(baseUrl) {
|
|
2497
|
+
const config = await loadConfig();
|
|
2498
|
+
const status = await readWorkbenchProfileStatus({ ...config, baseUrl });
|
|
2499
|
+
return status.authenticated ? status.profile?.username ?? null : null;
|
|
2500
|
+
}
|
|
2483
2501
|
function upstreamFromOrigin(origin) {
|
|
2484
2502
|
if (!origin.owner || !origin.project || !origin.projectId || !origin.sourceRevisionId) {
|
|
2485
2503
|
return undefined;
|
|
@@ -2824,15 +2842,20 @@ async function startHostedWorkflow(workflow, argv, io) {
|
|
|
2824
2842
|
}
|
|
2825
2843
|
async function ensureHostedImproveBaseSubject(args) {
|
|
2826
2844
|
if (args.subjectId) {
|
|
2827
|
-
const
|
|
2828
|
-
const subject = response.subjects.find((entry) => entry.id === args.subjectId);
|
|
2845
|
+
const subject = await readHostedSubjectSummary(args.target, args.subjectId);
|
|
2829
2846
|
if (!subject) {
|
|
2830
2847
|
throw new UsageError(`Base subject ${args.subjectId} was not found for the current benchmark.`);
|
|
2831
2848
|
}
|
|
2832
|
-
if (
|
|
2849
|
+
if (hostedSubjectIsEvaluated(subject)) {
|
|
2833
2850
|
return args.subjectId;
|
|
2834
2851
|
}
|
|
2835
2852
|
}
|
|
2853
|
+
else {
|
|
2854
|
+
const activeSubject = await readEvaluatedActiveHostedSubject(args.target);
|
|
2855
|
+
if (activeSubject) {
|
|
2856
|
+
return activeSubject.id;
|
|
2857
|
+
}
|
|
2858
|
+
}
|
|
2836
2859
|
const response = await apiRequest(projectApiPath(args.target.projectId, "/runs"), {
|
|
2837
2860
|
method: "POST",
|
|
2838
2861
|
body: {
|
|
@@ -2856,6 +2879,22 @@ async function ensureHostedImproveBaseSubject(args) {
|
|
|
2856
2879
|
}
|
|
2857
2880
|
return watched.subjectId;
|
|
2858
2881
|
}
|
|
2882
|
+
async function readHostedSubjectSummary(target, subjectId) {
|
|
2883
|
+
const response = await apiRequest(projectApiPath(target.projectId, "/subjects"), {}, target.baseUrl);
|
|
2884
|
+
return response.subjects.find((entry) => entry.id === subjectId) ?? null;
|
|
2885
|
+
}
|
|
2886
|
+
async function readEvaluatedActiveHostedSubject(target) {
|
|
2887
|
+
const response = await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl);
|
|
2888
|
+
const activeSubjectId = response.benchmark.activeSubjectId;
|
|
2889
|
+
if (!activeSubjectId) {
|
|
2890
|
+
return null;
|
|
2891
|
+
}
|
|
2892
|
+
const subject = await readHostedSubjectSummary(target, activeSubjectId);
|
|
2893
|
+
return subject && hostedSubjectIsEvaluated(subject) ? subject : null;
|
|
2894
|
+
}
|
|
2895
|
+
function hostedSubjectIsEvaluated(subject) {
|
|
2896
|
+
return subject.status === "evaluated" || subject.eval != null;
|
|
2897
|
+
}
|
|
2859
2898
|
async function benchmarkList(argv, io) {
|
|
2860
2899
|
const parsed = parseArgs(argv);
|
|
2861
2900
|
rejectUnknownFlags(parsed, new Set(["json"]));
|
|
@@ -3475,7 +3514,20 @@ async function watchHostedRun(args) {
|
|
|
3475
3514
|
const deadline = args.timeoutMs === undefined ? undefined : Date.now() + args.timeoutMs;
|
|
3476
3515
|
let lastRun = null;
|
|
3477
3516
|
while (true) {
|
|
3478
|
-
|
|
3517
|
+
let response;
|
|
3518
|
+
try {
|
|
3519
|
+
response = await apiRequest(projectApiPath(args.target.projectId, `/runs/${encodeURIComponent(args.runId)}`), {}, args.target.baseUrl);
|
|
3520
|
+
}
|
|
3521
|
+
catch (error) {
|
|
3522
|
+
if (isTransientApiRequestError(error)) {
|
|
3523
|
+
if (deadline !== undefined && Date.now() > deadline) {
|
|
3524
|
+
throw new Error(`Timed out waiting for run ${args.runId}; last status was ${lastRun?.status ?? "unknown"} and the latest poll failed with ${error.message}.`);
|
|
3525
|
+
}
|
|
3526
|
+
await sleep(args.intervalMs);
|
|
3527
|
+
continue;
|
|
3528
|
+
}
|
|
3529
|
+
throw error;
|
|
3530
|
+
}
|
|
3479
3531
|
lastRun = response.run;
|
|
3480
3532
|
if (response.run.status === "finished") {
|
|
3481
3533
|
return response.run;
|
|
@@ -3753,8 +3805,8 @@ async function apiRequest(apiPath, options = {}, baseUrlOverride) {
|
|
|
3753
3805
|
});
|
|
3754
3806
|
if (!response.ok) {
|
|
3755
3807
|
const text = await response.text();
|
|
3756
|
-
throw new
|
|
3757
|
-
`Request failed with status ${response.status}
|
|
3808
|
+
throw new WorkbenchApiRequestError(response.status, readResponseError(text) ||
|
|
3809
|
+
`Request failed with status ${response.status}${response.statusText ? ` ${response.statusText}` : ""}.`, text);
|
|
3758
3810
|
}
|
|
3759
3811
|
return (await response.json());
|
|
3760
3812
|
}
|
|
@@ -3853,9 +3905,17 @@ function readResponseError(text) {
|
|
|
3853
3905
|
: "";
|
|
3854
3906
|
}
|
|
3855
3907
|
catch {
|
|
3856
|
-
|
|
3908
|
+
const trimmed = text.trim();
|
|
3909
|
+
if (trimmed.startsWith("<")) {
|
|
3910
|
+
return "";
|
|
3911
|
+
}
|
|
3912
|
+
return trimmed;
|
|
3857
3913
|
}
|
|
3858
3914
|
}
|
|
3915
|
+
function isTransientApiRequestError(error) {
|
|
3916
|
+
return error instanceof WorkbenchApiRequestError
|
|
3917
|
+
&& (error.status === 408 || error.status === 429 || error.status >= 500);
|
|
3918
|
+
}
|
|
3859
3919
|
function readOAuthError(text) {
|
|
3860
3920
|
try {
|
|
3861
3921
|
const body = JSON.parse(text);
|
|
@@ -84,6 +84,7 @@ function skillBenchmarkSpec(name, agent) {
|
|
|
84
84
|
" parallelism: 2",
|
|
85
85
|
" judge:",
|
|
86
86
|
` use: ${agent}`,
|
|
87
|
+
...agentDefaultWithLines(agent, " "),
|
|
87
88
|
" criteria:",
|
|
88
89
|
" - id: task_fit",
|
|
89
90
|
" description: The response follows the task prompt and uses the skill's workflow.",
|
|
@@ -104,6 +105,7 @@ function skillSubjectSpec(name, agent) {
|
|
|
104
105
|
" command: sh input/subject/prepare.sh",
|
|
105
106
|
"run:",
|
|
106
107
|
` use: ${agent}`,
|
|
108
|
+
...agentDefaultWithLines(agent, " "),
|
|
107
109
|
"",
|
|
108
110
|
].join("\n");
|
|
109
111
|
}
|
|
@@ -116,9 +118,19 @@ function optimizerSpec(name, editablePath, agent) {
|
|
|
116
118
|
` - ${editablePath}`,
|
|
117
119
|
"improve:",
|
|
118
120
|
` use: ${agent}`,
|
|
121
|
+
...agentDefaultWithLines(agent, " "),
|
|
119
122
|
"",
|
|
120
123
|
].join("\n");
|
|
121
124
|
}
|
|
125
|
+
function agentDefaultWithLines(agent, indent) {
|
|
126
|
+
if (agent !== "codex") {
|
|
127
|
+
return [];
|
|
128
|
+
}
|
|
129
|
+
return [
|
|
130
|
+
`${indent}with:`,
|
|
131
|
+
`${indent} model: gpt-5.5`,
|
|
132
|
+
];
|
|
133
|
+
}
|
|
122
134
|
function commandBenchmarkSpec(name) {
|
|
123
135
|
return [
|
|
124
136
|
"version: 3",
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@workbench-ai/workbench",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.49",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -21,9 +21,9 @@
|
|
|
21
21
|
],
|
|
22
22
|
"dependencies": {
|
|
23
23
|
"yaml": "^2.8.2",
|
|
24
|
-
"@workbench-ai/workbench-
|
|
25
|
-
"@workbench-ai/workbench-
|
|
26
|
-
"@workbench-ai/workbench-core": "0.0.
|
|
24
|
+
"@workbench-ai/workbench-built-in-adapters": "0.0.49",
|
|
25
|
+
"@workbench-ai/workbench-protocol": "0.0.49",
|
|
26
|
+
"@workbench-ai/workbench-core": "0.0.49"
|
|
27
27
|
},
|
|
28
28
|
"devDependencies": {
|
|
29
29
|
"@tailwindcss/postcss": "^4.2.2",
|