@workbench-ai/workbench 0.0.47 → 0.0.49

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -22,6 +22,7 @@ export declare function localBenchmarkSnapshot(context: LocalWorkbenchRequestCon
22
22
  currentBenchmarkFingerprint: string | null;
23
23
  summaries: {
24
24
  id: string;
25
+ name?: string;
25
26
  ordinal: number;
26
27
  benchmarkFingerprint: string;
27
28
  subjectFingerprint: string;
@@ -42,6 +43,7 @@ export declare function localBenchmarkSnapshot(context: LocalWorkbenchRequestCon
42
43
  benchmarkFingerprint: string;
43
44
  subjectFingerprint: string;
44
45
  subjectId: string;
46
+ subjectName?: string;
45
47
  createdAt: string;
46
48
  updatedAt: string;
47
49
  status: import("@workbench-ai/workbench-contract").EvaluationStatus;
@@ -1 +1 @@
1
- {"version":3,"file":"dev-open-server.d.ts","sourceRoot":"","sources":["../src/dev-open-server.ts"],"names":[],"mappings":"AAKA,OAAO,EAUL,KAAK,mBAAmB,EAIzB,MAAM,8BAA8B,CAAC;AAatC,OAAO,EAGL,KAAK,kBAAkB,EAExB,MAAM,qBAAqB,CAAC;AAG7B,MAAM,WAAW,uBAAuB;IACtC,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;CAC5B;AAED,MAAM,WAAW,8BAA8B;IAC7C,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAkBD,MAAM,WAAW,4BAA4B;IAC3C,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,iBAAiB,EAAE,MAAM,OAAO,CAAC,kBAAkB,CAAC,CAAC;CACtD;AAKD,wBAAsB,4BAA4B,CAChD,OAAO,EAAE,8BAA8B,GACtC,OAAO,CAAC,uBAAuB,CAAC,CAwClC;AAoOD,wBAAsB,sBAAsB,CAAC,OAAO,EAAE,4BAA4B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiBjF;AAUD,wBAAsB,iBAAiB,CACrC,OAAO,EAAE,4BAA4B,EACrC,oBAAoB,CAAC,EAAE,MAAM,GAAG,IAAI,uFAiCrC;AAwBD,wBAAsB,gBAAgB,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,mBAAmB,EAAE,CAAC,CAExF;AAED,wBAAsB,0BAA0B,CAC9C,OAAO,EAAE,4BAA4B,EACrC,oBAAoB,CAAC,EAAE,MAAM,GAAG,IAAI,GACnC,OAAO,CAAC,mBAAmB,EAAE,CAAC,CAchC"}
1
+ {"version":3,"file":"dev-open-server.d.ts","sourceRoot":"","sources":["../src/dev-open-server.ts"],"names":[],"mappings":"AAKA,OAAO,EAUL,KAAK,mBAAmB,EAIzB,MAAM,8BAA8B,CAAC;AAatC,OAAO,EAGL,KAAK,kBAAkB,EAExB,MAAM,qBAAqB,CAAC;AAG7B,MAAM,WAAW,uBAAuB;IACtC,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,EAAE,MAAM,OAAO,CAAC,IAAI,CAAC,CAAC;CAC5B;AAED,MAAM,WAAW,8BAA8B;IAC7C,SAAS,EAAE,MAAM,CAAC;IAClB,IAAI,EAAE,MAAM,CAAC;IACb,IAAI,EAAE,MAAM,CAAC;IACb,UAAU,CAAC,EAAE,MAAM,CAAC;CACrB;AAkBD,MAAM,WAAW,4BAA4B;IAC3C,SAAS,EAAE,MAAM,CAAC;IAClB,UAAU,EAAE,MAAM,CAAC;IACnB,iBAAiB,EAAE,MAAM,OAAO,CAAC,kBAAkB,CAAC,CAAC;CACtD;AAKD,wBAAsB,4BAA4B,CAChD,OAAO,EAAE,8BAA8B,GACtC,OAAO,CAAC,uBAAuB,CAAC,CAwClC;AAoOD,wBAAsB,sBAAsB,CAAC,OAAO,EAAE,4BAA4B;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;GAiBjF;AAUD,wBAAsB,iBAAiB,CACrC,OAAO,EAAE,4BAA4B,EACrC,oBAAoB,CAAC,EAAE,MAAM,GAAG,IAAI,uFAiCrC;AAwBD,wBAAsB,gBAAgB,CAAC,SAAS,EAAE,MAAM,GAAG,OAAO,CAAC,mBAAmB,EAAE,CAAC,CAExF;AAED,wBAAsB,0BAA0B,CAC9C,OAAO,EAAE,4BAA4B,EACrC,oBAAoB,CAAC,EAAE,MAAM,GAAG,IAAI,GACnC,OAAO,CAAC,mBAAmB,EAAE,CAAC,CAchC"}
@@ -1 +1 @@
1
- {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAsHA,UAAU,KAAK;IACb,KAAK,EAAE,MAAM,CAAC,cAAc,CAAC;IAC7B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;CAC/B;AA6BD,UAAU,iBAAiB;CAAG;AAwJ9B,wBAAsB,MAAM,CAC1B,IAAI,EAAE,SAAS,MAAM,EAAE,EACvB,EAAE,GAAE,KAIH,EACD,cAAc,GAAE,iBAAsB,GACrC,OAAO,CAAC,MAAM,CAAC,CA8GjB"}
1
+ {"version":3,"file":"index.d.ts","sourceRoot":"","sources":["../src/index.ts"],"names":[],"mappings":"AAsHA,UAAU,KAAK;IACb,KAAK,EAAE,MAAM,CAAC,cAAc,CAAC;IAC7B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;IAC9B,MAAM,EAAE,MAAM,CAAC,cAAc,CAAC;CAC/B;AA6BD,UAAU,iBAAiB;CAAG;AAqK9B,wBAAsB,MAAM,CAC1B,IAAI,EAAE,SAAS,MAAM,EAAE,EACvB,EAAE,GAAE,KAIH,EACD,cAAc,GAAE,iBAAsB,GACrC,OAAO,CAAC,MAAM,CAAC,CA8GjB"}
package/dist/index.js CHANGED
@@ -22,6 +22,16 @@ function getCliVersion() {
22
22
  const manifest = require("../package.json");
23
23
  return typeof manifest.version === "string" ? manifest.version : "unknown";
24
24
  }
25
+ class WorkbenchApiRequestError extends Error {
26
+ status;
27
+ body;
28
+ constructor(status, message, body) {
29
+ super(message);
30
+ this.name = "WorkbenchApiRequestError";
31
+ this.status = status;
32
+ this.body = body;
33
+ }
34
+ }
25
35
  const DEFAULT_BASE_URL = "https://v2.workbench.ai";
26
36
  export async function runCli(argv, io = {
27
37
  stdin: process.stdin,
@@ -2358,52 +2368,55 @@ async function pushBenchmark(argv, io) {
2358
2368
  throw new UsageError("Missing hosted benchmark. Run workbench push from a source directory.");
2359
2369
  }
2360
2370
  if (!origin.writable) {
2361
- const upstream = upstreamFromOrigin(origin);
2362
- if (dryRun) {
2371
+ const signedInUsername = dryRun ? null : await readAuthenticatedWorkbenchUsername(baseUrl);
2372
+ if (signedInUsername !== origin.owner) {
2373
+ const upstream = upstreamFromOrigin(origin);
2374
+ if (dryRun) {
2375
+ writeOutput({
2376
+ ok: true,
2377
+ dryRun: true,
2378
+ action: "create",
2379
+ dir,
2380
+ baseUrl,
2381
+ benchmarkName: source.spec.name,
2382
+ tag: asOptionalString(parsed.flags.tag) ?? null,
2383
+ visibility,
2384
+ sourceFileCount: sourceFileCount(source),
2385
+ upstream: upstream ?? null,
2386
+ }, parsed, io, () => `Would create a writable benchmark from read-only origin ${origin.owner}/${origin.project}.`);
2387
+ return 0;
2388
+ }
2389
+ const { project, publishedProject, origin: nextOrigin } = await createHostedBenchmarkFromSource({
2390
+ baseUrl,
2391
+ dir,
2392
+ source,
2393
+ visibility,
2394
+ upstream,
2395
+ });
2363
2396
  writeOutput({
2364
2397
  ok: true,
2365
- dryRun: true,
2366
2398
  action: "create",
2367
- dir,
2368
- baseUrl,
2369
- benchmarkName: source.spec.name,
2399
+ benchmark: publishedProject,
2370
2400
  tag: asOptionalString(parsed.flags.tag) ?? null,
2371
2401
  visibility,
2372
- sourceFileCount: sourceFileCount(source),
2402
+ origin: nextOrigin,
2373
2403
  upstream: upstream ?? null,
2374
- }, parsed, io, () => `Would create a writable benchmark from read-only origin ${origin.owner}/${origin.project}.`);
2404
+ urls: buildWorkbenchResourceUrls({
2405
+ baseUrl,
2406
+ projectId: publishedProject.id ?? project.id,
2407
+ owner: nextOrigin.owner,
2408
+ projectName: nextOrigin.project,
2409
+ }),
2410
+ }, parsed, io, (record) => {
2411
+ const value = record;
2412
+ return [
2413
+ `Pushed ${value.origin.owner}/${value.origin.project} (${value.origin.projectId}).`,
2414
+ ...(value.upstream ? [`Upstream: ${value.upstream.owner}/${value.upstream.project}`] : []),
2415
+ `Open benchmark: ${value.urls.benchmark}`,
2416
+ ].join("\n");
2417
+ });
2375
2418
  return 0;
2376
2419
  }
2377
- const { project, publishedProject, origin: nextOrigin } = await createHostedBenchmarkFromSource({
2378
- baseUrl,
2379
- dir,
2380
- source,
2381
- visibility,
2382
- upstream,
2383
- });
2384
- writeOutput({
2385
- ok: true,
2386
- action: "create",
2387
- benchmark: publishedProject,
2388
- tag: asOptionalString(parsed.flags.tag) ?? null,
2389
- visibility,
2390
- origin: nextOrigin,
2391
- upstream: upstream ?? null,
2392
- urls: buildWorkbenchResourceUrls({
2393
- baseUrl,
2394
- projectId: publishedProject.id ?? project.id,
2395
- owner: nextOrigin.owner,
2396
- projectName: nextOrigin.project,
2397
- }),
2398
- }, parsed, io, (record) => {
2399
- const value = record;
2400
- return [
2401
- `Pushed ${value.origin.owner}/${value.origin.project} (${value.origin.projectId}).`,
2402
- ...(value.upstream ? [`Upstream: ${value.upstream.owner}/${value.upstream.project}`] : []),
2403
- `Open benchmark: ${value.urls.benchmark}`,
2404
- ].join("\n");
2405
- });
2406
- return 0;
2407
2420
  }
2408
2421
  if (dryRun) {
2409
2422
  writeOutput({
@@ -2480,6 +2493,11 @@ async function createHostedBenchmarkFromSource(args) {
2480
2493
  });
2481
2494
  return { project, publishedProject, origin };
2482
2495
  }
2496
+ async function readAuthenticatedWorkbenchUsername(baseUrl) {
2497
+ const config = await loadConfig();
2498
+ const status = await readWorkbenchProfileStatus({ ...config, baseUrl });
2499
+ return status.authenticated ? status.profile?.username ?? null : null;
2500
+ }
2483
2501
  function upstreamFromOrigin(origin) {
2484
2502
  if (!origin.owner || !origin.project || !origin.projectId || !origin.sourceRevisionId) {
2485
2503
  return undefined;
@@ -2824,15 +2842,20 @@ async function startHostedWorkflow(workflow, argv, io) {
2824
2842
  }
2825
2843
  async function ensureHostedImproveBaseSubject(args) {
2826
2844
  if (args.subjectId) {
2827
- const response = await apiRequest(projectApiPath(args.target.projectId, "/subjects"), {}, args.target.baseUrl);
2828
- const subject = response.subjects.find((entry) => entry.id === args.subjectId);
2845
+ const subject = await readHostedSubjectSummary(args.target, args.subjectId);
2829
2846
  if (!subject) {
2830
2847
  throw new UsageError(`Base subject ${args.subjectId} was not found for the current benchmark.`);
2831
2848
  }
2832
- if (subject && (subject.status === "evaluated" || subject.eval != null)) {
2849
+ if (hostedSubjectIsEvaluated(subject)) {
2833
2850
  return args.subjectId;
2834
2851
  }
2835
2852
  }
2853
+ else {
2854
+ const activeSubject = await readEvaluatedActiveHostedSubject(args.target);
2855
+ if (activeSubject) {
2856
+ return activeSubject.id;
2857
+ }
2858
+ }
2836
2859
  const response = await apiRequest(projectApiPath(args.target.projectId, "/runs"), {
2837
2860
  method: "POST",
2838
2861
  body: {
@@ -2856,6 +2879,22 @@ async function ensureHostedImproveBaseSubject(args) {
2856
2879
  }
2857
2880
  return watched.subjectId;
2858
2881
  }
2882
+ async function readHostedSubjectSummary(target, subjectId) {
2883
+ const response = await apiRequest(projectApiPath(target.projectId, "/subjects"), {}, target.baseUrl);
2884
+ return response.subjects.find((entry) => entry.id === subjectId) ?? null;
2885
+ }
2886
+ async function readEvaluatedActiveHostedSubject(target) {
2887
+ const response = await apiRequest(projectApiPath(target.projectId), {}, target.baseUrl);
2888
+ const activeSubjectId = response.benchmark.activeSubjectId;
2889
+ if (!activeSubjectId) {
2890
+ return null;
2891
+ }
2892
+ const subject = await readHostedSubjectSummary(target, activeSubjectId);
2893
+ return subject && hostedSubjectIsEvaluated(subject) ? subject : null;
2894
+ }
2895
+ function hostedSubjectIsEvaluated(subject) {
2896
+ return subject.status === "evaluated" || subject.eval != null;
2897
+ }
2859
2898
  async function benchmarkList(argv, io) {
2860
2899
  const parsed = parseArgs(argv);
2861
2900
  rejectUnknownFlags(parsed, new Set(["json"]));
@@ -3475,7 +3514,20 @@ async function watchHostedRun(args) {
3475
3514
  const deadline = args.timeoutMs === undefined ? undefined : Date.now() + args.timeoutMs;
3476
3515
  let lastRun = null;
3477
3516
  while (true) {
3478
- const response = await apiRequest(projectApiPath(args.target.projectId, `/runs/${encodeURIComponent(args.runId)}`), {}, args.target.baseUrl);
3517
+ let response;
3518
+ try {
3519
+ response = await apiRequest(projectApiPath(args.target.projectId, `/runs/${encodeURIComponent(args.runId)}`), {}, args.target.baseUrl);
3520
+ }
3521
+ catch (error) {
3522
+ if (isTransientApiRequestError(error)) {
3523
+ if (deadline !== undefined && Date.now() > deadline) {
3524
+ throw new Error(`Timed out waiting for run ${args.runId}; last status was ${lastRun?.status ?? "unknown"} and the latest poll failed with ${error.message}.`);
3525
+ }
3526
+ await sleep(args.intervalMs);
3527
+ continue;
3528
+ }
3529
+ throw error;
3530
+ }
3479
3531
  lastRun = response.run;
3480
3532
  if (response.run.status === "finished") {
3481
3533
  return response.run;
@@ -3753,8 +3805,8 @@ async function apiRequest(apiPath, options = {}, baseUrlOverride) {
3753
3805
  });
3754
3806
  if (!response.ok) {
3755
3807
  const text = await response.text();
3756
- throw new Error(readResponseError(text) ||
3757
- `Request failed with status ${response.status}.`);
3808
+ throw new WorkbenchApiRequestError(response.status, readResponseError(text) ||
3809
+ `Request failed with status ${response.status}${response.statusText ? ` ${response.statusText}` : ""}.`, text);
3758
3810
  }
3759
3811
  return (await response.json());
3760
3812
  }
@@ -3853,9 +3905,17 @@ function readResponseError(text) {
3853
3905
  : "";
3854
3906
  }
3855
3907
  catch {
3856
- return text;
3908
+ const trimmed = text.trim();
3909
+ if (trimmed.startsWith("<")) {
3910
+ return "";
3911
+ }
3912
+ return trimmed;
3857
3913
  }
3858
3914
  }
3915
+ function isTransientApiRequestError(error) {
3916
+ return error instanceof WorkbenchApiRequestError
3917
+ && (error.status === 408 || error.status === 429 || error.status >= 500);
3918
+ }
3859
3919
  function readOAuthError(text) {
3860
3920
  try {
3861
3921
  const body = JSON.parse(text);
@@ -84,6 +84,7 @@ function skillBenchmarkSpec(name, agent) {
84
84
  " parallelism: 2",
85
85
  " judge:",
86
86
  ` use: ${agent}`,
87
+ ...agentDefaultWithLines(agent, " "),
87
88
  " criteria:",
88
89
  " - id: task_fit",
89
90
  " description: The response follows the task prompt and uses the skill's workflow.",
@@ -104,6 +105,7 @@ function skillSubjectSpec(name, agent) {
104
105
  " command: sh input/subject/prepare.sh",
105
106
  "run:",
106
107
  ` use: ${agent}`,
108
+ ...agentDefaultWithLines(agent, " "),
107
109
  "",
108
110
  ].join("\n");
109
111
  }
@@ -116,9 +118,19 @@ function optimizerSpec(name, editablePath, agent) {
116
118
  ` - ${editablePath}`,
117
119
  "improve:",
118
120
  ` use: ${agent}`,
121
+ ...agentDefaultWithLines(agent, " "),
119
122
  "",
120
123
  ].join("\n");
121
124
  }
125
+ function agentDefaultWithLines(agent, indent) {
126
+ if (agent !== "codex") {
127
+ return [];
128
+ }
129
+ return [
130
+ `${indent}with:`,
131
+ `${indent} model: gpt-5.5`,
132
+ ];
133
+ }
122
134
  function commandBenchmarkSpec(name) {
123
135
  return [
124
136
  "version: 3",
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@workbench-ai/workbench",
3
- "version": "0.0.47",
3
+ "version": "0.0.49",
4
4
  "type": "module",
5
5
  "repository": {
6
6
  "type": "git",
@@ -21,9 +21,9 @@
21
21
  ],
22
22
  "dependencies": {
23
23
  "yaml": "^2.8.2",
24
- "@workbench-ai/workbench-protocol": "0.0.47",
25
- "@workbench-ai/workbench-built-in-adapters": "0.0.47",
26
- "@workbench-ai/workbench-core": "0.0.47"
24
+ "@workbench-ai/workbench-built-in-adapters": "0.0.49",
25
+ "@workbench-ai/workbench-protocol": "0.0.49",
26
+ "@workbench-ai/workbench-core": "0.0.49"
27
27
  },
28
28
  "devDependencies": {
29
29
  "@tailwindcss/postcss": "^4.2.2",