@workbench-ai/workbench-built-in-adapters 0.0.67 → 0.0.69

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -244,9 +244,9 @@ function defaultWorkbenchAgentConfig(provider, providerName) {
244
244
  };
245
245
  }
246
246
  async function resolveAgentAuth(provider, providerSpec, workspaceRoot, agentHome, adapterAuth) {
247
- const candidate = adapterAuthProviderCandidate(adapterAuth.request, providerSpec.use) ??
247
+ const authConfig = adapterAuthProviderOption(adapterAuth.request, providerSpec.use) ??
248
248
  (provider.manifest.defaults.auth ?? {});
249
- const parsed = provider.schemas.auth.safeParse(candidate);
249
+ const parsed = provider.schemas.auth.safeParse(authConfig);
250
250
  if (!parsed.success) {
251
251
  throw new Error(`Agent provider "${provider.manifest.id}" auth is invalid: ${formatValidationIssues(parsed.error.issues)}`);
252
252
  }
@@ -254,7 +254,7 @@ async function resolveAgentAuth(provider, providerSpec, workspaceRoot, agentHome
254
254
  void agentHome;
255
255
  return { ...parsed.data };
256
256
  }
257
- function adapterAuthProviderCandidate(auth, providerName) {
257
+ function adapterAuthProviderOption(auth, providerName) {
258
258
  const record = jsonRecord(auth);
259
259
  const self = jsonRecord(record?.self);
260
260
  const adapters = jsonRecord(record?.adapters);
@@ -1 +1 @@
1
- {"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../src/execute.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAEV,IAAI,EAKL,MAAM,kCAAkC,CAAC;AAoB1C,OAAO,KAAK,EAEV,0BAA0B,EAG3B,MAAM,iBAAiB,CAAC;AAQzB,MAAM,WAAW,4CAA4C;IAC3D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,0BAA0B,CAAC;IAC3C,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,IAAI,CAAC;IAC1B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC;AAwDD,wBAAsB,qCAAqC,CACzD,IAAI,GAAE,4CAAiD,GACtD,OAAO,CAAC,IAAI,CAAC,CAiDf"}
1
+ {"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../src/execute.ts"],"names":[],"mappings":"AAKA,OAAO,KAAK,EAEV,IAAI,EAKL,MAAM,kCAAkC,CAAC;AAqB1C,OAAO,KAAK,EAEV,0BAA0B,EAG3B,MAAM,iBAAiB,CAAC;AAOzB,MAAM,WAAW,4CAA4C;IAC3D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,0BAA0B,CAAC;IAC3C,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,IAAI,CAAC;IAC1B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC;AA2DD,wBAAsB,qCAAqC,CACzD,IAAI,GAAE,4CAAiD,GACtD,OAAO,CAAC,IAAI,CAAC,CAyDf"}
package/dist/execute.js CHANGED
@@ -2,17 +2,18 @@ import { spawn } from "node:child_process";
2
2
  import { promises as fs } from "node:fs";
3
3
  import os from "node:os";
4
4
  import path from "node:path";
5
- import { jsonRecord, normalizeRelativePath, readSurfaceFiles, writeSurfaceFiles, } from "@workbench-ai/workbench-core";
6
- import { ensureWorkbenchAdapterOutputDir, readWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, runWorkbenchRuntimeOperationSequence, writeWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
5
+ import { createWorkbenchExecutionEventPublisher, jsonRecord, normalizeRelativePath, publishCommandStepEvent, readSurfaceFiles, writeSurfaceFiles, } from "@workbench-ai/workbench-core";
6
+ import { ensureWorkbenchAdapterOutputDir, readWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, writeWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
7
7
  import YAML from "yaml";
8
- import { isWorkbenchBuiltInAdapterId, adapterCommandName, } from "./manifests.js";
8
+ import { isWorkbenchBuiltInAdapterId, } from "./manifests.js";
9
9
  import { importWorkbenchRuntime } from "./runtime.js";
10
10
  const DIRECT_ADAPTER_HANDLERS = {
11
11
  command: executeCommandAdapterRequest,
12
12
  tests: executeTestsEngineRequest,
13
13
  workbench: executeWorkbenchEngineRequest,
14
14
  };
15
- const TASK_CONTROL_FILE = "task.yaml";
15
+ const CASE_CONTROL_FILE = "case.yaml";
16
+ const COMMAND_SKILL_PATCH_FILE = "skill-patch.json";
16
17
  const DEFAULT_RUBRIC_PARALLELISM = 4;
17
18
  export async function executeWorkbenchBuiltInAdapterCommand(args = {}) {
18
19
  const request = await readWorkbenchAdapterOperationRequest(args.requestPath);
@@ -32,10 +33,19 @@ export async function executeWorkbenchBuiltInAdapterCommand(args = {}) {
32
33
  adapterAuthRoot: args.adapterAuthRoot,
33
34
  adapterAuthRequest: args.adapterAuthRequest ?? request.auth,
34
35
  adapterAuthEnv: args.adapterAuthEnv,
36
+ eventPublisher: eventPublisherForAdapterRequest(request),
35
37
  };
36
38
  const directHandler = DIRECT_ADAPTER_HANDLERS[adapterId];
37
39
  if (directHandler) {
38
- await directHandler(request);
40
+ await publishDirectAdapterStep(agentOptions.eventPublisher, adapterId, request, "started");
41
+ try {
42
+ await directHandler(request);
43
+ await publishDirectAdapterStep(agentOptions.eventPublisher, adapterId, request, "succeeded");
44
+ }
45
+ catch (error) {
46
+ await publishDirectAdapterStep(agentOptions.eventPublisher, adapterId, request, "failed", error);
47
+ throw error;
48
+ }
39
49
  return;
40
50
  }
41
51
  if (adapterId === "rubric") {
@@ -48,17 +58,34 @@ export async function executeWorkbenchBuiltInAdapterCommand(args = {}) {
48
58
  if (isBuiltInAgentAdapterId(adapterId)) {
49
59
  const workload = workloadFromAdapterOperationRequest(request);
50
60
  const agent = builtInAgentSpecFromRequest(request);
51
- if (request.operation === "candidate.improve") {
52
- await writeAgentCandidateRevisionOutput(request, workload, agent, agentOptions);
61
+ if (request.operation === "skill.improve") {
62
+ await writeAgentSkillRevisionOutput(request, workload, agent, agentOptions);
53
63
  return;
54
64
  }
55
- if (request.operation === "candidate.run") {
56
- await writeAgentCandidateOutput(request, workload, agent, agentOptions);
65
+ if (request.operation === "skill.run") {
66
+ await writeAgentSkillOutput(request, workload, agent, agentOptions);
57
67
  return;
58
68
  }
59
69
  throw new Error(`Agent adapter ${adapterId} cannot handle ${request.operation}.`);
60
70
  }
61
71
  }
72
+ async function publishDirectAdapterStep(publisher, adapterId, request, status, error) {
73
+ await publishCommandStepEvent(publisher, {
74
+ step: `${adapterId}.${request.operation}`,
75
+ status,
76
+ role: directAdapterProgressRole(request.operation),
77
+ ...(error ? { error: error instanceof Error ? error.message : String(error) } : {}),
78
+ });
79
+ }
80
+ function directAdapterProgressRole(operation) {
81
+ if (operation === "skill.run") {
82
+ return "runner";
83
+ }
84
+ if (operation === "skill.improve") {
85
+ return "improver";
86
+ }
87
+ return "engine";
88
+ }
62
89
  async function executeWorkbenchEngineRequest(request) {
63
90
  if (request.operation === "engine.resolve") {
64
91
  await executeWorkbenchEngineResolveRequest(request);
@@ -71,13 +98,13 @@ async function executeWorkbenchEngineRequest(request) {
71
98
  throw new Error(`Workbench engine adapter cannot handle ${request.operation}.`);
72
99
  }
73
100
  async function executeWorkbenchEngineResolveRequest(request) {
74
- const configuredPath = workbenchEngineTasksPath(request);
101
+ const configuredPath = workbenchEngineCasesPath(request);
75
102
  const sourcePath = path.resolve(request.paths.workspace, configuredPath);
76
103
  const stat = await fs.stat(sourcePath).catch(() => null);
77
104
  if (!stat?.isDirectory()) {
78
- throw new Error(`Workbench engine tasks path is not a directory: ${sourcePath}`);
105
+ throw new Error(`Workbench engine cases path is not a directory: ${sourcePath}`);
79
106
  }
80
- const cases = await readEngineCasesFromWorkbenchTaskRoot(sourcePath);
107
+ const cases = await readEngineCasesFromWorkbenchCaseRoot(sourcePath);
81
108
  await writeWorkbenchAdapterOperationResult(request.paths.output, {
82
109
  protocol: "workbench.adapter-result.v1",
83
110
  operation: "engine.resolve",
@@ -91,187 +118,20 @@ async function executeWorkbenchEngineResolveRequest(request) {
91
118
  });
92
119
  }
93
120
  async function executeWorkbenchEngineRunRequest(request) {
94
- const outcome = await workbenchEngineGradingIsolation(request) === "separate"
95
- ? await runWorkbenchEngineSeparateGrading(request)
96
- : await runWorkbenchEngineSharedGrading(request);
97
- if (!outcome.result) {
98
- throw new Error("Workbench engine scoring completed without an engine result.");
99
- }
100
- await writeSurfaceFiles(request.paths.output, outcome.files.map((file) => remapRuntimeControlTraceFile(request, file)));
101
- const usage = await workbenchEngineOutcomeUsage(outcome);
102
- await writeWorkbenchAdapterOperationResult(request.paths.output, {
103
- protocol: "workbench.adapter-result.v1",
104
- operation: "engine.run",
105
- ok: true,
106
- value: outcome.result,
107
- ...(usage ? { usage } : {}),
108
- ...(outcome.summary !== undefined ? { summary: outcome.summary } : {}),
109
- ...(outcome.feedback !== undefined ? { feedback: outcome.feedback } : {}),
110
- });
121
+ void request;
122
+ throw new Error("Workbench engine.run is no longer an orchestration adapter. Run the selected skill in core and invoke the score adapter directly.");
111
123
  }
112
- async function workbenchEngineOutcomeUsage(outcome) {
113
- const runtime = await importWorkbenchRuntime();
114
- const operationUsage = outcome.usage
115
- ? undefined
116
- : runtime.mergeUsageSummaries(outcome.operationResults.map((result) => {
117
- if (result.operation === "candidate.run") {
118
- return runtime.assignUsageRole("runner", result.usage);
119
- }
120
- if (result.operation === "engine.run") {
121
- return runtime.assignUsageRole("engine", result.usage);
122
- }
123
- return result.usage;
124
- }));
125
- const runtimeUsage = runtime.mergeUsageSummaries([outcome.usage, operationUsage]);
126
- const resultUsage = runtimeUsage?.engine
127
- ? undefined
128
- : runtime.assignUsageRole("engine", outcome.result?.usage);
129
- return runtime.mergeUsageSummaries([runtimeUsage, resultUsage]);
130
- }
131
- function workbenchEngineTasksPath(request) {
124
+ function workbenchEngineCasesPath(request) {
132
125
  const config = adapterCommandConfigRecord(request);
133
- const tasks = config.tasks;
134
- if (tasks === undefined) {
135
- return "tasks";
136
- }
137
- const taskConfig = jsonRecord(tasks);
138
- if (typeof taskConfig.path === "string" && taskConfig.path.trim().length > 0) {
139
- return taskConfig.path;
140
- }
141
- throw new Error("Workbench engine tasks must be an object with path.");
142
- }
143
- function workbenchEngineScoreInvocation(request) {
144
- const score = jsonRecord(adapterCommandConfigRecord(request).score);
145
- if (!score || typeof score.use !== "string" || score.use.length === 0) {
146
- throw new Error("Workbench engine requires invocation.with.score.use.");
147
- }
148
- return {
149
- use: score.use,
150
- with: (score.with ?? {}),
151
- ...(score.auth !== undefined ? { auth: score.auth } : {}),
152
- command: typeof score.command === "string" && score.command.length > 0
153
- ? score.command
154
- : adapterCommandName(score.use),
155
- };
156
- }
157
- function workbenchEngineCandidateInvocation(request) {
158
- const candidate = request.context?.candidate?.run;
159
- if (!candidate?.use || !candidate.command) {
160
- throw new Error("Workbench engine requires context.candidate.run.use and context.candidate.run.command.");
161
- }
162
- return {
163
- use: candidate.use,
164
- with: (candidate.with ?? {}),
165
- ...(candidate.auth !== undefined ? { auth: candidate.auth } : {}),
166
- command: candidate.command,
167
- };
168
- }
169
- async function workbenchEngineGradingIsolation(request) {
170
- const grading = jsonRecord(adapterCommandConfigRecord(request).grading);
171
- const isolation = grading?.isolation;
172
- if (isolation !== undefined &&
173
- isolation !== "shared" &&
174
- isolation !== "separate") {
175
- throw new Error("Workbench engine grading.isolation must be shared or separate.");
176
- }
177
- if (await workbenchEnginePrivateFilesPresent(request)) {
178
- return "separate";
179
- }
180
- return isolation ?? "shared";
181
- }
182
- async function workbenchEnginePrivateFilesPresent(request) {
183
- if (!request.paths.enginePrivate) {
184
- return false;
185
- }
186
- const files = await readOptionalSurfaceFiles(request.paths.enginePrivate);
187
- return files.length > 0;
188
- }
189
- async function runWorkbenchEngineSharedGrading(request) {
190
- const inputs = await workbenchEngineRuntimeInputs(request);
191
- const candidate = workbenchEngineCandidateInvocation(request);
192
- const score = workbenchEngineScoreInvocation(request);
193
- const result = await runWorkbenchRuntimeOperationSequence({
194
- inputs,
195
- prepare: true,
196
- operations: [
197
- { label: "candidate", operation: "candidate.run", invocation: candidate },
198
- { label: "score", operation: "engine.run", invocation: score },
199
- ],
200
- });
201
- assertRuntimeControlResultOk(result, "Workbench shared grading");
202
- return result;
203
- }
204
- async function runWorkbenchEngineSeparateGrading(request) {
205
- const inputs = await workbenchEngineRuntimeInputs(request);
206
- const candidate = workbenchEngineCandidateInvocation(request);
207
- const score = workbenchEngineScoreInvocation(request);
208
- const runtime = await importWorkbenchRuntime();
209
- const runner = await runWorkbenchRuntimeOperationSequence({
210
- inputs: {
211
- candidate: inputs.candidate,
212
- case: inputs.case,
213
- traces: inputs.traces,
214
- },
215
- prepare: true,
216
- collectWorkspace: true,
217
- operations: [
218
- { label: "candidate", operation: "candidate.run", invocation: candidate },
219
- ],
220
- });
221
- assertRuntimeControlResultOk(runner, "Workbench separate runner");
222
- const grader = await runWorkbenchRuntimeOperationSequence({
223
- inputs: {
224
- candidate: inputs.candidate,
225
- case: inputs.case,
226
- enginePrivate: inputs.enginePrivate,
227
- traces: inputs.traces,
228
- workspace: runner.workspaceFiles ?? [],
229
- output: runner.files.filter((file) => !runtime.isWorkbenchInternalOutputPath(file.path)),
230
- },
231
- prepare: false,
232
- operations: [
233
- { label: "score", operation: "engine.run", invocation: score },
234
- ],
235
- });
236
- assertRuntimeControlResultOk(grader, "Workbench separate grader");
237
- return {
238
- ...grader,
239
- files: dedupeSurfaceFiles([...runner.files, ...grader.files]),
240
- fileChanges: [...new Set([...runner.fileChanges, ...grader.fileChanges])].sort(),
241
- usage: runtime.mergeUsageSummaries([runner.usage, grader.usage]),
242
- operationResults: [...runner.operationResults, ...grader.operationResults],
243
- };
244
- }
245
- async function workbenchEngineRuntimeInputs(request) {
246
- const [candidate, caseFiles, enginePrivate, traces] = await Promise.all([
247
- readOptionalSurfaceFiles(request.paths.candidate),
248
- readOptionalSurfaceFiles(request.paths.case),
249
- readOptionalSurfaceFiles(request.paths.enginePrivate),
250
- readOptionalSurfaceFiles(request.paths.traces),
251
- ]);
252
- return {
253
- candidate,
254
- case: caseFiles,
255
- enginePrivate,
256
- traces,
257
- };
258
- }
259
- async function readOptionalSurfaceFiles(root) {
260
- if (!root) {
261
- return [];
126
+ const cases = config.cases;
127
+ if (cases === undefined) {
128
+ return "cases";
262
129
  }
263
- return await readSurfaceFiles(root).catch((error) => {
264
- if (error.code === "ENOENT") {
265
- return [];
266
- }
267
- throw error;
268
- });
269
- }
270
- function assertRuntimeControlResultOk(result, label) {
271
- if (result.ok) {
272
- return;
130
+ const caseConfig = jsonRecord(cases);
131
+ if (typeof caseConfig.path === "string" && caseConfig.path.trim().length > 0) {
132
+ return caseConfig.path;
273
133
  }
274
- throw new Error(`${label} failed${result.error ? `: ${result.error}` : "."}`);
134
+ throw new Error("Workbench engine cases must be an object with path.");
275
135
  }
276
136
  function dedupeSurfaceFiles(files) {
277
137
  const byPath = new Map();
@@ -284,36 +144,18 @@ function dedupeSurfaceFiles(files) {
284
144
  }
285
145
  return [...byPath.values()].sort((left, right) => left.path.localeCompare(right.path));
286
146
  }
287
- function remapRuntimeControlTraceFile(request, file) {
288
- const normalized = normalizeRelativePath(file.path);
289
- if (!normalized.startsWith(".workbench/traces/")) {
290
- return { ...file, path: normalized };
291
- }
292
- const segments = normalized.split("/");
293
- const rest = segments.length >= 6
294
- ? segments.slice(5)
295
- : segments.length >= 3
296
- ? segments.slice(3)
297
- : [];
298
- if (rest.length === 0) {
299
- return { ...file, path: normalized };
300
- }
301
- return {
302
- ...file,
303
- path: `.workbench/traces/${request.jobId ?? request.id}/${rest.join("/")}`,
304
- };
305
- }
306
147
  function safeInternalPathSegment(value) {
307
148
  const safe = value.replace(/[^a-z0-9._-]+/giu, "_").replace(/^_+|_+$/gu, "");
308
149
  return safe || "nested";
309
150
  }
310
151
  async function executeCommandAdapterRequest(request) {
311
152
  const command = requiredAdapterCommandString(request, "command");
312
- const before = request.operation === "candidate.improve"
313
- ? await snapshotEditableCandidateWorkspace(request)
153
+ await ensureRunSkillDirectories(request);
154
+ const before = request.operation === "skill.improve"
155
+ ? await snapshotEditableSkillWorkspace(request)
314
156
  : null;
315
157
  try {
316
- await runAdapterShellCommand(command, request.paths.workspace);
158
+ await runAdapterShellCommand(command, commandAdapterWorkingDirectory(request), commandAdapterEnvironment(request));
317
159
  if (request.operation === "engine.run") {
318
160
  await requireCommandScoreResult(request);
319
161
  return;
@@ -336,6 +178,7 @@ async function executeTestsEngineRequest(request) {
336
178
  if (request.operation !== "engine.run") {
337
179
  throw new Error(`Tests adapter cannot handle ${request.operation}.`);
338
180
  }
181
+ await ensureRunSkillDirectories(request);
339
182
  const testsRoot = requiredRequestPath(request.paths.enginePrivate, "paths.enginePrivate");
340
183
  const verifierRoot = testsVerifierOutputDir(request.paths.output);
341
184
  await fs.rm(verifierRoot, { recursive: true, force: true }).catch(() => undefined);
@@ -348,7 +191,12 @@ async function executeTestsEngineRequest(request) {
348
191
  throw new Error(`Tests engine requires ${path.join(testsRoot, "test.sh")}.`);
349
192
  }
350
193
  await runAdapterShellCommand(`sh ${shellQuote(script)}`, request.paths.workspace, {
194
+ SKILL_DIR: request.paths.skill ?? path.join(request.paths.workspace, "input", "skills", "primary"),
195
+ SKILLS_DIR: request.paths.skills ?? path.join(request.paths.workspace, "input", "skills"),
196
+ CASE_DIR: request.paths.case ?? path.join(request.paths.workspace, "input", "case"),
197
+ OUTPUT_DIR: request.paths.output,
351
198
  WORKBENCH_TESTS_VERIFIER_DIR: verifierRoot,
199
+ WORKBENCH_CASE_ID: request.context?.attempt?.caseId ?? "current",
352
200
  });
353
201
  const result = await readTestsResult({
354
202
  verifierRoot,
@@ -387,16 +235,100 @@ async function runAdapterShellCommand(command, cwd, env = {}) {
387
235
  });
388
236
  });
389
237
  }
238
+ function commandAdapterWorkingDirectory(request) {
239
+ return request.operation === "skill.improve"
240
+ ? requiredRequestPath(request.paths.skill, "paths.skill")
241
+ : request.paths.workspace;
242
+ }
243
+ function commandAdapterEnvironment(request) {
244
+ return {
245
+ SKILL_DIR: request.paths.skill ?? path.join(request.paths.workspace, "input", "skills", "primary"),
246
+ SKILLS_DIR: request.paths.skills ?? path.join(request.paths.workspace, "input", "skills"),
247
+ CASE_DIR: request.paths.case ?? path.join(request.paths.workspace, "input", "case"),
248
+ TRACE_DIR: request.paths.traces ?? path.join(request.paths.workspace, "input", "traces"),
249
+ OUTPUT_DIR: request.paths.output,
250
+ WORKBENCH_SKILL_PATCH: commandSkillPatchPath(request),
251
+ WORKBENCH_CASE_ID: request.context?.attempt?.caseId ?? "current",
252
+ };
253
+ }
254
+ async function ensureRunSkillDirectories(request) {
255
+ if (request.operation === "skill.improve") {
256
+ return;
257
+ }
258
+ await Promise.all([
259
+ request.paths.skills ? fs.mkdir(request.paths.skills, { recursive: true }) : Promise.resolve(),
260
+ request.paths.skill ? fs.mkdir(request.paths.skill, { recursive: true }) : Promise.resolve(),
261
+ ]);
262
+ }
263
+ function commandSkillPatchPath(request) {
264
+ return path.join(request.paths.output, COMMAND_SKILL_PATCH_FILE);
265
+ }
266
+ async function readSkillPatchFile(filePath) {
267
+ if (!await fileExists(filePath)) {
268
+ return null;
269
+ }
270
+ const record = jsonRecord(JSON.parse(await fs.readFile(filePath, "utf8")));
271
+ const rawFiles = Array.isArray(record.files) ? record.files : [];
272
+ const files = rawFiles.map((entry, index) => {
273
+ if (!isPatchSurfaceSnapshotFile(entry)) {
274
+ throw new Error(`Skill patch file ${filePath} files[${index}] must be an object with string path and content fields, got: ${describePatchEntry(entry)}.`);
275
+ }
276
+ return {
277
+ ...entry,
278
+ path: normalizeRelativePath(entry.path),
279
+ };
280
+ });
281
+ if (record.fileChanges !== undefined && !Array.isArray(record.fileChanges)) {
282
+ throw new Error(`Skill patch file ${filePath} fileChanges must be an array of strings when provided.`);
283
+ }
284
+ const fileChanges = Array.isArray(record.fileChanges)
285
+ ? record.fileChanges.map((entry, index) => {
286
+ if (typeof entry !== "string") {
287
+ throw new Error(`Skill patch file ${filePath} fileChanges[${index}] must be a string path, got: ${describePatchEntry(entry)}.`);
288
+ }
289
+ return normalizeRelativePath(entry);
290
+ })
291
+ : files.map((file) => file.path);
292
+ return {
293
+ files,
294
+ fileChanges,
295
+ ...(typeof record.summary === "string" ? { summary: record.summary } : {}),
296
+ ...(record.feedback !== undefined ? { feedback: record.feedback } : {}),
297
+ };
298
+ }
299
+ function describePatchEntry(value) {
300
+ if (value === null) {
301
+ return "null";
302
+ }
303
+ if (Array.isArray(value)) {
304
+ return "an array";
305
+ }
306
+ if (typeof value === "object") {
307
+ const record = value;
308
+ const keys = Object.keys(record);
309
+ return `an object with key${keys.length === 1 ? "" : "s"} [${keys.join(", ")}]`;
310
+ }
311
+ return `a ${typeof value}`;
312
+ }
313
+ function isPatchSurfaceSnapshotFile(value) {
314
+ if (!value || typeof value !== "object" || Array.isArray(value)) {
315
+ return false;
316
+ }
317
+ const record = value;
318
+ return typeof record.path === "string" && typeof record.content === "string";
319
+ }
390
320
  async function writeOperationOkUnlessPresent(request, beforeRoot) {
391
321
  if (await fileExists(workbenchAdapterOperationResultPath(request.paths.output))) {
392
322
  return;
393
323
  }
394
- if (request.operation === "candidate.improve") {
395
- const patch = await createCandidatePatchFromWorkspace({
396
- beforeRoot: beforeRoot ?? requiredRequestPath(request.paths.candidate, "paths.candidate"),
397
- afterRoot: request.paths.workspace,
398
- edits: request.context?.improve?.edits ?? [],
399
- });
324
+ if (request.operation === "skill.improve") {
325
+ const skillRoot = requiredRequestPath(request.paths.skill, "paths.skill");
326
+ const patch = await readSkillPatchFile(commandSkillPatchPath(request)) ??
327
+ await createSkillPatchFromWorkspace({
328
+ beforeRoot: beforeRoot ?? skillRoot,
329
+ afterRoot: skillRoot,
330
+ edits: request.context?.improve?.edits ?? [],
331
+ });
400
332
  await writeWorkbenchAdapterOperationResult(request.paths.output, {
401
333
  protocol: "workbench.adapter-result.v1",
402
334
  operation: request.operation,
@@ -411,10 +343,10 @@ async function writeOperationOkUnlessPresent(request, beforeRoot) {
411
343
  ok: true,
412
344
  });
413
345
  }
414
- async function snapshotEditableCandidateWorkspace(request) {
415
- const root = await fs.mkdtemp(path.join(os.tmpdir(), "workbench-candidate-before-"));
346
+ async function snapshotEditableSkillWorkspace(request) {
347
+ const root = await fs.mkdtemp(path.join(os.tmpdir(), "workbench-skill-before-"));
416
348
  const edits = request.context?.improve?.edits ?? [];
417
- const files = await readEditableCandidateWorkspaceFiles(request.paths.workspace, edits);
349
+ const files = await readEditableSkillWorkspaceFiles(requiredRequestPath(request.paths.skill, "paths.skill"), edits);
418
350
  await writeSurfaceFiles(root, files);
419
351
  return {
420
352
  root,
@@ -423,7 +355,7 @@ async function snapshotEditableCandidateWorkspace(request) {
423
355
  },
424
356
  };
425
357
  }
426
- async function readEditableCandidateWorkspaceFiles(root, edits) {
358
+ async function readEditableSkillWorkspaceFiles(root, edits) {
427
359
  const editPaths = edits
428
360
  .map(normalizeRelativePath)
429
361
  .filter((filePath) => !isRuntimeWorkspacePath(filePath));
@@ -431,7 +363,7 @@ async function readEditableCandidateWorkspaceFiles(root, edits) {
431
363
  return [];
432
364
  }
433
365
  const files = await readSurfaceFiles(root);
434
- return dedupeSurfaceFiles(files.filter((file) => isCandidateEditPath(file.path, editPaths) &&
366
+ return dedupeSurfaceFiles(files.filter((file) => isAllowedSkillEditPath(file.path, editPaths) &&
435
367
  !isRuntimeWorkspacePath(file.path)));
436
368
  }
437
369
  async function firstExistingFile(files) {
@@ -449,64 +381,63 @@ function requiredRequestPath(value, label) {
449
381
  }
450
382
  return value;
451
383
  }
452
- async function readEngineCasesFromWorkbenchTaskRoot(tasksRoot) {
453
- const taskDirs = await listWorkbenchTaskDirectories(tasksRoot);
454
- if (taskDirs.length === 0) {
455
- throw new Error(`Engine resolve has no Workbench task packages: ${tasksRoot}`);
384
+ async function readEngineCasesFromWorkbenchCaseRoot(casesRoot) {
385
+ const caseDirs = await listWorkbenchCaseDirectories(casesRoot);
386
+ if (caseDirs.length === 0) {
387
+ throw new Error(`Engine resolve has no Workbench case packages: ${casesRoot}`);
456
388
  }
457
- return await Promise.all(taskDirs.map(async (taskDir) => readWorkbenchEngineCase({
458
- taskDir,
459
- id: path.basename(taskDir),
389
+ return await Promise.all(caseDirs.map(async (caseDir) => readWorkbenchEngineCase({
390
+ caseDir,
391
+ id: path.basename(caseDir),
460
392
  })));
461
393
  }
462
- async function listWorkbenchTaskDirectories(root) {
463
- if (await fileExists(path.join(root, TASK_CONTROL_FILE))) {
464
- throw new Error(`Workbench engine tasks root must contain task directories, not a direct ${TASK_CONTROL_FILE}: ${root}`);
394
+ async function listWorkbenchCaseDirectories(root) {
395
+ if (await fileExists(path.join(root, CASE_CONTROL_FILE))) {
396
+ throw new Error(`Workbench engine cases root must contain case directories, not a direct ${CASE_CONTROL_FILE}: ${root}`);
465
397
  }
466
398
  const entries = await fs.readdir(root, { withFileTypes: true });
467
- const tasks = [];
399
+ const cases = [];
468
400
  for (const entry of entries) {
469
401
  if (!entry.isDirectory()) {
470
402
  continue;
471
403
  }
472
- const taskDir = path.join(root, entry.name);
473
- if (await fileExists(path.join(taskDir, TASK_CONTROL_FILE))) {
474
- tasks.push(taskDir);
404
+ const caseDir = path.join(root, entry.name);
405
+ if (await fileExists(path.join(caseDir, CASE_CONTROL_FILE))) {
406
+ cases.push(caseDir);
475
407
  }
476
408
  }
477
- return tasks.sort((left, right) => left.localeCompare(right));
409
+ return cases.sort((left, right) => left.localeCompare(right));
478
410
  }
479
411
  async function readWorkbenchEngineCase(args) {
480
- const sourceFiles = await readSurfaceFiles(args.taskDir);
481
- const taskFile = sourceFiles.find((file) => normalizeRelativePath(file.path) === TASK_CONTROL_FILE && file.encoding === "utf8");
482
- if (!taskFile) {
483
- throw new Error(`Task ${args.id} is missing ${TASK_CONTROL_FILE}.`);
484
- }
485
- const parsed = YAML.parse(taskFile.content);
486
- const taskRecord = jsonRecord(parsed);
487
- if (taskRecord.version !== 3) {
488
- throw new Error(`Task ${args.id} ${TASK_CONTROL_FILE} version must be 3.`);
489
- }
490
- if (typeof taskRecord.task !== "string" || taskRecord.task.trim().length === 0) {
491
- throw new Error(`Task ${args.id} ${TASK_CONTROL_FILE} must include a task string.`);
492
- }
493
- const unsupportedTaskFields = Object.keys(taskRecord)
494
- .filter((key) => !["version", "task", "split", "files", "tests", "solution", "environment"].includes(key));
495
- if (unsupportedTaskFields.length > 0) {
496
- throw new Error(`Task ${args.id} ${TASK_CONTROL_FILE} has unsupported field${unsupportedTaskFields.length === 1 ? "" : "s"}: ${unsupportedTaskFields.join(", ")}.`);
497
- }
498
- if (taskRecord.split !== undefined && (typeof taskRecord.split !== "string" || taskRecord.split.trim().length === 0)) {
499
- throw new Error(`Task ${args.id} ${TASK_CONTROL_FILE} split must be a non-empty string when provided.`);
500
- }
501
- const publicPrefix = taskDirectoryPrefix(taskRecord.files, "files", args.id);
502
- const testsPrefix = taskDirectoryPrefix(taskRecord.tests, "tests", args.id);
503
- const solutionPrefix = taskDirectoryPrefix(taskRecord.solution, "solution", args.id);
504
- const publicFiles = stripTaskDirectory(sourceFiles, publicPrefix);
412
+ const sourceFiles = await readSurfaceFiles(args.caseDir);
413
+ const caseFile = sourceFiles.find((file) => normalizeRelativePath(file.path) === CASE_CONTROL_FILE && file.encoding === "utf8");
414
+ if (!caseFile) {
415
+ throw new Error(`Case ${args.id} is missing ${CASE_CONTROL_FILE}.`);
416
+ }
417
+ const caseRecord = jsonRecord(YAML.parse(caseFile.content));
418
+ if (caseRecord.version !== 1) {
419
+ throw new Error(`Case ${args.id} ${CASE_CONTROL_FILE} version must be 1.`);
420
+ }
421
+ if (typeof caseRecord.case !== "string" || caseRecord.case.trim().length === 0) {
422
+ throw new Error(`Case ${args.id} ${CASE_CONTROL_FILE} must include a case string.`);
423
+ }
424
+ const unsupportedCaseFields = Object.keys(caseRecord)
425
+ .filter((key) => !["version", "case", "split", "files", "tests", "solution", "environment"].includes(key));
426
+ if (unsupportedCaseFields.length > 0) {
427
+ throw new Error(`Case ${args.id} ${CASE_CONTROL_FILE} has unsupported field${unsupportedCaseFields.length === 1 ? "" : "s"}: ${unsupportedCaseFields.join(", ")}.`);
428
+ }
429
+ if (caseRecord.split !== undefined && (typeof caseRecord.split !== "string" || caseRecord.split.trim().length === 0)) {
430
+ throw new Error(`Case ${args.id} ${CASE_CONTROL_FILE} split must be a non-empty string when provided.`);
431
+ }
432
+ const publicPrefix = caseDirectoryPrefix(caseRecord.files, "files", args.id);
433
+ const testsPrefix = caseDirectoryPrefix(caseRecord.tests, "tests", args.id);
434
+ const solutionPrefix = caseDirectoryPrefix(caseRecord.solution, "solution", args.id);
435
+ const publicFiles = stripCaseDirectory(sourceFiles, publicPrefix);
505
436
  const privateFiles = [
506
- ...stripTaskDirectory(sourceFiles, testsPrefix),
507
- ...stripTaskDirectory(sourceFiles, solutionPrefix),
437
+ ...stripCaseDirectory(sourceFiles, testsPrefix),
438
+ ...stripCaseDirectory(sourceFiles, solutionPrefix),
508
439
  ].sort((left, right) => left.path.localeCompare(right.path));
509
- assertWorkbenchTaskPackageLayout(args.id, sourceFiles, [
440
+ assertWorkbenchCasePackageLayout(args.id, sourceFiles, [
510
441
  publicPrefix,
511
442
  testsPrefix,
512
443
  solutionPrefix,
@@ -516,10 +447,10 @@ async function readWorkbenchEngineCase(args) {
516
447
  id: normalizeRelativePath(args.id),
517
448
  case: {
518
449
  version: 3,
519
- prompt: taskRecord.task,
520
- ...(typeof taskRecord.split === "string" ? { split: taskRecord.split.trim() } : {}),
521
- ...(taskRecord.environment !== undefined
522
- ? { environment: taskRecord.environment }
450
+ prompt: caseRecord.case,
451
+ ...(typeof caseRecord.split === "string" ? { split: caseRecord.split.trim() } : {}),
452
+ ...(caseRecord.environment !== undefined
453
+ ? { environment: caseRecord.environment }
523
454
  : {}),
524
455
  },
525
456
  files: {
@@ -529,26 +460,26 @@ async function readWorkbenchEngineCase(args) {
529
460
  },
530
461
  };
531
462
  }
532
- function taskDirectoryPrefix(value, fallback, taskId) {
463
+ function caseDirectoryPrefix(value, fallback, caseId) {
533
464
  if (value === undefined) {
534
465
  return `${fallback}/`;
535
466
  }
536
467
  const record = jsonRecord(value);
537
468
  if (typeof record.path !== "string" || record.path.trim().length === 0) {
538
- throw new Error(`Task ${taskId} ${TASK_CONTROL_FILE} path config must include a path string.`);
469
+ throw new Error(`Case ${caseId} ${CASE_CONTROL_FILE} path config must include a path string.`);
539
470
  }
540
471
  return `${normalizeRelativePath(record.path)}/`;
541
472
  }
542
- function assertWorkbenchTaskPackageLayout(taskId, files, allowedPrefixes) {
473
+ function assertWorkbenchCasePackageLayout(caseId, files, allowedPrefixes) {
543
474
  const invalid = files
544
475
  .map((file) => normalizeRelativePath(file.path))
545
- .filter((filePath) => filePath !== TASK_CONTROL_FILE &&
476
+ .filter((filePath) => filePath !== CASE_CONTROL_FILE &&
546
477
  !allowedPrefixes.some((prefix) => filePath.startsWith(prefix)));
547
478
  if (invalid.length > 0) {
548
- throw new Error(`Task ${taskId} contains unsupported file${invalid.length === 1 ? "" : "s"} outside task.yaml or declared task directories: ${invalid.join(", ")}`);
479
+ throw new Error(`Case ${caseId} contains unsupported file${invalid.length === 1 ? "" : "s"} outside case.yaml or declared case directories: ${invalid.join(", ")}`);
549
480
  }
550
481
  }
551
- function stripTaskDirectory(files, prefix) {
482
+ function stripCaseDirectory(files, prefix) {
552
483
  return files.flatMap((file) => {
553
484
  const normalized = normalizeRelativePath(file.path);
554
485
  if (!normalized.startsWith(prefix)) {
@@ -578,7 +509,9 @@ async function readTestsResult(args) {
578
509
  }
579
510
  return normalizeTestsResult({ reward: score }, args.caseId);
580
511
  }
581
- throw new Error("Tests engine did not find reward.json or reward.txt under its verifier output directory.");
512
+ throw new Error("Tests engine did not find reward.json or reward.txt under its verifier output directory " +
513
+ `(${args.verifierRoot}). The tests script must write a reward to ` +
514
+ "$WORKBENCH_TESTS_VERIFIER_DIR/reward.json or $WORKBENCH_TESTS_VERIFIER_DIR/reward.txt.");
582
515
  }
583
516
  function testsVerifierOutputDir(outputRoot) {
584
517
  return path.join(outputRoot, ".workbench", "internal", "verifier");
@@ -643,17 +576,17 @@ function workloadFromAdapterOperationRequest(request) {
643
576
  const attempt = context.attempt ?? {};
644
577
  return {
645
578
  job: { id: request.jobId ?? request.id },
646
- benchmark: {
647
- name: context.benchmark?.name ?? "",
648
- description: context.benchmark?.description ?? "",
579
+ eval: {
580
+ name: context.eval?.name ?? "",
581
+ description: context.eval?.description ?? "",
649
582
  },
650
- candidate: {
651
- path: context.candidate?.path ?? "",
583
+ skill: {
584
+ path: context.skill?.path ?? "",
652
585
  },
653
586
  improve: {
654
587
  edits: context.improve?.edits ?? [],
655
588
  },
656
- candidateId: context.candidate?.id ?? "",
589
+ versionId: context.skill?.id ?? "",
657
590
  attemptIndex: attempt.attemptIndex ?? 0,
658
591
  sampleIndex: attempt.sampleIndex ?? 0,
659
592
  caseId: attempt.caseId ?? "",
@@ -725,39 +658,53 @@ function requiredAdapterCommandString(request, key) {
725
658
  }
726
659
  return value;
727
660
  }
661
+ function eventPublisherForAdapterRequest(request) {
662
+ if (!request.progress) {
663
+ return undefined;
664
+ }
665
+ return createWorkbenchExecutionEventPublisher({
666
+ projectId: request.progress.projectId,
667
+ runId: request.progress.runId,
668
+ jobId: request.progress.jobId,
669
+ executionId: request.progress.executionId,
670
+ attempt: request.progress.attempt,
671
+ target: request.progress.target,
672
+ });
673
+ }
728
674
  async function executeBuiltInAgentTurn(executor, request) {
729
675
  const { defaultWorkbenchAgentTurnExecutor, executeWorkbenchAgentTurn, } = await import("./agent-turn.js");
730
676
  return await executeWorkbenchAgentTurn(executor ?? defaultWorkbenchAgentTurnExecutor, request);
731
677
  }
732
- async function writeAgentCandidateOutput(request, workload, candidate, options = {}) {
733
- if (request.operation !== "candidate.run") {
734
- throw new Error("Agent candidate results can only complete candidate.run operations.");
678
+ async function writeAgentSkillOutput(request, workload, adapter, options = {}) {
679
+ if (request.operation !== "skill.run") {
680
+ throw new Error("Agent skill execution results can only complete skill.run operations.");
735
681
  }
736
- const traceRoot = path.join(request.paths.output, ".workbench", "internal", "agent-candidate");
682
+ const traceRoot = path.join(request.paths.output, ".workbench", "internal", "agent-skill");
737
683
  const agentResult = await executeBuiltInAgentTurn(options.agentExecutor, {
738
684
  role: "runner",
739
- provider: candidate.agent,
685
+ provider: adapter.agent,
740
686
  adapterAuthRoot: options.adapterAuthRoot,
741
687
  adapterAuthRequest: options.adapterAuthRequest,
742
688
  adapterAuthEnv: options.adapterAuthEnv,
743
689
  workspaceRoot: request.paths.workspace,
744
690
  cwd: request.paths.workspace,
745
- prompt: buildAgentCandidatePrompt(workload, candidate),
691
+ prompt: buildAgentSkillPrompt(workload, adapter),
746
692
  traceRoot,
747
693
  jobId: workload.job.id,
694
+ eventPublisher: options.eventPublisher,
748
695
  });
749
- const outputPath = path.join(request.paths.output, "candidate-summary.md");
696
+ const outputPath = path.join(request.paths.output, "skill-summary.md");
750
697
  await fs.mkdir(path.dirname(outputPath), { recursive: true });
751
698
  await fs.writeFile(outputPath, agentResult.output);
752
699
  const trace = {
753
- path: `.workbench/traces/${workload.job.id}/candidate.json`,
700
+ path: `.workbench/traces/${workload.job.id}/skill.json`,
754
701
  kind: "text",
755
702
  encoding: "utf8",
756
703
  executable: false,
757
704
  content: `${JSON.stringify({
758
- kind: "agent_candidate",
759
- provider: candidate.agent.use,
760
- candidateId: workload.candidateId,
705
+ kind: "agent_skill",
706
+ provider: adapter.agent.use,
707
+ versionId: workload.versionId,
761
708
  attemptIndex: workload.attemptIndex,
762
709
  sampleIndex: workload.sampleIndex,
763
710
  summary: agentResult.output,
@@ -769,37 +716,37 @@ async function writeAgentCandidateOutput(request, workload, candidate, options =
769
716
  const usage = runtime.assignUsageRole("runner", agentResult.usage);
770
717
  await writeWorkbenchAdapterOperationResult(request.paths.output, {
771
718
  protocol: "workbench.adapter-result.v1",
772
- operation: "candidate.run",
719
+ operation: "skill.run",
773
720
  ok: true,
774
721
  ...(agentResult.output ? { summary: agentResult.output } : {}),
775
722
  feedback: {
776
- candidate: "agent",
777
- agent: candidate.agent.use,
723
+ skill: "agent",
724
+ agent: adapter.agent.use,
778
725
  metadata: agentResult.metadata,
779
726
  },
780
727
  ...(usage ? { usage } : {}),
781
728
  });
782
729
  }
783
- function buildAgentCandidatePrompt(workload, candidate) {
730
+ function buildAgentSkillPrompt(workload, adapter) {
784
731
  return [
785
- ...(candidate.instructions ? ["Instructions:", candidate.instructions, ""] : []),
732
+ ...(adapter.instructions ? ["Instructions:", adapter.instructions, ""] : []),
786
733
  "Context:",
787
- "- Candidate source files are mounted at /workspace/input/candidate.",
788
- "- Follow any candidate guidance, skill files, scripts, or configuration under /workspace/input/candidate.",
734
+ "- The entry skill is mounted at /workspace/input/skills/primary unless another skill is selected.",
735
+ "- All skills installed for this run are mounted under /workspace/input/skills.",
789
736
  "- The mutable working directory is /workspace.",
790
- "- If the candidate declares prepare.command, it has already run and may have copied files into /workspace.",
737
+ "- If the skill declares prepare.command, it has already run and may have copied files into /workspace.",
791
738
  ...(workload.case?.prompt ? ["Case:", workload.case.prompt, ""] : []),
792
739
  "- Public case files are mounted at /workspace/input/case.",
793
740
  "- Verifier tests are not present while you run.",
794
- "- Mutate the current working directory to complete the task.",
741
+ "- Mutate the current working directory to complete the case.",
795
742
  "- You may write inspection artifacts under /workspace/output.",
796
743
  ].join("\n");
797
744
  }
798
- async function writeAgentCandidateRevisionOutput(request, workload, improver, options) {
799
- if (request.operation !== "candidate.improve") {
800
- throw new Error("Agent improve results can only complete candidate.improve operations.");
745
+ async function writeAgentSkillRevisionOutput(request, workload, improver, options) {
746
+ if (request.operation !== "skill.improve") {
747
+ throw new Error("Agent skill improvement results can only complete skill.improve operations.");
801
748
  }
802
- const before = await snapshotEditableCandidateWorkspace(request);
749
+ const before = await snapshotEditableSkillWorkspace(request);
803
750
  const traceRoot = path.join(request.paths.output, ".workbench", "internal", "agent-improver");
804
751
  try {
805
752
  const agentResult = await executeBuiltInAgentTurn(options.agentExecutor, {
@@ -809,19 +756,20 @@ async function writeAgentCandidateRevisionOutput(request, workload, improver, op
809
756
  adapterAuthRequest: options.adapterAuthRequest,
810
757
  adapterAuthEnv: options.adapterAuthEnv,
811
758
  workspaceRoot: request.paths.workspace,
812
- cwd: request.paths.workspace,
759
+ cwd: requiredRequestPath(request.paths.skill, "paths.skill"),
813
760
  prompt: buildAgentImproverPrompt(workload),
814
761
  traceRoot,
815
762
  jobId: workload.job.id,
763
+ eventPublisher: options.eventPublisher,
816
764
  });
817
- const candidatePatch = await createCandidatePatchFromWorkspace({
765
+ const skillPatch = await createSkillPatchFromWorkspace({
818
766
  beforeRoot: before.root,
819
- afterRoot: request.paths.workspace,
767
+ afterRoot: requiredRequestPath(request.paths.skill, "paths.skill"),
820
768
  edits: workload.improve.edits,
821
769
  });
822
- const changedCandidatePaths = candidatePatch.fileChanges.filter((filePath) => isCandidateEditPath(filePath, workload.improve.edits));
823
- if (changedCandidatePaths.length === 0) {
824
- throw new Error("Agent improve adapter completed without changing a candidate file covered by improve edits.");
770
+ const changedSkillPaths = skillPatch.fileChanges.filter((filePath) => isAllowedSkillEditPath(filePath, workload.improve.edits));
771
+ if (changedSkillPaths.length === 0) {
772
+ throw new Error("Agent improve adapter completed without changing a skill file covered by improve edits.");
825
773
  }
826
774
  const trace = {
827
775
  path: `.workbench/traces/${workload.job.id}/improver.json`,
@@ -831,9 +779,9 @@ async function writeAgentCandidateRevisionOutput(request, workload, improver, op
831
779
  content: `${JSON.stringify({
832
780
  kind: "agent_improver",
833
781
  provider: improver.agent.use,
834
- candidateId: workload.candidateId,
782
+ versionId: workload.versionId,
835
783
  attemptIndex: workload.attemptIndex,
836
- changedPaths: changedCandidatePaths,
784
+ changedPaths: changedSkillPaths,
837
785
  summary: agentResult.output,
838
786
  metadata: agentResult.metadata,
839
787
  }, null, 2)}\n`,
@@ -843,16 +791,16 @@ async function writeAgentCandidateRevisionOutput(request, workload, improver, op
843
791
  const usage = runtime.assignUsageRole("improver", agentResult.usage);
844
792
  await writeWorkbenchAdapterOperationResult(request.paths.output, {
845
793
  protocol: "workbench.adapter-result.v1",
846
- operation: "candidate.improve",
794
+ operation: "skill.improve",
847
795
  ok: true,
848
796
  value: {
849
- ...candidatePatch,
850
- fileChanges: changedCandidatePaths,
797
+ ...skillPatch,
798
+ fileChanges: changedSkillPaths,
851
799
  },
852
800
  ...(agentResult.output ? { summary: agentResult.output } : {}),
853
801
  feedback: {
854
802
  improver: improver.agent.use,
855
- changedPaths: changedCandidatePaths,
803
+ changedPaths: changedSkillPaths,
856
804
  metadata: agentResult.metadata,
857
805
  },
858
806
  ...(usage ? { usage } : {}),
@@ -864,12 +812,12 @@ async function writeAgentCandidateRevisionOutput(request, workload, improver, op
864
812
  }
865
813
  function buildAgentImproverPrompt(workload) {
866
814
  return [
867
- "Benchmark:",
868
- workload.benchmark.description || workload.benchmark.name,
815
+ "Eval:",
816
+ workload.eval.description || workload.eval.name,
869
817
  "",
870
- "Improve the candidate for this benchmark.",
818
+ "Improve the skill for this eval.",
871
819
  "",
872
- "Candidate files are in the current directory.",
820
+ "Skill files are in the current directory.",
873
821
  "Prior adapter executions are in /workspace/input/traces.",
874
822
  "",
875
823
  "Editable paths:",
@@ -893,6 +841,7 @@ async function writeRubricJudgeResult(request, workload, engine, options = {}) {
893
841
  adapterAuthRequest: options.adapterAuthRequest,
894
842
  adapterAuthEnv: options.adapterAuthEnv,
895
843
  runtime,
844
+ eventPublisher: options.eventPublisher,
896
845
  }));
897
846
  const usage = runtime.mergeUsageSummaries(criterionRuns.map((run) => run.usage));
898
847
  const result = rubricJudgeResultFromCriteria({
@@ -935,7 +884,7 @@ async function writeRubricEvidenceFiles(args) {
935
884
  schema: "workbench.engine.rubric.evidence.v1",
936
885
  safeForImprover: true,
937
886
  jobId: args.workload.job.id,
938
- candidateId: args.workload.candidateId,
887
+ versionId: args.workload.versionId,
939
888
  attemptIndex: args.workload.attemptIndex,
940
889
  sampleIndex: args.workload.sampleIndex,
941
890
  caseId: args.workload.caseId,
@@ -1011,6 +960,7 @@ async function runRubricCriterionJudge(args) {
1011
960
  traceRoot: path.join(traceRoot, "judge"),
1012
961
  tracePath,
1013
962
  jobId: args.workload.job.id,
963
+ eventPublisher: args.eventPublisher,
1014
964
  });
1015
965
  let usage = args.runtime.assignUsageRole("engine", agentResult.usage);
1016
966
  try {
@@ -1040,6 +990,7 @@ async function runRubricCriterionJudge(args) {
1040
990
  traceRoot: path.join(traceRoot, "repair"),
1041
991
  tracePath: repairTracePath,
1042
992
  jobId: args.workload.job.id,
993
+ eventPublisher: args.eventPublisher,
1043
994
  });
1044
995
  usage = args.runtime.mergeUsageSummaries([
1045
996
  usage,
@@ -1076,7 +1027,7 @@ function rubricCriterionTracePath(jobId, criterionId, turn) {
1076
1027
  return `.workbench/traces/${jobId}/engine/rubric/criteria/${safeInternalPathSegment(criterionId)}/${turn}`;
1077
1028
  }
1078
1029
  function buildRubricCriterionJudgePrompt(workload, engine, criterion) {
1079
- requireWorkloadTask(workload, "Rubric judge");
1030
+ requireWorkloadCase(workload, "Rubric judge");
1080
1031
  return [
1081
1032
  ...(engine.instructions ? ["Instructions:", engine.instructions, ""] : []),
1082
1033
  ...(workload.case?.prompt ? ["Case:", workload.case.prompt, ""] : []),
@@ -1084,10 +1035,10 @@ function buildRubricCriterionJudgePrompt(workload, engine, criterion) {
1084
1035
  JSON.stringify(criterion, null, 2),
1085
1036
  "",
1086
1037
  "Context:",
1087
- "- The candidate already ran in this same working directory.",
1088
- "- Candidate outputs are available in the current working directory.",
1038
+ "- The skill already ran in this same working directory.",
1039
+ "- Skill outputs are available in the current working directory.",
1089
1040
  "- Public case files are mounted at /workspace/input/case.",
1090
- "- Verifier-private files are mounted at /workspace/private/engine when the task provides them.",
1041
+ "- Verifier-private files are mounted at /workspace/private/engine when the case provides them.",
1091
1042
  "- Score only from the current working directory, public case files, verifier-private files, and the criterion above.",
1092
1043
  "",
1093
1044
  "Output:",
@@ -1276,17 +1227,17 @@ async function mapWithConcurrency(inputs, concurrency, mapper) {
1276
1227
  await Promise.all(Array.from({ length: limit }, async () => worker()));
1277
1228
  return results;
1278
1229
  }
1279
- function requireWorkloadTask(workload, label) {
1230
+ function requireWorkloadCase(workload, label) {
1280
1231
  if (!workload.case) {
1281
1232
  throw new Error(`${label} workload is missing case text.`);
1282
1233
  }
1283
1234
  }
1284
- async function createCandidatePatchFromWorkspace(args) {
1235
+ async function createSkillPatchFromWorkspace(args) {
1285
1236
  const before = new Map((await readSurfaceFiles(args.beforeRoot))
1286
1237
  .map((file) => [normalizeRelativePath(file.path), file]));
1287
1238
  const changedFiles = (await readSurfaceFiles(args.afterRoot))
1288
1239
  .map((file) => ({ ...file, path: normalizeRelativePath(file.path) }))
1289
- .filter((file) => isCandidateEditPath(file.path, args.edits) &&
1240
+ .filter((file) => isAllowedSkillEditPath(file.path, args.edits) &&
1290
1241
  !isRuntimeWorkspacePath(file.path) &&
1291
1242
  !sameSurfaceFile(before.get(file.path), file))
1292
1243
  .sort((left, right) => left.path.localeCompare(right.path));
@@ -1313,7 +1264,7 @@ function isRuntimeWorkspacePath(filePath) {
1313
1264
  normalized === "private" ||
1314
1265
  normalized.startsWith("private/");
1315
1266
  }
1316
- function isCandidateEditPath(filePath, edits) {
1267
+ function isAllowedSkillEditPath(filePath, edits) {
1317
1268
  const normalized = normalizeRelativePath(filePath);
1318
1269
  return edits.some((entry) => {
1319
1270
  const editPath = normalizeRelativePath(entry).replace(/\/+$/u, "");
@@ -1,9 +1,2 @@
1
- import type { WorkbenchAdapterManifest } from "@workbench-ai/workbench-protocol";
2
- export type WorkbenchPublicBuiltInAdapterId = "workbench" | "codex" | "claude" | "command";
3
- export type WorkbenchEngineHelperAdapterId = "rubric" | "tests";
4
- export type WorkbenchBuiltInAdapterId = WorkbenchPublicBuiltInAdapterId | WorkbenchEngineHelperAdapterId;
5
- export declare function builtinWorkbenchAdapterManifest(id: string): WorkbenchAdapterManifest | null;
6
- export declare function builtinWorkbenchAdapterManifests(): WorkbenchAdapterManifest[];
7
- export declare function isWorkbenchBuiltInAdapterId(id: string): id is WorkbenchBuiltInAdapterId;
8
- export declare function adapterCommandName(adapterId: string): string;
1
+ export { adapterCommandName, builtinWorkbenchAdapterManifest, builtinWorkbenchAdapterManifests, isWorkbenchBuiltInAdapterId, type WorkbenchBuiltInAdapterId, type WorkbenchEngineHelperAdapterId, type WorkbenchPublicBuiltInAdapterId, } from "@workbench-ai/workbench-protocol";
9
2
  //# sourceMappingURL=manifests.d.ts.map
@@ -1 +1 @@
1
- {"version":3,"file":"manifests.d.ts","sourceRoot":"","sources":["../src/manifests.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,wBAAwB,EACzB,MAAM,kCAAkC,CAAC;AAW1C,MAAM,MAAM,+BAA+B,GACvC,WAAW,GACX,OAAO,GACP,QAAQ,GACR,SAAS,CAAC;AAEd,MAAM,MAAM,8BAA8B,GACtC,QAAQ,GACR,OAAO,CAAC;AAEZ,MAAM,MAAM,yBAAyB,GACjC,+BAA+B,GAC/B,8BAA8B,CAAC;AA+EnC,wBAAgB,+BAA+B,CAAC,EAAE,EAAE,MAAM,GAAG,wBAAwB,GAAG,IAAI,CAI3F;AAED,wBAAgB,gCAAgC,IAAI,wBAAwB,EAAE,CAI7E;AAED,wBAAgB,2BAA2B,CAAC,EAAE,EAAE,MAAM,GAAG,EAAE,IAAI,yBAAyB,CAEvF;AAED,wBAAgB,kBAAkB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAE5D"}
1
+ {"version":3,"file":"manifests.d.ts","sourceRoot":"","sources":["../src/manifests.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,kBAAkB,EAClB,+BAA+B,EAC/B,gCAAgC,EAChC,2BAA2B,EAC3B,KAAK,yBAAyB,EAC9B,KAAK,8BAA8B,EACnC,KAAK,+BAA+B,GACrC,MAAM,kCAAkC,CAAC"}
package/dist/manifests.js CHANGED
@@ -1,100 +1 @@
1
- import { adapterSlot, defineAdapter, defineEngineResolver, defineCandidate, defineImprover, defineEngineRunner, workbenchAdapterManifestFromDefinition, } from "@workbench-ai/workbench-protocol";
2
- const BUILT_IN_ADAPTER_MANIFESTS = Object.fromEntries(Object.entries({
3
- workbench: defineAdapter({
4
- id: "workbench",
5
- engineResolve: defineEngineResolver(),
6
- engineRun: defineEngineRunner({ executor: "host" }),
7
- slots: {
8
- score: adapterSlot("/score", "engine.run"),
9
- },
10
- }),
11
- codex: defineAdapter({
12
- id: "codex",
13
- candidate: defineCandidate(),
14
- improve: defineImprover(),
15
- setup: [
16
- "npm install --global @openai/codex@0.125.0",
17
- ],
18
- auth: {
19
- methods: {
20
- oauth: { files: [{ path: ".codex/auth.json" }] },
21
- "api-key": { env: [{ name: "OPENAI_API_KEY" }] },
22
- },
23
- },
24
- }),
25
- claude: defineAdapter({
26
- id: "claude",
27
- candidate: defineCandidate(),
28
- improve: defineImprover(),
29
- setup: [
30
- "npm install --global @anthropic-ai/claude-code@2.1.119",
31
- ],
32
- auth: {
33
- methods: {
34
- oauth: {
35
- files: [
36
- { path: ".claude.json" },
37
- { path: ".claude/oauth-token", required: false },
38
- { path: ".claude/.credentials.json", required: false },
39
- ],
40
- },
41
- "api-key": { env: [{ name: "ANTHROPIC_API_KEY" }] },
42
- bedrock: {
43
- env: [
44
- { name: "CLAUDE_CODE_USE_BEDROCK" },
45
- { name: "AWS_ACCESS_KEY_ID", required: false },
46
- { name: "AWS_SECRET_ACCESS_KEY", required: false },
47
- { name: "AWS_SESSION_TOKEN", required: false },
48
- { name: "AWS_REGION" },
49
- { name: "AWS_DEFAULT_REGION", required: false },
50
- { name: "AWS_BEARER_TOKEN_BEDROCK", required: false },
51
- { name: "ANTHROPIC_MODEL", required: false },
52
- { name: "ANTHROPIC_SMALL_FAST_MODEL", required: false },
53
- ],
54
- },
55
- },
56
- },
57
- }),
58
- command: defineAdapter({
59
- id: "command",
60
- candidate: defineCandidate(),
61
- engineRun: defineEngineRunner(),
62
- improve: defineImprover(),
63
- }),
64
- rubric: defineAdapter({
65
- id: "rubric",
66
- engineRun: defineEngineRunner(),
67
- slots: {
68
- judge: adapterSlot("/judge", "candidate.run"),
69
- },
70
- }),
71
- tests: defineAdapter({
72
- id: "tests",
73
- engineRun: defineEngineRunner(),
74
- }),
75
- }).map(([id, definition]) => [id, workbenchAdapterManifestFromDefinition(definition)]));
76
- export function builtinWorkbenchAdapterManifest(id) {
77
- return isWorkbenchBuiltInAdapterId(id)
78
- ? cloneManifest(BUILT_IN_ADAPTER_MANIFESTS[id])
79
- : null;
80
- }
81
- export function builtinWorkbenchAdapterManifests() {
82
- return Object.keys(BUILT_IN_ADAPTER_MANIFESTS)
83
- .sort()
84
- .map((id) => cloneManifest(BUILT_IN_ADAPTER_MANIFESTS[id]));
85
- }
86
- export function isWorkbenchBuiltInAdapterId(id) {
87
- return Object.prototype.hasOwnProperty.call(BUILT_IN_ADAPTER_MANIFESTS, id);
88
- }
89
- export function adapterCommandName(adapterId) {
90
- return `workbench-adapter-${adapterId}`;
91
- }
92
- function cloneManifest(manifest) {
93
- return {
94
- ...manifest,
95
- operations: JSON.parse(JSON.stringify(manifest.operations)),
96
- setup: [...manifest.setup],
97
- ...(manifest.auth ? { auth: JSON.parse(JSON.stringify(manifest.auth)) } : {}),
98
- ...(manifest.slots ? { slots: JSON.parse(JSON.stringify(manifest.slots)) } : {}),
99
- };
100
- }
1
+ export { adapterCommandName, builtinWorkbenchAdapterManifest, builtinWorkbenchAdapterManifests, isWorkbenchBuiltInAdapterId, } from "@workbench-ai/workbench-protocol";
package/dist/runtime.js CHANGED
@@ -12,11 +12,11 @@ export async function importWorkbenchRuntime() {
12
12
  return await runtimeModule;
13
13
  }
14
14
  async function importWorkbenchRuntimeUncached() {
15
- const candidates = runtimeImportCandidates();
15
+ const specifiers = runtimeImportOptions();
16
16
  let lastError;
17
- for (const candidate of candidates) {
17
+ for (const specifier of specifiers) {
18
18
  try {
19
- return await import(__rewriteRelativeImportExtension(candidate));
19
+ return await import(__rewriteRelativeImportExtension(specifier));
20
20
  }
21
21
  catch (error) {
22
22
  lastError = error;
@@ -24,11 +24,11 @@ async function importWorkbenchRuntimeUncached() {
24
24
  }
25
25
  throw new Error(`Unable to load @workbench-ai/workbench-core for built-in adapters: ${lastError instanceof Error ? lastError.message : String(lastError)}`);
26
26
  }
27
- function runtimeImportCandidates() {
27
+ function runtimeImportOptions() {
28
28
  return [
29
29
  process.env.WORKBENCH_RUNTIME_IMPORT,
30
30
  "/app/products/workbench/packages/core/src/index.ts",
31
31
  new URL("../../core/src/index.ts", import.meta.url).href,
32
32
  "@workbench-ai/workbench-core",
33
- ].filter((candidate) => typeof candidate === "string" && candidate.length > 0);
33
+ ].filter((specifier) => typeof specifier === "string" && specifier.length > 0);
34
34
  }
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@workbench-ai/workbench-built-in-adapters",
3
- "version": "0.0.67",
3
+ "version": "0.0.69",
4
4
  "repository": {
5
5
  "type": "git",
6
6
  "url": "git+https://github.com/workbench-ai/workbench.git",
@@ -32,12 +32,12 @@
32
32
  ],
33
33
  "dependencies": {
34
34
  "yaml": "^2.8.2",
35
- "@workbench-ai/agent-driver-anthropic-claude-code": "0.0.46",
36
- "@workbench-ai/workbench-contract": "0.0.67",
37
- "@workbench-ai/agent-driver": "0.0.46",
38
35
  "@workbench-ai/agent-driver-openai-codex": "0.0.46",
39
- "@workbench-ai/workbench-core": "0.0.67",
40
- "@workbench-ai/workbench-protocol": "0.0.67"
36
+ "@workbench-ai/agent-driver": "0.0.46",
37
+ "@workbench-ai/workbench-contract": "0.0.69",
38
+ "@workbench-ai/workbench-protocol": "0.0.69",
39
+ "@workbench-ai/agent-driver-anthropic-claude-code": "0.0.46",
40
+ "@workbench-ai/workbench-core": "0.0.69"
41
41
  },
42
42
  "devDependencies": {
43
43
  "@types/node": "^24.3.1",