@workbench-ai/workbench-built-in-adapters 0.0.46 → 0.0.48

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -16,6 +16,7 @@ export interface WorkbenchAgentTurnRequest {
16
16
  cwd: string;
17
17
  prompt: string;
18
18
  traceRoot: string;
19
+ tracePath?: string;
19
20
  jobId: string;
20
21
  eventPublisher?: WorkbenchExecutionEventPublisher;
21
22
  }
@@ -1 +1 @@
1
- {"version":3,"file":"agent-turn.d.ts","sourceRoot":"","sources":["../src/agent-turn.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAEV,mBAAmB,EACnB,YAAY,EACb,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAQL,KAAK,SAAS,EAEf,MAAM,4BAA4B,CAAC;AACpC,OAAO,KAAK,EACV,gCAAgC,EACjC,MAAM,8BAA8B,CAAC;AAetC,MAAM,WAAW,iBAAiB;IAChC,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,yBAAyB;IACxC,IAAI,EAAE,WAAW,GAAG,QAAQ,GAAG,QAAQ,CAAC;IACxC,QAAQ,EAAE,iBAAiB,CAAC;IAC5B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,SAAS,CAAC;IAC/B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACxC,aAAa,EAAE,MAAM,CAAC;IACtB,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,cAAc,CAAC,EAAE,gCAAgC,CAAC;CACnD;AAED,MAAM,WAAW,wBAAwB;IACvC,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,mBAAmB,EAAE,CAAC;IAClC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IACpC,KAAK,CAAC,EAAE,YAAY,CAAC;CACtB;AAED,MAAM,MAAM,0BAA0B,GAAG,CAAC,OAAO,EAAE,yBAAyB,KAAK,OAAO,CAAC,wBAAwB,CAAC,CAAC;AAoCnH,wBAAsB,yBAAyB,CAC7C,QAAQ,EAAE,CAAC,OAAO,EAAE,yBAAyB,KAAK,OAAO,CAAC,wBAAwB,CAAC,EACnF,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,wBAAwB,CAAC,CAenC;AAED,wBAAsB,iCAAiC,CACrD,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,wBAAwB,CAAC,CA2FnC"}
1
+ {"version":3,"file":"agent-turn.d.ts","sourceRoot":"","sources":["../src/agent-turn.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAEV,mBAAmB,EACnB,YAAY,EACb,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAQL,KAAK,SAAS,EAEf,MAAM,4BAA4B,CAAC;AACpC,OAAO,KAAK,EACV,gCAAgC,EACjC,MAAM,8BAA8B,CAAC;AAetC,MAAM,WAAW,iBAAiB;IAChC,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,yBAAyB;IACxC,IAAI,EAAE,WAAW,GAAG,QAAQ,GAAG,QAAQ,CAAC;IACxC,QAAQ,EAAE,iBAAiB,CAAC;IAC5B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,SAAS,CAAC;IAC/B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACxC,aAAa,EAAE,MAAM,CAAC;IACtB,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,cAAc,CAAC,EAAE,gCAAgC,CAAC;CACnD;AAED,MAAM,WAAW,wBAAwB;IACvC,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,mBAAmB,EAAE,CAAC;IAClC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IACpC,KAAK,CAAC,EAAE,YAAY,CAAC;CACtB;AAED,MAAM,MAAM,0BAA0B,GAAG,CAAC,OAAO,EAAE,yBAAyB,KAAK,OAAO,CAAC,wBAAwB,CAAC,CAAC;AA4BnH,wBAAsB,yBAAyB,CAC7C,QAAQ,EAAE,CAAC,OAAO,EAAE,yBAAyB,KAAK,OAAO,CAAC,wBAAwB,CAAC,EACnF,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,wBAAwB,CAAC,CAenC;AAED,wBAAsB,iCAAiC,CACrD,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,wBAAwB,CAAC,CAwFnC"}
@@ -8,7 +8,7 @@ import { importWorkbenchRuntime } from "./runtime.js";
8
8
  const DEFAULT_AGENT_TURN_MAX_ATTEMPTS = 3;
9
9
  const DEFAULT_AGENT_TURN_RETRY_BASE_MS = 5_000;
10
10
  const DEFAULT_AGENT_TURN_RETRY_MAX_MS = 30_000;
11
- const AGENT_HARNESS_REGISTRY = {
11
+ const AGENT_PROVIDER_REGISTRY = {
12
12
  codex: {
13
13
  executable: "codex",
14
14
  installHint: "@openai/codex",
@@ -32,14 +32,6 @@ const AGENT_HARNESS_REGISTRY = {
32
32
  return module.claudeCodeHarness();
33
33
  },
34
34
  },
35
- pi: {
36
- executable: "pi",
37
- installHint: "@mariozechner/pi-coding-agent",
38
- async load() {
39
- const module = await import("@workbench-ai/agent-driver-badlogic-pi-coding-agent");
40
- return module.piCodingAgentHarness();
41
- },
42
- },
43
35
  };
44
36
  export async function executeWorkbenchAgentTurn(executor, request) {
45
37
  const maxAttempts = workbenchAgentTurnMaxAttempts();
@@ -61,13 +53,13 @@ export async function executeWorkbenchAgentTurn(executor, request) {
61
53
  export async function defaultWorkbenchAgentTurnExecutor(request) {
62
54
  const execFileAsync = promisify(execFile);
63
55
  await ensureAgentExecutableOnPath(request.provider.use, execFileAsync);
64
- const provider = await loadAgentHarnessProvider(request.provider.use);
56
+ const provider = await loadAgentProvider(request.provider.use);
65
57
  const agentHome = resolveRuntimeHome();
66
58
  const stageSessionPath = path.join(request.traceRoot, "session");
67
59
  await fs.mkdir(stageSessionPath, { recursive: true });
68
60
  const restoreEnv = applyAdapterAuthEnv(request.adapterAuthEnv);
69
61
  try {
70
- const plan = await buildAgentHarnessExecutionPlan(provider, request.provider, request.workspaceRoot, agentHome, {
62
+ const plan = await buildAgentExecutionPlan(provider, request.provider, request.workspaceRoot, agentHome, {
71
63
  root: request.adapterAuthRoot,
72
64
  request: request.adapterAuthRequest,
73
65
  });
@@ -120,15 +112,9 @@ export async function defaultWorkbenchAgentTurnExecutor(request) {
120
112
  const usage = runtime.extractExecutionUsageFromTrace(turnResult.trace, request.provider, provider.manifest.id, turnResult.events);
121
113
  const eventCount = Math.max(turnResult.events.length, traceEventCount(turnResult.trace));
122
114
  await writeAgentTraceFile(path.join(stageSessionPath, "trace.json"), turnResult.trace);
123
- await fs.writeFile(path.join(stageSessionPath, "agent-result.json"), `${JSON.stringify({
124
- sessionId: turnResult.sessionId,
125
- finalOutput: turnResult.finalOutput,
126
- eventCount,
127
- ...(usage ? { usage } : {}),
128
- }, null, 2)}\n`);
129
115
  return {
130
116
  output: turnResult.finalOutput,
131
- traceFiles: await runtime.readOutputTraceFiles(request.traceRoot, `.workbench/traces/${request.jobId}/${request.role}`),
117
+ traceFiles: await runtime.readOutputTraceFiles(request.traceRoot, request.tracePath ?? `.workbench/traces/${request.jobId}/${request.role}`),
132
118
  metadata: {
133
119
  providerId: provider.manifest.id,
134
120
  sessionId: turnResult.sessionId,
@@ -186,8 +172,8 @@ function traceEventCount(trace) {
186
172
  : {};
187
173
  return Array.isArray(traceRecord.events) ? traceRecord.events.length : 0;
188
174
  }
189
- async function loadAgentHarnessProvider(providerName) {
190
- return await agentHarnessRegistration(providerName).load();
175
+ async function loadAgentProvider(providerName) {
176
+ return await agentProviderRegistration(providerName).load();
191
177
  }
192
178
  async function ensureAgentExecutableOnPath(providerName, execFileAsync) {
193
179
  const executable = agentExecutableName(providerName);
@@ -202,28 +188,28 @@ async function ensureAgentExecutableOnPath(providerName, execFileAsync) {
202
188
  }
203
189
  }
204
190
  function agentExecutableName(providerName) {
205
- return agentHarnessRegistration(providerName).executable;
191
+ return agentProviderRegistration(providerName).executable;
206
192
  }
207
193
  function agentExecutableInstallHint(providerName) {
208
- return agentHarnessRegistration(providerName).installHint;
194
+ return agentProviderRegistration(providerName).installHint;
209
195
  }
210
- function agentHarnessRegistration(providerName) {
211
- const registration = AGENT_HARNESS_REGISTRY[providerName];
196
+ function agentProviderRegistration(providerName) {
197
+ const registration = AGENT_PROVIDER_REGISTRY[providerName];
212
198
  if (!registration) {
213
199
  throw new Error(`Unsupported first-party agent adapter: ${providerName}`);
214
200
  }
215
201
  return registration;
216
202
  }
217
- async function buildAgentHarnessExecutionPlan(provider, providerSpec, workspaceRoot, agentHome, adapterAuth) {
203
+ async function buildAgentExecutionPlan(provider, providerSpec, workspaceRoot, agentHome, adapterAuth) {
218
204
  const turnTimeoutMs = provider.manifest.defaults.turn_timeout_ms ?? 3_600_000;
219
205
  const harness = {
220
206
  id: provider.manifest.id,
221
- auth: await resolveAgentHarnessAuth(provider, providerSpec, workspaceRoot, agentHome, adapterAuth),
207
+ auth: await resolveAgentAuth(provider, providerSpec, workspaceRoot, agentHome, adapterAuth),
222
208
  ...(firstNonEmpty(providerSpec.model, provider.manifest.defaults.model) ? { model: firstNonEmpty(providerSpec.model, provider.manifest.defaults.model) } : {}),
223
209
  ...(firstNonEmpty(providerSpec.effort, provider.manifest.defaults.effort) ? { effort: firstNonEmpty(providerSpec.effort, provider.manifest.defaults.effort) } : {}),
224
210
  turn_timeout_ms: turnTimeoutMs,
225
211
  stall_timeout_ms: Math.max(provider.manifest.defaults.stall_timeout_ms ?? 0, turnTimeoutMs),
226
- config: resolveAgentHarnessConfig(provider, defaultWorkbenchAgentHarnessConfig(provider, providerSpec.use)),
212
+ config: resolveAgentConfig(provider, defaultWorkbenchAgentConfig(provider, providerSpec.use)),
227
213
  retry: DEFAULT_HARNESS_RETRY,
228
214
  cancel: DEFAULT_HARNESS_CANCEL,
229
215
  };
@@ -235,15 +221,15 @@ async function buildAgentHarnessExecutionPlan(provider, providerSpec, workspaceR
235
221
  harness,
236
222
  };
237
223
  }
238
- function defaultWorkbenchAgentHarnessConfig(provider, providerName) {
224
+ function defaultWorkbenchAgentConfig(provider, providerName) {
239
225
  const fallback = (provider.manifest.defaults.config ?? {});
240
226
  return {
241
227
  ...fallback,
242
- ...(AGENT_HARNESS_REGISTRY[providerName]?.defaultConfig ?? {}),
228
+ ...(AGENT_PROVIDER_REGISTRY[providerName]?.defaultConfig ?? {}),
243
229
  };
244
230
  }
245
- async function resolveAgentHarnessAuth(provider, providerSpec, workspaceRoot, agentHome, adapterAuth) {
246
- const subject = adapterAuthHarnessSubject(adapterAuth.request, providerSpec.use) ??
231
+ async function resolveAgentAuth(provider, providerSpec, workspaceRoot, agentHome, adapterAuth) {
232
+ const subject = adapterAuthProviderSubject(adapterAuth.request, providerSpec.use) ??
247
233
  (provider.manifest.defaults.auth ?? {});
248
234
  const parsed = provider.schemas.auth.safeParse(subject);
249
235
  if (!parsed.success) {
@@ -253,7 +239,7 @@ async function resolveAgentHarnessAuth(provider, providerSpec, workspaceRoot, ag
253
239
  void agentHome;
254
240
  return { ...parsed.data };
255
241
  }
256
- function adapterAuthHarnessSubject(auth, providerName) {
242
+ function adapterAuthProviderSubject(auth, providerName) {
257
243
  const record = jsonRecord(auth);
258
244
  const self = jsonRecord(record?.self);
259
245
  const adapters = jsonRecord(record?.adapters);
@@ -285,7 +271,7 @@ function adapterAuthHarnessSubject(auth, providerName) {
285
271
  }
286
272
  return null;
287
273
  }
288
- function resolveAgentHarnessConfig(provider, fallback) {
274
+ function resolveAgentConfig(provider, fallback) {
289
275
  const parsed = provider.schemas.config.safeParse(fallback);
290
276
  if (!parsed.success) {
291
277
  throw new Error(`Agent provider "${provider.manifest.id}" config is invalid: ${formatValidationIssues(parsed.error.issues)}`);
@@ -1 +1 @@
1
- {"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../src/execute.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAEV,IAAI,EAKL,MAAM,kCAAkC,CAAC;AAY1C,OAAO,KAAK,EAEV,0BAA0B,EAG3B,MAAM,iBAAiB,CAAC;AAQzB,MAAM,WAAW,4CAA4C;IAC3D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,0BAA0B,CAAC;IAC3C,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,IAAI,CAAC;IAC1B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC;AA4CD,wBAAsB,qCAAqC,CACzD,IAAI,GAAE,4CAAiD,GACtD,OAAO,CAAC,IAAI,CAAC,CAiEf"}
1
+ {"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../src/execute.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAEV,IAAI,EAKL,MAAM,kCAAkC,CAAC;AAc1C,OAAO,KAAK,EAEV,0BAA0B,EAG3B,MAAM,iBAAiB,CAAC;AAQzB,MAAM,WAAW,4CAA4C;IAC3D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,0BAA0B,CAAC;IAC3C,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,IAAI,CAAC;IAC1B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC;AA4CD,wBAAsB,qCAAqC,CACzD,IAAI,GAAE,4CAAiD,GACtD,OAAO,CAAC,IAAI,CAAC,CAiEf"}
package/dist/execute.js CHANGED
@@ -1,7 +1,7 @@
1
1
  import { spawn } from "node:child_process";
2
2
  import { promises as fs } from "node:fs";
3
3
  import path from "node:path";
4
- import { ensureWorkbenchAdapterOutputDir, readWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, writeWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
4
+ import { ensureWorkbenchAdapterOutputDir, readWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, runWorkbenchRuntimeOperationSequence, writeWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
5
5
  import YAML from "yaml";
6
6
  import { isWorkbenchBuiltInAdapterId, adapterCommandName, } from "./manifests.js";
7
7
  import { importWorkbenchRuntime } from "./runtime.js";
@@ -81,7 +81,7 @@ async function executeWorkbenchEngineRequest(request) {
81
81
  }
82
82
  async function executeWorkbenchEngineResolveRequest(request) {
83
83
  const configuredPath = workbenchEngineTasksPath(request);
84
- const sourcePath = path.resolve(request.paths.cwd ?? request.paths.workspace, configuredPath);
84
+ const sourcePath = path.resolve(request.paths.workspace, configuredPath);
85
85
  const stat = await fs.stat(sourcePath).catch(() => null);
86
86
  if (!stat?.isDirectory()) {
87
87
  throw new Error(`Workbench engine tasks path is not a directory: ${sourcePath}`);
@@ -100,50 +100,42 @@ async function executeWorkbenchEngineResolveRequest(request) {
100
100
  });
101
101
  }
102
102
  async function executeWorkbenchEngineRunRequest(request) {
103
- const enginePrivateFiles = await hideWorkbenchEnginePrivateFiles(request);
104
- const subjectResult = await runSubjectFromWorkbenchEngine(request);
105
- await stageWorkbenchEngineScoringInputs(request, enginePrivateFiles);
106
- const score = workbenchEngineScoreInvocation(request);
107
- await runNestedAdapterOperation({
108
- parent: request,
109
- invocation: score,
103
+ const outcome = workbenchEngineGradingIsolation(request) === "separate"
104
+ ? await runWorkbenchEngineSeparateGrading(request)
105
+ : await runWorkbenchEngineSharedGrading(request);
106
+ if (!outcome.result) {
107
+ throw new Error("Workbench engine scoring completed without an engine result.");
108
+ }
109
+ await writeSurfaceFiles(request.paths.output, outcome.files.map((file) => remapRuntimeControlTraceFile(request, file)));
110
+ const usage = await workbenchEngineOutcomeUsage(outcome);
111
+ await writeWorkbenchAdapterOperationResult(request.paths.output, {
112
+ protocol: "workbench.adapter-result.v1",
110
113
  operation: "engine.run",
111
- command: score.command,
112
- requestName: "score-request.json",
113
- });
114
- const engineResult = await readWorkbenchAdapterOperationResult(request.paths.output, "engine.run");
115
- const usage = mergeNestedEngineUsage(subjectResult.usage, engineResult.usage);
116
- if (usage) {
117
- await writeWorkbenchAdapterOperationResult(request.paths.output, {
118
- ...engineResult,
119
- usage,
120
- });
121
- }
122
- }
123
- async function hideWorkbenchEnginePrivateFiles(request) {
124
- if (!request.paths.enginePrivate) {
125
- return [];
126
- }
127
- const files = await readSurfaceFilesRecursive(request.paths.enginePrivate).catch((error) => {
128
- if (error.code === "ENOENT") {
129
- return [];
130
- }
131
- throw error;
114
+ ok: true,
115
+ value: outcome.result,
116
+ ...(usage ? { usage } : {}),
117
+ ...(outcome.summary !== undefined ? { summary: outcome.summary } : {}),
118
+ ...(outcome.feedback !== undefined ? { feedback: outcome.feedback } : {}),
132
119
  });
133
- await fs.rm(request.paths.enginePrivate, { recursive: true, force: true }).catch(() => undefined);
134
- return files;
135
120
  }
136
- async function stageWorkbenchEngineScoringInputs(request, enginePrivateFiles) {
137
- if (request.paths.enginePrivate) {
138
- await fs.rm(request.paths.enginePrivate, { recursive: true, force: true }).catch(() => undefined);
139
- await fs.mkdir(request.paths.enginePrivate, { recursive: true });
140
- await writeSurfaceFiles(request.paths.enginePrivate, enginePrivateFiles);
141
- }
142
- if (request.paths.logs) {
143
- const verifierLogs = path.join(request.paths.logs, "verifier");
144
- await fs.rm(verifierLogs, { recursive: true, force: true }).catch(() => undefined);
145
- await fs.mkdir(verifierLogs, { recursive: true });
146
- }
121
+ async function workbenchEngineOutcomeUsage(outcome) {
122
+ const runtime = await importWorkbenchRuntime();
123
+ const operationUsage = outcome.usage
124
+ ? undefined
125
+ : runtime.mergeUsageSummaries(outcome.operationResults.map((result) => {
126
+ if (result.operation === "subject.run") {
127
+ return runtime.assignUsageRole("runner", result.usage);
128
+ }
129
+ if (result.operation === "engine.run") {
130
+ return runtime.assignUsageRole("engine", result.usage);
131
+ }
132
+ return result.usage;
133
+ }));
134
+ const runtimeUsage = runtime.mergeUsageSummaries([outcome.usage, operationUsage]);
135
+ const resultUsage = runtimeUsage?.engine
136
+ ? undefined
137
+ : runtime.assignUsageRole("engine", outcome.result?.usage);
138
+ return runtime.mergeUsageSummaries([runtimeUsage, resultUsage]);
147
139
  }
148
140
  function workbenchEngineTasksPath(request) {
149
141
  const config = adapterCommandConfigRecord(request);
@@ -171,124 +163,145 @@ function workbenchEngineScoreInvocation(request) {
171
163
  : adapterCommandName(score.use),
172
164
  };
173
165
  }
174
- async function runSubjectFromWorkbenchEngine(request) {
166
+ function workbenchEngineSubjectInvocation(request) {
175
167
  const subject = request.context?.subject?.run;
176
- if (!subject?.command) {
177
- throw new Error("engine.run request context.subject.run.command is required to invoke the subject.");
178
- }
179
- const subjectOutput = await runNestedAdapterOperation({
180
- parent: request,
181
- invocation: {
182
- use: subject.use,
183
- with: (subject.with ?? {}),
184
- ...(subject.auth !== undefined ? { auth: subject.auth } : {}),
185
- command: subject.command,
186
- },
187
- operation: "subject.run",
168
+ if (!subject?.use || !subject.command) {
169
+ throw new Error("Workbench engine requires context.subject.run.use and context.subject.run.command.");
170
+ }
171
+ return {
172
+ use: subject.use,
173
+ with: (subject.with ?? {}),
174
+ ...(subject.auth !== undefined ? { auth: subject.auth } : {}),
188
175
  command: subject.command,
189
- requestName: "subject-request.json",
190
- outputName: "subject-run",
191
- visibility: "subject",
176
+ };
177
+ }
178
+ function workbenchEngineGradingIsolation(request) {
179
+ const grading = jsonRecord(adapterCommandConfigRecord(request).grading);
180
+ const isolation = grading?.isolation;
181
+ if (isolation === undefined) {
182
+ return "shared";
183
+ }
184
+ if (isolation === "shared" || isolation === "separate") {
185
+ return isolation;
186
+ }
187
+ throw new Error("Workbench engine grading.isolation must be shared or separate.");
188
+ }
189
+ async function runWorkbenchEngineSharedGrading(request) {
190
+ const inputs = await workbenchEngineRuntimeInputs(request);
191
+ const subject = workbenchEngineSubjectInvocation(request);
192
+ const score = workbenchEngineScoreInvocation(request);
193
+ const result = await runWorkbenchRuntimeOperationSequence({
194
+ inputs,
195
+ prepare: true,
196
+ operations: [
197
+ { label: "subject", operation: "subject.run", invocation: subject },
198
+ { label: "score", operation: "engine.run", invocation: score },
199
+ ],
192
200
  });
193
- const result = await readWorkbenchAdapterOperationResult(subjectOutput, "subject.run");
194
- await copySubjectOutputArtifacts(subjectOutput, request.paths.output);
201
+ assertRuntimeControlResultOk(result, "Workbench shared grading");
195
202
  return result;
196
203
  }
197
- function mergeNestedEngineUsage(subject, engine) {
198
- const usage = {};
199
- if (subject?.runner) {
200
- usage.runner = subject.runner;
201
- }
202
- else if (subject?.total) {
203
- usage.runner = subject.total;
204
- }
205
- if (engine?.engine) {
206
- usage.engine = engine.engine;
207
- }
208
- else if (engine?.total) {
209
- usage.engine = engine.total;
210
- }
211
- if (subject?.optimizer) {
212
- usage.optimizer = subject.optimizer;
213
- }
214
- return Object.keys(usage).length > 0 ? usage : undefined;
215
- }
216
- async function runNestedAdapterOperation(args) {
217
- const internalRoot = path.join(args.parent.paths.output, ".workbench", "internal", args.outputName ?? "engine-slot", safeInternalPathSegment(args.parent.id));
218
- const output = args.outputName ? path.join(internalRoot, "output") : args.parent.paths.output;
219
- const result = args.outputName
220
- ? workbenchAdapterOperationResultPath(output)
221
- : args.parent.paths.result;
222
- const requestPath = path.join(internalRoot, args.requestName);
223
- await fs.mkdir(path.dirname(requestPath), { recursive: true });
224
- await fs.mkdir(output, { recursive: true });
225
- const nestedPaths = {
226
- ...args.parent.paths,
227
- output,
228
- result,
229
- };
230
- if (args.visibility === "subject") {
231
- delete nestedPaths.enginePrivate;
232
- }
233
- await fs.writeFile(requestPath, `${JSON.stringify({
234
- ...args.parent,
235
- id: `${args.parent.id}:${args.invocation.use}:${args.operation}`,
236
- operation: args.operation,
237
- invocation: {
238
- use: args.invocation.use,
239
- with: args.invocation.with,
240
- ...(args.invocation.auth !== undefined ? { auth: args.invocation.auth } : {}),
204
+ async function runWorkbenchEngineSeparateGrading(request) {
205
+ const inputs = await workbenchEngineRuntimeInputs(request);
206
+ const subject = workbenchEngineSubjectInvocation(request);
207
+ const score = workbenchEngineScoreInvocation(request);
208
+ const runtime = await importWorkbenchRuntime();
209
+ const runner = await runWorkbenchRuntimeOperationSequence({
210
+ inputs: {
211
+ subject: inputs.subject,
212
+ case: inputs.case,
213
+ traces: inputs.traces,
241
214
  },
242
- ...(args.parent.auth !== undefined
243
- ? { auth: adapterScopedAuth(args.parent.auth, args.invocation.use) }
244
- : {}),
245
- paths: nestedPaths,
246
- }, null, 2)}\n`);
247
- await runAdapterShellCommand(args.command, args.parent.paths.cwd ?? args.parent.paths.workspace, {
248
- WORKBENCH_ADAPTER_REQUEST: requestPath,
249
- WORKBENCH_OUTPUT: output,
250
- WORKBENCH_RESULT: result,
215
+ prepare: true,
216
+ collectWorkspace: true,
217
+ operations: [
218
+ { label: "subject", operation: "subject.run", invocation: subject },
219
+ ],
251
220
  });
252
- return output;
221
+ assertRuntimeControlResultOk(runner, "Workbench separate runner");
222
+ const grader = await runWorkbenchRuntimeOperationSequence({
223
+ inputs: {
224
+ subject: inputs.subject,
225
+ case: inputs.case,
226
+ enginePrivate: inputs.enginePrivate,
227
+ traces: inputs.traces,
228
+ workspace: runner.workspaceFiles ?? [],
229
+ output: runner.files.filter((file) => !runtime.isWorkbenchInternalOutputPath(file.path)),
230
+ },
231
+ prepare: false,
232
+ operations: [
233
+ { label: "score", operation: "engine.run", invocation: score },
234
+ ],
235
+ });
236
+ assertRuntimeControlResultOk(grader, "Workbench separate grader");
237
+ return {
238
+ ...grader,
239
+ files: dedupeSurfaceFiles([...runner.files, ...grader.files]),
240
+ fileChanges: [...new Set([...runner.fileChanges, ...grader.fileChanges])].sort(),
241
+ usage: runtime.mergeUsageSummaries([runner.usage, grader.usage]),
242
+ operationResults: [...runner.operationResults, ...grader.operationResults],
243
+ };
253
244
  }
254
- async function copySubjectOutputArtifacts(source, target) {
255
- await copyDirectoryEntries(source, target, "");
245
+ async function workbenchEngineRuntimeInputs(request) {
246
+ const [subject, caseFiles, enginePrivate, traces] = await Promise.all([
247
+ readOptionalSurfaceFiles(request.paths.subject),
248
+ readOptionalSurfaceFiles(request.paths.case),
249
+ readOptionalSurfaceFiles(request.paths.enginePrivate),
250
+ readOptionalSurfaceFiles(request.paths.traces),
251
+ ]);
252
+ return {
253
+ subject,
254
+ case: caseFiles,
255
+ enginePrivate,
256
+ traces,
257
+ };
256
258
  }
257
- async function copyDirectoryEntries(sourceRoot, targetRoot, relativeDir) {
258
- const sourceDir = path.join(sourceRoot, relativeDir);
259
- const entries = await fs.readdir(sourceDir, { withFileTypes: true }).catch(() => []);
260
- for (const entry of entries) {
261
- const relativePath = path.join(relativeDir, entry.name);
262
- const normalized = normalizeRelativePath(relativePath);
263
- if (normalized === "workbench-result.json" || normalized.startsWith(".workbench/internal/")) {
264
- continue;
265
- }
266
- const sourcePath = path.join(sourceRoot, relativePath);
267
- const targetPath = path.join(targetRoot, relativePath);
268
- if (entry.isDirectory()) {
269
- await copyDirectoryEntries(sourceRoot, targetRoot, relativePath);
270
- continue;
271
- }
272
- if (!entry.isFile()) {
273
- continue;
259
+ async function readOptionalSurfaceFiles(root) {
260
+ if (!root) {
261
+ return [];
262
+ }
263
+ return await readSurfaceFilesRecursive(root).catch((error) => {
264
+ if (error.code === "ENOENT") {
265
+ return [];
274
266
  }
275
- await fs.mkdir(path.dirname(targetPath), { recursive: true });
276
- await fs.copyFile(sourcePath, targetPath);
267
+ throw error;
268
+ });
269
+ }
270
+ function assertRuntimeControlResultOk(result, label) {
271
+ if (result.ok) {
272
+ return;
273
+ }
274
+ throw new Error(`${label} failed${result.error ? `: ${result.error}` : "."}`);
275
+ }
276
+ function dedupeSurfaceFiles(files) {
277
+ const byPath = new Map();
278
+ for (const file of files) {
279
+ const normalized = normalizeRelativePath(file.path);
280
+ byPath.set(normalized, {
281
+ ...file,
282
+ path: normalized,
283
+ });
277
284
  }
285
+ return [...byPath.values()].sort((left, right) => left.path.localeCompare(right.path));
278
286
  }
279
- function adapterScopedAuth(auth, adapterId) {
280
- if (!auth || typeof auth !== "object" || Array.isArray(auth)) {
281
- return auth;
287
+ function remapRuntimeControlTraceFile(request, file) {
288
+ const normalized = normalizeRelativePath(file.path);
289
+ if (!normalized.startsWith(".workbench/traces/")) {
290
+ return { ...file, path: normalized };
282
291
  }
283
- const record = JSON.parse(JSON.stringify(auth));
284
- const adapters = record.adapters;
285
- if (adapters && typeof adapters === "object" && !Array.isArray(adapters)) {
286
- const scoped = adapters[adapterId];
287
- if (scoped !== undefined) {
288
- record.self = scoped;
289
- }
292
+ const segments = normalized.split("/");
293
+ const rest = segments.length >= 6
294
+ ? segments.slice(5)
295
+ : segments.length >= 3
296
+ ? segments.slice(3)
297
+ : [];
298
+ if (rest.length === 0) {
299
+ return { ...file, path: normalized };
290
300
  }
291
- return record;
301
+ return {
302
+ ...file,
303
+ path: `.workbench/traces/${request.jobId ?? request.id}/${rest.join("/")}`,
304
+ };
292
305
  }
293
306
  function safeInternalPathSegment(value) {
294
307
  const safe = value.replace(/[^a-z0-9._-]+/giu, "_").replace(/^_+|_+$/gu, "");
@@ -296,7 +309,7 @@ function safeInternalPathSegment(value) {
296
309
  }
297
310
  async function executeCommandAdapterRequest(request) {
298
311
  const command = requiredAdapterCommandString(request, "command");
299
- await runAdapterShellCommand(command, request.paths.cwd ?? request.paths.workspace);
312
+ await runAdapterShellCommand(command, request.paths.workspace);
300
313
  if (request.operation === "engine.run") {
301
314
  await requireCommandScoreResult(request);
302
315
  return;
@@ -316,9 +329,9 @@ async function executeTestsEngineRequest(request) {
316
329
  throw new Error(`Tests adapter cannot handle ${request.operation}.`);
317
330
  }
318
331
  const testsRoot = requiredRequestPath(request.paths.enginePrivate, "paths.enginePrivate");
319
- const logsRoot = requiredRequestPath(request.paths.logs, "paths.logs");
320
- const verifierLogs = path.join(logsRoot, "verifier");
321
- await fs.mkdir(verifierLogs, { recursive: true });
332
+ const verifierRoot = testsVerifierOutputDir(request.paths.output);
333
+ await fs.rm(verifierRoot, { recursive: true, force: true }).catch(() => undefined);
334
+ await fs.mkdir(verifierRoot, { recursive: true });
322
335
  const script = await firstExistingFile([
323
336
  path.join(testsRoot, "test.sh"),
324
337
  path.join(testsRoot, "run.sh"),
@@ -326,9 +339,11 @@ async function executeTestsEngineRequest(request) {
326
339
  if (!script) {
327
340
  throw new Error(`Tests engine requires ${path.join(testsRoot, "test.sh")}.`);
328
341
  }
329
- await runAdapterShellCommand(`sh ${shellQuote(script)}`, request.paths.cwd ?? request.paths.workspace);
342
+ await runAdapterShellCommand(`sh ${shellQuote(script)}`, request.paths.workspace, {
343
+ WORKBENCH_TESTS_VERIFIER_DIR: verifierRoot,
344
+ });
330
345
  const result = await readTestsResult({
331
- logsRoot,
346
+ verifierRoot,
332
347
  caseId: request.context?.attempt?.caseId ?? "current",
333
348
  });
334
349
  await writeWorkbenchAdapterOperationResult(request.paths.output, {
@@ -371,7 +386,7 @@ async function writeOperationOkUnlessPresent(request) {
371
386
  if (request.operation === "optimizer.improve") {
372
387
  const patch = await createSubjectPatchFromWorkspace({
373
388
  beforeRoot: requiredRequestPath(request.paths.subject, "paths.subject"),
374
- afterRoot: request.paths.cwd ?? request.paths.workspace,
389
+ afterRoot: request.paths.workspace,
375
390
  edits: request.context?.optimizer?.edits ?? [],
376
391
  });
377
392
  await writeWorkbenchAdapterOperationResult(request.paths.output, {
@@ -452,8 +467,8 @@ async function readWorkbenchEngineCase(args) {
452
467
  const publicPrefix = taskDirectoryPrefix(taskRecord.files, "files", args.id);
453
468
  const testsPrefix = taskDirectoryPrefix(taskRecord.tests, "tests", args.id);
454
469
  const solutionPrefix = taskDirectoryPrefix(taskRecord.solution, "solution", args.id);
455
- const subjectVisible = stripTaskDirectory(sourceFiles, publicPrefix);
456
- const enginePrivate = [
470
+ const publicFiles = stripTaskDirectory(sourceFiles, publicPrefix);
471
+ const privateFiles = [
457
472
  ...stripTaskDirectory(sourceFiles, testsPrefix),
458
473
  ...stripTaskDirectory(sourceFiles, solutionPrefix),
459
474
  ].sort((left, right) => left.path.localeCompare(right.path));
@@ -473,8 +488,8 @@ async function readWorkbenchEngineCase(args) {
473
488
  : {}),
474
489
  },
475
490
  files: {
476
- subjectVisible,
477
- enginePrivate,
491
+ public: publicFiles,
492
+ private: privateFiles,
478
493
  source: sourceFiles,
479
494
  },
480
495
  };
@@ -543,11 +558,11 @@ async function fileExists(filePath) {
543
558
  return fs.stat(filePath).then((stat) => stat.isFile(), () => false);
544
559
  }
545
560
  async function readTestsResult(args) {
546
- const rewardJson = await readOptionalJson(path.join(args.logsRoot, "verifier", "reward.json"));
561
+ const rewardJson = await readOptionalJson(path.join(args.verifierRoot, "reward.json"));
547
562
  if (rewardJson) {
548
563
  return normalizeTestsResult(rewardJson, args.caseId);
549
564
  }
550
- const rewardText = await fs.readFile(path.join(args.logsRoot, "verifier", "reward.txt"), "utf8").catch((error) => {
565
+ const rewardText = await fs.readFile(path.join(args.verifierRoot, "reward.txt"), "utf8").catch((error) => {
551
566
  if (error.code === "ENOENT") {
552
567
  return null;
553
568
  }
@@ -560,7 +575,10 @@ async function readTestsResult(args) {
560
575
  }
561
576
  return normalizeTestsResult({ reward: score }, args.caseId);
562
577
  }
563
- throw new Error("Tests engine did not find reward.json or reward.txt under the request logs verifier directory.");
578
+ throw new Error("Tests engine did not find reward.json or reward.txt under its verifier output directory.");
579
+ }
580
+ function testsVerifierOutputDir(outputRoot) {
581
+ return path.join(outputRoot, ".workbench", "internal", "verifier");
564
582
  }
565
583
  async function readOptionalJson(filePath) {
566
584
  const source = await fs.readFile(filePath, "utf8").catch((error) => {
@@ -640,7 +658,7 @@ function workloadFromAdapterOperationRequest(request) {
640
658
  };
641
659
  }
642
660
  function isBuiltInAgentAdapterId(value) {
643
- return value === "codex" || value === "claude" || value === "pi";
661
+ return value === "codex" || value === "claude";
644
662
  }
645
663
  function builtInAgentSpecFromRequest(request) {
646
664
  const config = adapterCommandConfigRecord(request);
@@ -720,7 +738,7 @@ async function writeAgentSubjectOutput(request, workload, subject, options = {})
720
738
  adapterAuthRequest: options.adapterAuthRequest,
721
739
  adapterAuthEnv: options.adapterAuthEnv,
722
740
  workspaceRoot: request.paths.workspace,
723
- cwd: request.paths.cwd ?? request.paths.workspace,
741
+ cwd: request.paths.workspace,
724
742
  prompt: buildAgentSubjectPrompt(workload, subject),
725
743
  traceRoot,
726
744
  jobId: workload.job.id,
@@ -763,8 +781,10 @@ function buildAgentSubjectPrompt(workload, subject) {
763
781
  return [
764
782
  ...(subject.instructions ? ["Instructions:", subject.instructions, ""] : []),
765
783
  "Context:",
766
- "- Subject files are mounted at /workspace/input/subject.",
767
- "- Subject files are also present in the task working directory.",
784
+ "- Subject source files are mounted at /workspace/input/subject.",
785
+ "- Follow any subject guidance, skill files, scripts, or configuration under /workspace/input/subject.",
786
+ "- The mutable working directory is /workspace.",
787
+ "- If the subject declares prepare.command, it has already run and may have copied files into /workspace.",
768
788
  ...(workload.case?.prompt ? ["Case:", workload.case.prompt, ""] : []),
769
789
  "- Public case files are mounted at /workspace/input/case.",
770
790
  "- Verifier tests are not present while you run.",
@@ -784,14 +804,14 @@ async function writeAgentSubjectRevisionOutput(request, workload, optimizer, opt
784
804
  adapterAuthRequest: options.adapterAuthRequest,
785
805
  adapterAuthEnv: options.adapterAuthEnv,
786
806
  workspaceRoot: request.paths.workspace,
787
- cwd: request.paths.cwd ?? request.paths.workspace,
807
+ cwd: request.paths.workspace,
788
808
  prompt: buildAgentOptimizerPrompt(workload),
789
809
  traceRoot,
790
810
  jobId: workload.job.id,
791
811
  });
792
812
  const subjectPatch = await createSubjectPatchFromWorkspace({
793
813
  beforeRoot: requiredRequestPath(request.paths.subject, "paths.subject"),
794
- afterRoot: request.paths.cwd ?? request.paths.workspace,
814
+ afterRoot: request.paths.workspace,
795
815
  edits: workload.optimizer.edits,
796
816
  });
797
817
  const changedSubjectPaths = subjectPatch.fileChanges.filter((filePath) => isSubjectEditPath(filePath, workload.optimizer.edits));
@@ -839,8 +859,10 @@ function buildAgentOptimizerPrompt(workload) {
839
859
  workload.benchmark.description || workload.benchmark.name,
840
860
  "",
841
861
  "Context:",
842
- "- Subject files are mounted at /workspace/input/subject.",
843
- "- Subject files are also present in the current working directory.",
862
+ "- Subject source files are mounted at /workspace/input/subject.",
863
+ "- Follow any subject guidance, skill files, scripts, or configuration under /workspace/input/subject.",
864
+ "- The mutable working directory is /workspace.",
865
+ "- If the subject declares prepare.command, it has already run and may have copied files into /workspace.",
844
866
  "- Prior run traces are mounted at /workspace/input/traces.",
845
867
  "- Use /workspace/input/traces as the source of truth for what happened in prior attempts.",
846
868
  "- Do not mutate /workspace/input.",
@@ -849,7 +871,7 @@ function buildAgentOptimizerPrompt(workload) {
849
871
  workload.optimizer.edits.map((entry) => `- ${entry}`).join("\n"),
850
872
  "",
851
873
  "Output:",
852
- "- Mutate the editable subject files directly in the current working directory.",
874
+ "- Create or mutate editable subject files directly in the current working directory.",
853
875
  "- Include at least one changed subject file covered by the optimizer edits list.",
854
876
  ].join("\n");
855
877
  }
@@ -873,6 +895,14 @@ async function writeRubricJudgeResult(request, workload, engine, options = {}) {
873
895
  engine,
874
896
  criterionRuns,
875
897
  });
898
+ await writeRubricEvidenceFiles({
899
+ request,
900
+ workload,
901
+ engine,
902
+ result,
903
+ criterionRuns,
904
+ usage,
905
+ });
876
906
  await writeWorkbenchAdapterOperationResult(request.paths.output, {
877
907
  protocol: "workbench.adapter-result.v1",
878
908
  operation: "engine.run",
@@ -886,7 +916,7 @@ async function writeRubricJudgeResult(request, workload, engine, options = {}) {
886
916
  aggregation: "weighted_mean",
887
917
  criteria: criterionRuns.map((run) => ({
888
918
  id: run.result.criterion_id,
889
- traceRoot: run.traceRoot,
919
+ traceFiles: run.traceFiles.map((file) => file.path),
890
920
  metadata: run.metadata,
891
921
  ...(run.repair ? { repair: run.repair } : {}),
892
922
  })),
@@ -894,8 +924,76 @@ async function writeRubricJudgeResult(request, workload, engine, options = {}) {
894
924
  ...(usage ? { usage } : {}),
895
925
  });
896
926
  }
927
+ async function writeRubricEvidenceFiles(args) {
928
+ const root = `.workbench/traces/${args.workload.job.id}/engine/rubric`;
929
+ const scorecard = {
930
+ schema: "workbench.engine.rubric.evidence.v1",
931
+ safeForOptimizer: true,
932
+ jobId: args.workload.job.id,
933
+ subjectId: args.workload.subjectId,
934
+ attemptIndex: args.workload.attemptIndex,
935
+ sampleIndex: args.workload.sampleIndex,
936
+ caseId: args.workload.caseId,
937
+ judge: args.engine.judge.use,
938
+ parallelism: args.engine.parallelism,
939
+ aggregation: "weighted_mean",
940
+ score: args.result.score,
941
+ metrics: args.result.metrics ?? {},
942
+ summary: args.result.summary ?? null,
943
+ criteria: args.criterionRuns.map((run) => ({
944
+ id: run.result.criterion_id,
945
+ label: run.result.label,
946
+ score: run.result.score,
947
+ pass: run.result.pass,
948
+ rationale: run.result.rationale ?? null,
949
+ errors: run.result.errors ?? [],
950
+ summary: run.summary ?? null,
951
+ metadata: safeRubricEvidenceMetadata(run.metadata),
952
+ repair: run.repair ?? null,
953
+ })),
954
+ ...(args.usage ? { usage: args.usage } : {}),
955
+ };
956
+ await writeSurfaceFiles(args.request.paths.output, [
957
+ jsonSurfaceFile(`${root}/scorecard.json`, scorecard),
958
+ ...args.criterionRuns.map((run) => jsonSurfaceFile(`${root}/criteria/${safeInternalPathSegment(run.result.criterion_id)}/result.json`, {
959
+ schema: "workbench.engine.rubric.criterion-evidence.v1",
960
+ safeForOptimizer: true,
961
+ criterion: args.engine.criteria.find((criterion) => criterion.id === run.result.criterion_id) ?? {
962
+ id: run.result.criterion_id,
963
+ },
964
+ result: run.result,
965
+ summary: run.summary ?? null,
966
+ metadata: safeRubricEvidenceMetadata(run.metadata),
967
+ repair: run.repair ?? null,
968
+ })),
969
+ ...args.criterionRuns.flatMap((run) => run.traceFiles),
970
+ ]);
971
+ }
972
+ function safeRubricEvidenceMetadata(metadata) {
973
+ const record = metadata && typeof metadata === "object" && !Array.isArray(metadata)
974
+ ? metadata
975
+ : {};
976
+ const safe = {};
977
+ for (const key of ["providerId", "sessionId", "eventCount", "model"]) {
978
+ const value = record[key];
979
+ if (typeof value === "string" || typeof value === "number" || typeof value === "boolean" || value === null) {
980
+ safe[key] = value;
981
+ }
982
+ }
983
+ return Object.keys(safe).length > 0 ? safe : null;
984
+ }
985
+ function jsonSurfaceFile(pathname, value) {
986
+ return {
987
+ path: pathname,
988
+ kind: "text",
989
+ encoding: "utf8",
990
+ executable: false,
991
+ content: `${JSON.stringify(value, null, 2)}\n`,
992
+ };
993
+ }
897
994
  async function runRubricCriterionJudge(args) {
898
995
  const traceRoot = path.join(args.request.paths.output, ".workbench", "internal", "rubric", safeInternalPathSegment(args.criterion.id));
996
+ const tracePath = rubricCriterionTracePath(args.workload.job.id, args.criterion.id, "judge");
899
997
  const agentResult = await executeBuiltInAgentTurn(args.agentExecutor, {
900
998
  role: "engine",
901
999
  provider: args.engine.judge,
@@ -903,9 +1001,10 @@ async function runRubricCriterionJudge(args) {
903
1001
  adapterAuthRequest: args.adapterAuthRequest,
904
1002
  adapterAuthEnv: args.adapterAuthEnv,
905
1003
  workspaceRoot: args.request.paths.workspace,
906
- cwd: args.request.paths.cwd ?? args.request.paths.workspace,
1004
+ cwd: args.request.paths.workspace,
907
1005
  prompt: buildRubricCriterionJudgePrompt(args.workload, args.engine, args.criterion),
908
1006
  traceRoot: path.join(traceRoot, "judge"),
1007
+ tracePath,
909
1008
  jobId: args.workload.job.id,
910
1009
  });
911
1010
  let usage = args.runtime.assignUsageRole("engine", agentResult.usage);
@@ -913,12 +1012,13 @@ async function runRubricCriterionJudge(args) {
913
1012
  return {
914
1013
  ...normalizeRubricCriterionJudgeResult(agentResult.output, args.criterion),
915
1014
  metadata: agentResult.metadata,
916
- traceRoot,
1015
+ traceFiles: publicRubricAgentTraceFiles(agentResult.traceFiles),
917
1016
  ...(usage ? { usage } : {}),
918
1017
  };
919
1018
  }
920
1019
  catch (error) {
921
1020
  const repairError = error instanceof Error ? error.message : String(error);
1021
+ const repairTracePath = rubricCriterionTracePath(args.workload.job.id, args.criterion.id, "repair");
922
1022
  const repairResult = await executeBuiltInAgentTurn(args.agentExecutor, {
923
1023
  role: "engine",
924
1024
  provider: args.engine.judge,
@@ -926,13 +1026,14 @@ async function runRubricCriterionJudge(args) {
926
1026
  adapterAuthRequest: args.adapterAuthRequest,
927
1027
  adapterAuthEnv: args.adapterAuthEnv,
928
1028
  workspaceRoot: args.request.paths.workspace,
929
- cwd: args.request.paths.cwd ?? args.request.paths.workspace,
1029
+ cwd: args.request.paths.workspace,
930
1030
  prompt: buildRubricCriterionRepairPrompt({
931
1031
  output: agentResult.output,
932
1032
  error: repairError,
933
1033
  criterion: args.criterion,
934
1034
  }),
935
1035
  traceRoot: path.join(traceRoot, "repair"),
1036
+ tracePath: repairTracePath,
936
1037
  jobId: args.workload.job.id,
937
1038
  });
938
1039
  usage = args.runtime.mergeUsageSummaries([
@@ -949,7 +1050,10 @@ async function runRubricCriterionJudge(args) {
949
1050
  originalMetadata: agentResult.metadata,
950
1051
  },
951
1052
  },
952
- traceRoot,
1053
+ traceFiles: publicRubricAgentTraceFiles([
1054
+ ...agentResult.traceFiles,
1055
+ ...repairResult.traceFiles,
1056
+ ]),
953
1057
  repair: {
954
1058
  attempted: true,
955
1059
  originalError: repairError,
@@ -958,6 +1062,14 @@ async function runRubricCriterionJudge(args) {
958
1062
  };
959
1063
  }
960
1064
  }
1065
+ function publicRubricAgentTraceFiles(files) {
1066
+ return files
1067
+ .filter((file) => file.encoding === "utf8" && file.path.endsWith("/trace.json"))
1068
+ .map((file) => ({ ...file }));
1069
+ }
1070
+ function rubricCriterionTracePath(jobId, criterionId, turn) {
1071
+ return `.workbench/traces/${jobId}/engine/rubric/criteria/${safeInternalPathSegment(criterionId)}/${turn}`;
1072
+ }
961
1073
  function buildRubricCriterionJudgePrompt(workload, engine, criterion) {
962
1074
  requireWorkloadTask(workload, "Rubric judge");
963
1075
  return [
@@ -981,7 +1093,7 @@ function buildRubricCriterionJudgePrompt(workload, engine, criterion) {
981
1093
  score: 0.0,
982
1094
  pass: false,
983
1095
  rationale: "why this criterion received this score",
984
- summary: "short grading summary",
1096
+ summary: "short scoring summary",
985
1097
  feedback: {},
986
1098
  }, null, 2),
987
1099
  `The only allowed criterion_id is ${criterion.id}.`,
@@ -1009,7 +1121,7 @@ function buildRubricCriterionRepairPrompt(input) {
1009
1121
  score: 0.0,
1010
1122
  pass: false,
1011
1123
  rationale: "why this criterion received this score",
1012
- summary: "short grading summary",
1124
+ summary: "short scoring summary",
1013
1125
  feedback: {},
1014
1126
  }, null, 2),
1015
1127
  "",
@@ -1026,9 +1138,6 @@ function rubricJudgeResultFromCriteria(args) {
1026
1138
  throw new Error("Rubric criterion scores must aggregate to a score in the 0..1 range.");
1027
1139
  }
1028
1140
  const metrics = { score };
1029
- for (const criterion of criteria) {
1030
- metrics[`criterion__${criterion.criterion_id}`] = criterion.score;
1031
- }
1032
1141
  const caseResult = rubricJudgeCaseResult({
1033
1142
  workload: args.workload,
1034
1143
  score,
@@ -1196,8 +1305,6 @@ function isRuntimeWorkspacePath(filePath) {
1196
1305
  normalized.startsWith("input/") ||
1197
1306
  normalized === "output" ||
1198
1307
  normalized.startsWith("output/") ||
1199
- normalized === "logs" ||
1200
- normalized.startsWith("logs/") ||
1201
1308
  normalized === "private" ||
1202
1309
  normalized.startsWith("private/");
1203
1310
  }
@@ -0,0 +1,5 @@
1
+ import { sortLocalTraceRefs, type AgentReadableTraceDigest, type LocalTraceAdapter, type LocalTraceRef } from "@workbench-ai/agent-driver";
2
+ export declare function builtinLocalTraceAdapters(): LocalTraceAdapter[];
3
+ export declare function builtinLocalTraceAdapter(id: string): LocalTraceAdapter | null;
4
+ export { sortLocalTraceRefs, type AgentReadableTraceDigest, type LocalTraceAdapter, type LocalTraceRef, };
5
+ //# sourceMappingURL=local-traces.d.ts.map
@@ -0,0 +1 @@
1
+ {"version":3,"file":"local-traces.d.ts","sourceRoot":"","sources":["../src/local-traces.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,kBAAkB,EAClB,KAAK,wBAAwB,EAC7B,KAAK,iBAAiB,EACtB,KAAK,aAAa,EACnB,MAAM,4BAA4B,CAAC;AASpC,wBAAgB,yBAAyB,IAAI,iBAAiB,EAAE,CAE/D;AAED,wBAAgB,wBAAwB,CAAC,EAAE,EAAE,MAAM,GAAG,iBAAiB,GAAG,IAAI,CAE7E;AAED,OAAO,EACL,kBAAkB,EAClB,KAAK,wBAAwB,EAC7B,KAAK,iBAAiB,EACtB,KAAK,aAAa,GACnB,CAAC"}
@@ -0,0 +1,14 @@
1
+ import { sortLocalTraceRefs, } from "@workbench-ai/agent-driver";
2
+ import { claudeLocalTraceAdapter } from "@workbench-ai/agent-driver-anthropic-claude-code";
3
+ import { codexLocalTraceAdapter } from "@workbench-ai/agent-driver-openai-codex";
4
+ const BUILT_IN_LOCAL_TRACE_ADAPTERS = [
5
+ codexLocalTraceAdapter,
6
+ claudeLocalTraceAdapter,
7
+ ];
8
+ export function builtinLocalTraceAdapters() {
9
+ return [...BUILT_IN_LOCAL_TRACE_ADAPTERS];
10
+ }
11
+ export function builtinLocalTraceAdapter(id) {
12
+ return BUILT_IN_LOCAL_TRACE_ADAPTERS.find((adapter) => adapter.id === id) ?? null;
13
+ }
14
+ export { sortLocalTraceRefs, };
@@ -1,5 +1,5 @@
1
1
  import type { WorkbenchAdapterManifest } from "@workbench-ai/workbench-protocol";
2
- export type WorkbenchPublicBuiltInAdapterId = "workbench" | "codex" | "claude" | "pi" | "command";
2
+ export type WorkbenchPublicBuiltInAdapterId = "workbench" | "codex" | "claude" | "command";
3
3
  export type WorkbenchEngineHelperAdapterId = "rubric" | "tests";
4
4
  export type WorkbenchBuiltInAdapterId = WorkbenchPublicBuiltInAdapterId | WorkbenchEngineHelperAdapterId;
5
5
  export declare function builtinWorkbenchAdapterManifest(id: string): WorkbenchAdapterManifest | null;
@@ -1 +1 @@
1
- {"version":3,"file":"manifests.d.ts","sourceRoot":"","sources":["../src/manifests.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,wBAAwB,EACzB,MAAM,kCAAkC,CAAC;AAW1C,MAAM,MAAM,+BAA+B,GACvC,WAAW,GACX,OAAO,GACP,QAAQ,GACR,IAAI,GACJ,SAAS,CAAC;AAEd,MAAM,MAAM,8BAA8B,GACtC,QAAQ,GACR,OAAO,CAAC;AAEZ,MAAM,MAAM,yBAAyB,GACjC,+BAA+B,GAC/B,8BAA8B,CAAC;AAuFnC,wBAAgB,+BAA+B,CAAC,EAAE,EAAE,MAAM,GAAG,wBAAwB,GAAG,IAAI,CAI3F;AAED,wBAAgB,gCAAgC,IAAI,wBAAwB,EAAE,CAI7E;AAED,wBAAgB,2BAA2B,CAAC,EAAE,EAAE,MAAM,GAAG,EAAE,IAAI,yBAAyB,CAEvF;AAED,wBAAgB,kBAAkB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAE5D"}
1
+ {"version":3,"file":"manifests.d.ts","sourceRoot":"","sources":["../src/manifests.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,wBAAwB,EACzB,MAAM,kCAAkC,CAAC;AAW1C,MAAM,MAAM,+BAA+B,GACvC,WAAW,GACX,OAAO,GACP,QAAQ,GACR,SAAS,CAAC;AAEd,MAAM,MAAM,8BAA8B,GACtC,QAAQ,GACR,OAAO,CAAC;AAEZ,MAAM,MAAM,yBAAyB,GACjC,+BAA+B,GAC/B,8BAA8B,CAAC;AA+EnC,wBAAgB,+BAA+B,CAAC,EAAE,EAAE,MAAM,GAAG,wBAAwB,GAAG,IAAI,CAI3F;AAED,wBAAgB,gCAAgC,IAAI,wBAAwB,EAAE,CAI7E;AAED,wBAAgB,2BAA2B,CAAC,EAAE,EAAE,MAAM,GAAG,EAAE,IAAI,yBAAyB,CAEvF;AAED,wBAAgB,kBAAkB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAE5D"}
package/dist/manifests.js CHANGED
@@ -3,7 +3,7 @@ const BUILT_IN_ADAPTER_MANIFESTS = Object.fromEntries(Object.entries({
3
3
  workbench: defineAdapter({
4
4
  id: "workbench",
5
5
  engineResolve: defineEngineResolver(),
6
- engineRun: defineEngineRunner(),
6
+ engineRun: defineEngineRunner({ executor: "host" }),
7
7
  slots: {
8
8
  score: adapterSlot("/score", "engine.run"),
9
9
  },
@@ -55,14 +55,6 @@ const BUILT_IN_ADAPTER_MANIFESTS = Object.fromEntries(Object.entries({
55
55
  },
56
56
  },
57
57
  }),
58
- pi: defineAdapter({
59
- id: "pi",
60
- subject: defineSubject(),
61
- improve: defineOptimizer(),
62
- setup: [
63
- "npm install --global @mariozechner/pi-coding-agent@0.70.2",
64
- ],
65
- }),
66
58
  command: defineAdapter({
67
59
  id: "command",
68
60
  subject: defineSubject(),
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@workbench-ai/workbench-built-in-adapters",
3
- "version": "0.0.46",
3
+ "version": "0.0.48",
4
4
  "type": "module",
5
5
  "repository": {
6
6
  "type": "git",
@@ -13,6 +13,10 @@
13
13
  ".": {
14
14
  "types": "./dist/index.d.ts",
15
15
  "default": "./dist/index.js"
16
+ },
17
+ "./local-traces": {
18
+ "types": "./dist/local-traces.d.ts",
19
+ "default": "./dist/local-traces.js"
16
20
  }
17
21
  },
18
22
  "bin": {
@@ -21,21 +25,19 @@
21
25
  "workbench-adapter-tests": "dist/bin/tests.js",
22
26
  "workbench-adapter-rubric": "dist/bin/rubric.js",
23
27
  "workbench-adapter-codex": "dist/bin/codex.js",
24
- "workbench-adapter-claude": "dist/bin/claude.js",
25
- "workbench-adapter-pi": "dist/bin/pi.js"
28
+ "workbench-adapter-claude": "dist/bin/claude.js"
26
29
  },
27
30
  "files": [
28
31
  "dist"
29
32
  ],
30
33
  "dependencies": {
31
34
  "yaml": "^2.8.2",
32
- "@workbench-ai/agent-driver-anthropic-claude-code": "0.0.44",
33
- "@workbench-ai/agent-driver-badlogic-pi-coding-agent": "0.0.44",
34
- "@workbench-ai/agent-driver-openai-codex": "0.0.44",
35
- "@workbench-ai/workbench-contract": "0.0.46",
36
- "@workbench-ai/agent-driver": "0.0.44",
37
- "@workbench-ai/workbench-protocol": "0.0.46",
38
- "@workbench-ai/workbench-core": "0.0.46"
35
+ "@workbench-ai/agent-driver-anthropic-claude-code": "0.0.45",
36
+ "@workbench-ai/agent-driver-openai-codex": "0.0.45",
37
+ "@workbench-ai/agent-driver": "0.0.45",
38
+ "@workbench-ai/workbench-contract": "0.0.48",
39
+ "@workbench-ai/workbench-core": "0.0.48",
40
+ "@workbench-ai/workbench-protocol": "0.0.48"
39
41
  },
40
42
  "devDependencies": {
41
43
  "@types/node": "^24.3.1",
package/dist/bin/pi.d.ts DELETED
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env node
2
- export {};
3
- //# sourceMappingURL=pi.d.ts.map
@@ -1 +0,0 @@
1
- {"version":3,"file":"pi.d.ts","sourceRoot":"","sources":["../../src/bin/pi.ts"],"names":[],"mappings":""}
package/dist/bin/pi.js DELETED
@@ -1,3 +0,0 @@
1
- #!/usr/bin/env node
2
- import { executeWorkbenchBuiltInAdapterCommand } from "../execute.js";
3
- await executeWorkbenchBuiltInAdapterCommand({ adapterId: "pi" });