@workbench-ai/workbench-built-in-adapters 0.0.46 → 0.0.47
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/agent-turn.d.ts +1 -0
- package/dist/agent-turn.d.ts.map +1 -1
- package/dist/agent-turn.js +19 -33
- package/dist/execute.d.ts.map +1 -1
- package/dist/execute.js +289 -182
- package/dist/local-traces.d.ts +5 -0
- package/dist/local-traces.d.ts.map +1 -0
- package/dist/local-traces.js +14 -0
- package/dist/manifests.d.ts +1 -1
- package/dist/manifests.d.ts.map +1 -1
- package/dist/manifests.js +1 -9
- package/package.json +9 -7
- package/dist/bin/pi.d.ts +0 -3
- package/dist/bin/pi.d.ts.map +0 -1
- package/dist/bin/pi.js +0 -3
package/dist/agent-turn.d.ts
CHANGED
package/dist/agent-turn.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"agent-turn.d.ts","sourceRoot":"","sources":["../src/agent-turn.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAEV,mBAAmB,EACnB,YAAY,EACb,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAQL,KAAK,SAAS,EAEf,MAAM,4BAA4B,CAAC;AACpC,OAAO,KAAK,EACV,gCAAgC,EACjC,MAAM,8BAA8B,CAAC;AAetC,MAAM,WAAW,iBAAiB;IAChC,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,yBAAyB;IACxC,IAAI,EAAE,WAAW,GAAG,QAAQ,GAAG,QAAQ,CAAC;IACxC,QAAQ,EAAE,iBAAiB,CAAC;IAC5B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,SAAS,CAAC;IAC/B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACxC,aAAa,EAAE,MAAM,CAAC;IACtB,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,KAAK,EAAE,MAAM,CAAC;IACd,cAAc,CAAC,EAAE,gCAAgC,CAAC;CACnD;AAED,MAAM,WAAW,wBAAwB;IACvC,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,mBAAmB,EAAE,CAAC;IAClC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IACpC,KAAK,CAAC,EAAE,YAAY,CAAC;CACtB;AAED,MAAM,MAAM,0BAA0B,GAAG,CAAC,OAAO,EAAE,yBAAyB,KAAK,OAAO,CAAC,wBAAwB,CAAC,CAAC;
|
|
1
|
+
{"version":3,"file":"agent-turn.d.ts","sourceRoot":"","sources":["../src/agent-turn.ts"],"names":[],"mappings":"AAMA,OAAO,KAAK,EAEV,mBAAmB,EACnB,YAAY,EACb,MAAM,kCAAkC,CAAC;AAC1C,OAAO,EAQL,KAAK,SAAS,EAEf,MAAM,4BAA4B,CAAC;AACpC,OAAO,KAAK,EACV,gCAAgC,EACjC,MAAM,8BAA8B,CAAC;AAetC,MAAM,WAAW,iBAAiB;IAChC,GAAG,EAAE,MAAM,CAAC;IACZ,KAAK,CAAC,EAAE,MAAM,CAAC;IACf,MAAM,CAAC,EAAE,MAAM,CAAC;CACjB;AAED,MAAM,WAAW,yBAAyB;IACxC,IAAI,EAAE,WAAW,GAAG,QAAQ,GAAG,QAAQ,CAAC;IACxC,QAAQ,EAAE,iBAAiB,CAAC;IAC5B,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,SAAS,CAAC;IAC/B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;IACxC,aAAa,EAAE,MAAM,CAAC;IACtB,GAAG,EAAE,MAAM,CAAC;IACZ,MAAM,EAAE,MAAM,CAAC;IACf,SAAS,EAAE,MAAM,CAAC;IAClB,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,KAAK,EAAE,MAAM,CAAC;IACd,cAAc,CAAC,EAAE,gCAAgC,CAAC;CACnD;AAED,MAAM,WAAW,wBAAwB;IACvC,MAAM,EAAE,MAAM,CAAC;IACf,UAAU,EAAE,mBAAmB,EAAE,CAAC;IAClC,QAAQ,EAAE,MAAM,CAAC,MAAM,EAAE,SAAS,CAAC,CAAC;IACpC,KAAK,CAAC,EAAE,YAAY,CAAC;CACtB;AAED,MAAM,MAAM,0BAA0B,GAAG,CAAC,OAAO,EAAE,yBAAyB,KAAK,OAAO,CAAC,wBAAwB,CAAC,CAAC;AA4BnH,wBAAsB,yBAAyB,CAC7C,QAAQ,EAAE,CAAC,OAAO,EAAE,yBAAyB,KAAK,OAAO,CAAC,wBAAwB,CAAC,EACnF,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,wBAAwB,CAAC,CAenC;AAED,wBAAsB,iCAAiC,CACrD,OAAO,EAAE,yBAAyB,GACjC,OAAO,CAAC,wBAAwB,CAAC,CAwFnC"}
|
package/dist/agent-turn.js
CHANGED
|
@@ -8,7 +8,7 @@ import { importWorkbenchRuntime } from "./runtime.js";
|
|
|
8
8
|
const DEFAULT_AGENT_TURN_MAX_ATTEMPTS = 3;
|
|
9
9
|
const DEFAULT_AGENT_TURN_RETRY_BASE_MS = 5_000;
|
|
10
10
|
const DEFAULT_AGENT_TURN_RETRY_MAX_MS = 30_000;
|
|
11
|
-
const
|
|
11
|
+
const AGENT_PROVIDER_REGISTRY = {
|
|
12
12
|
codex: {
|
|
13
13
|
executable: "codex",
|
|
14
14
|
installHint: "@openai/codex",
|
|
@@ -32,14 +32,6 @@ const AGENT_HARNESS_REGISTRY = {
|
|
|
32
32
|
return module.claudeCodeHarness();
|
|
33
33
|
},
|
|
34
34
|
},
|
|
35
|
-
pi: {
|
|
36
|
-
executable: "pi",
|
|
37
|
-
installHint: "@mariozechner/pi-coding-agent",
|
|
38
|
-
async load() {
|
|
39
|
-
const module = await import("@workbench-ai/agent-driver-badlogic-pi-coding-agent");
|
|
40
|
-
return module.piCodingAgentHarness();
|
|
41
|
-
},
|
|
42
|
-
},
|
|
43
35
|
};
|
|
44
36
|
export async function executeWorkbenchAgentTurn(executor, request) {
|
|
45
37
|
const maxAttempts = workbenchAgentTurnMaxAttempts();
|
|
@@ -61,13 +53,13 @@ export async function executeWorkbenchAgentTurn(executor, request) {
|
|
|
61
53
|
export async function defaultWorkbenchAgentTurnExecutor(request) {
|
|
62
54
|
const execFileAsync = promisify(execFile);
|
|
63
55
|
await ensureAgentExecutableOnPath(request.provider.use, execFileAsync);
|
|
64
|
-
const provider = await
|
|
56
|
+
const provider = await loadAgentProvider(request.provider.use);
|
|
65
57
|
const agentHome = resolveRuntimeHome();
|
|
66
58
|
const stageSessionPath = path.join(request.traceRoot, "session");
|
|
67
59
|
await fs.mkdir(stageSessionPath, { recursive: true });
|
|
68
60
|
const restoreEnv = applyAdapterAuthEnv(request.adapterAuthEnv);
|
|
69
61
|
try {
|
|
70
|
-
const plan = await
|
|
62
|
+
const plan = await buildAgentExecutionPlan(provider, request.provider, request.workspaceRoot, agentHome, {
|
|
71
63
|
root: request.adapterAuthRoot,
|
|
72
64
|
request: request.adapterAuthRequest,
|
|
73
65
|
});
|
|
@@ -120,15 +112,9 @@ export async function defaultWorkbenchAgentTurnExecutor(request) {
|
|
|
120
112
|
const usage = runtime.extractExecutionUsageFromTrace(turnResult.trace, request.provider, provider.manifest.id, turnResult.events);
|
|
121
113
|
const eventCount = Math.max(turnResult.events.length, traceEventCount(turnResult.trace));
|
|
122
114
|
await writeAgentTraceFile(path.join(stageSessionPath, "trace.json"), turnResult.trace);
|
|
123
|
-
await fs.writeFile(path.join(stageSessionPath, "agent-result.json"), `${JSON.stringify({
|
|
124
|
-
sessionId: turnResult.sessionId,
|
|
125
|
-
finalOutput: turnResult.finalOutput,
|
|
126
|
-
eventCount,
|
|
127
|
-
...(usage ? { usage } : {}),
|
|
128
|
-
}, null, 2)}\n`);
|
|
129
115
|
return {
|
|
130
116
|
output: turnResult.finalOutput,
|
|
131
|
-
traceFiles: await runtime.readOutputTraceFiles(request.traceRoot, `.workbench/traces/${request.jobId}/${request.role}`),
|
|
117
|
+
traceFiles: await runtime.readOutputTraceFiles(request.traceRoot, request.tracePath ?? `.workbench/traces/${request.jobId}/${request.role}`),
|
|
132
118
|
metadata: {
|
|
133
119
|
providerId: provider.manifest.id,
|
|
134
120
|
sessionId: turnResult.sessionId,
|
|
@@ -186,8 +172,8 @@ function traceEventCount(trace) {
|
|
|
186
172
|
: {};
|
|
187
173
|
return Array.isArray(traceRecord.events) ? traceRecord.events.length : 0;
|
|
188
174
|
}
|
|
189
|
-
async function
|
|
190
|
-
return await
|
|
175
|
+
async function loadAgentProvider(providerName) {
|
|
176
|
+
return await agentProviderRegistration(providerName).load();
|
|
191
177
|
}
|
|
192
178
|
async function ensureAgentExecutableOnPath(providerName, execFileAsync) {
|
|
193
179
|
const executable = agentExecutableName(providerName);
|
|
@@ -202,28 +188,28 @@ async function ensureAgentExecutableOnPath(providerName, execFileAsync) {
|
|
|
202
188
|
}
|
|
203
189
|
}
|
|
204
190
|
function agentExecutableName(providerName) {
|
|
205
|
-
return
|
|
191
|
+
return agentProviderRegistration(providerName).executable;
|
|
206
192
|
}
|
|
207
193
|
function agentExecutableInstallHint(providerName) {
|
|
208
|
-
return
|
|
194
|
+
return agentProviderRegistration(providerName).installHint;
|
|
209
195
|
}
|
|
210
|
-
function
|
|
211
|
-
const registration =
|
|
196
|
+
function agentProviderRegistration(providerName) {
|
|
197
|
+
const registration = AGENT_PROVIDER_REGISTRY[providerName];
|
|
212
198
|
if (!registration) {
|
|
213
199
|
throw new Error(`Unsupported first-party agent adapter: ${providerName}`);
|
|
214
200
|
}
|
|
215
201
|
return registration;
|
|
216
202
|
}
|
|
217
|
-
async function
|
|
203
|
+
async function buildAgentExecutionPlan(provider, providerSpec, workspaceRoot, agentHome, adapterAuth) {
|
|
218
204
|
const turnTimeoutMs = provider.manifest.defaults.turn_timeout_ms ?? 3_600_000;
|
|
219
205
|
const harness = {
|
|
220
206
|
id: provider.manifest.id,
|
|
221
|
-
auth: await
|
|
207
|
+
auth: await resolveAgentAuth(provider, providerSpec, workspaceRoot, agentHome, adapterAuth),
|
|
222
208
|
...(firstNonEmpty(providerSpec.model, provider.manifest.defaults.model) ? { model: firstNonEmpty(providerSpec.model, provider.manifest.defaults.model) } : {}),
|
|
223
209
|
...(firstNonEmpty(providerSpec.effort, provider.manifest.defaults.effort) ? { effort: firstNonEmpty(providerSpec.effort, provider.manifest.defaults.effort) } : {}),
|
|
224
210
|
turn_timeout_ms: turnTimeoutMs,
|
|
225
211
|
stall_timeout_ms: Math.max(provider.manifest.defaults.stall_timeout_ms ?? 0, turnTimeoutMs),
|
|
226
|
-
config:
|
|
212
|
+
config: resolveAgentConfig(provider, defaultWorkbenchAgentConfig(provider, providerSpec.use)),
|
|
227
213
|
retry: DEFAULT_HARNESS_RETRY,
|
|
228
214
|
cancel: DEFAULT_HARNESS_CANCEL,
|
|
229
215
|
};
|
|
@@ -235,15 +221,15 @@ async function buildAgentHarnessExecutionPlan(provider, providerSpec, workspaceR
|
|
|
235
221
|
harness,
|
|
236
222
|
};
|
|
237
223
|
}
|
|
238
|
-
function
|
|
224
|
+
function defaultWorkbenchAgentConfig(provider, providerName) {
|
|
239
225
|
const fallback = (provider.manifest.defaults.config ?? {});
|
|
240
226
|
return {
|
|
241
227
|
...fallback,
|
|
242
|
-
...(
|
|
228
|
+
...(AGENT_PROVIDER_REGISTRY[providerName]?.defaultConfig ?? {}),
|
|
243
229
|
};
|
|
244
230
|
}
|
|
245
|
-
async function
|
|
246
|
-
const subject =
|
|
231
|
+
async function resolveAgentAuth(provider, providerSpec, workspaceRoot, agentHome, adapterAuth) {
|
|
232
|
+
const subject = adapterAuthProviderSubject(adapterAuth.request, providerSpec.use) ??
|
|
247
233
|
(provider.manifest.defaults.auth ?? {});
|
|
248
234
|
const parsed = provider.schemas.auth.safeParse(subject);
|
|
249
235
|
if (!parsed.success) {
|
|
@@ -253,7 +239,7 @@ async function resolveAgentHarnessAuth(provider, providerSpec, workspaceRoot, ag
|
|
|
253
239
|
void agentHome;
|
|
254
240
|
return { ...parsed.data };
|
|
255
241
|
}
|
|
256
|
-
function
|
|
242
|
+
function adapterAuthProviderSubject(auth, providerName) {
|
|
257
243
|
const record = jsonRecord(auth);
|
|
258
244
|
const self = jsonRecord(record?.self);
|
|
259
245
|
const adapters = jsonRecord(record?.adapters);
|
|
@@ -285,7 +271,7 @@ function adapterAuthHarnessSubject(auth, providerName) {
|
|
|
285
271
|
}
|
|
286
272
|
return null;
|
|
287
273
|
}
|
|
288
|
-
function
|
|
274
|
+
function resolveAgentConfig(provider, fallback) {
|
|
289
275
|
const parsed = provider.schemas.config.safeParse(fallback);
|
|
290
276
|
if (!parsed.success) {
|
|
291
277
|
throw new Error(`Agent provider "${provider.manifest.id}" config is invalid: ${formatValidationIssues(parsed.error.issues)}`);
|
package/dist/execute.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../src/execute.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAEV,IAAI,EAKL,MAAM,kCAAkC,CAAC;
|
|
1
|
+
{"version":3,"file":"execute.d.ts","sourceRoot":"","sources":["../src/execute.ts"],"names":[],"mappings":"AAIA,OAAO,KAAK,EAEV,IAAI,EAKL,MAAM,kCAAkC,CAAC;AAc1C,OAAO,KAAK,EAEV,0BAA0B,EAG3B,MAAM,iBAAiB,CAAC;AAQzB,MAAM,WAAW,4CAA4C;IAC3D,SAAS,CAAC,EAAE,MAAM,CAAC;IACnB,WAAW,CAAC,EAAE,MAAM,CAAC;IACrB,UAAU,CAAC,EAAE,MAAM,CAAC;IACpB,aAAa,CAAC,EAAE,0BAA0B,CAAC;IAC3C,eAAe,CAAC,EAAE,MAAM,CAAC;IACzB,kBAAkB,CAAC,EAAE,IAAI,CAAC;IAC1B,cAAc,CAAC,EAAE,MAAM,CAAC,MAAM,EAAE,MAAM,CAAC,CAAC;CACzC;AA4CD,wBAAsB,qCAAqC,CACzD,IAAI,GAAE,4CAAiD,GACtD,OAAO,CAAC,IAAI,CAAC,CAiEf"}
|
package/dist/execute.js
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
import { spawn } from "node:child_process";
|
|
2
2
|
import { promises as fs } from "node:fs";
|
|
3
3
|
import path from "node:path";
|
|
4
|
-
import { ensureWorkbenchAdapterOutputDir, readWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, writeWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
|
|
4
|
+
import { ensureWorkbenchAdapterOutputDir, readWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, runWorkbenchRuntimeOperationSequence, writeWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
|
|
5
5
|
import YAML from "yaml";
|
|
6
6
|
import { isWorkbenchBuiltInAdapterId, adapterCommandName, } from "./manifests.js";
|
|
7
7
|
import { importWorkbenchRuntime } from "./runtime.js";
|
|
@@ -81,7 +81,7 @@ async function executeWorkbenchEngineRequest(request) {
|
|
|
81
81
|
}
|
|
82
82
|
async function executeWorkbenchEngineResolveRequest(request) {
|
|
83
83
|
const configuredPath = workbenchEngineTasksPath(request);
|
|
84
|
-
const sourcePath = path.resolve(request.paths.
|
|
84
|
+
const sourcePath = path.resolve(request.paths.workspace, configuredPath);
|
|
85
85
|
const stat = await fs.stat(sourcePath).catch(() => null);
|
|
86
86
|
if (!stat?.isDirectory()) {
|
|
87
87
|
throw new Error(`Workbench engine tasks path is not a directory: ${sourcePath}`);
|
|
@@ -100,50 +100,42 @@ async function executeWorkbenchEngineResolveRequest(request) {
|
|
|
100
100
|
});
|
|
101
101
|
}
|
|
102
102
|
async function executeWorkbenchEngineRunRequest(request) {
|
|
103
|
-
const
|
|
104
|
-
|
|
105
|
-
|
|
106
|
-
|
|
107
|
-
|
|
108
|
-
|
|
109
|
-
|
|
103
|
+
const outcome = workbenchEngineGradingIsolation(request) === "separate"
|
|
104
|
+
? await runWorkbenchEngineSeparateGrading(request)
|
|
105
|
+
: await runWorkbenchEngineSharedGrading(request);
|
|
106
|
+
if (!outcome.result) {
|
|
107
|
+
throw new Error("Workbench engine scoring completed without an engine result.");
|
|
108
|
+
}
|
|
109
|
+
await writeSurfaceFiles(request.paths.output, outcome.files.map((file) => remapRuntimeControlTraceFile(request, file)));
|
|
110
|
+
const usage = await workbenchEngineOutcomeUsage(outcome);
|
|
111
|
+
await writeWorkbenchAdapterOperationResult(request.paths.output, {
|
|
112
|
+
protocol: "workbench.adapter-result.v1",
|
|
110
113
|
operation: "engine.run",
|
|
111
|
-
|
|
112
|
-
|
|
113
|
-
|
|
114
|
-
|
|
115
|
-
|
|
116
|
-
if (usage) {
|
|
117
|
-
await writeWorkbenchAdapterOperationResult(request.paths.output, {
|
|
118
|
-
...engineResult,
|
|
119
|
-
usage,
|
|
120
|
-
});
|
|
121
|
-
}
|
|
122
|
-
}
|
|
123
|
-
async function hideWorkbenchEnginePrivateFiles(request) {
|
|
124
|
-
if (!request.paths.enginePrivate) {
|
|
125
|
-
return [];
|
|
126
|
-
}
|
|
127
|
-
const files = await readSurfaceFilesRecursive(request.paths.enginePrivate).catch((error) => {
|
|
128
|
-
if (error.code === "ENOENT") {
|
|
129
|
-
return [];
|
|
130
|
-
}
|
|
131
|
-
throw error;
|
|
114
|
+
ok: true,
|
|
115
|
+
value: outcome.result,
|
|
116
|
+
...(usage ? { usage } : {}),
|
|
117
|
+
...(outcome.summary !== undefined ? { summary: outcome.summary } : {}),
|
|
118
|
+
...(outcome.feedback !== undefined ? { feedback: outcome.feedback } : {}),
|
|
132
119
|
});
|
|
133
|
-
await fs.rm(request.paths.enginePrivate, { recursive: true, force: true }).catch(() => undefined);
|
|
134
|
-
return files;
|
|
135
120
|
}
|
|
136
|
-
async function
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
|
|
140
|
-
|
|
141
|
-
|
|
142
|
-
|
|
143
|
-
|
|
144
|
-
|
|
145
|
-
|
|
146
|
-
|
|
121
|
+
async function workbenchEngineOutcomeUsage(outcome) {
|
|
122
|
+
const runtime = await importWorkbenchRuntime();
|
|
123
|
+
const operationUsage = outcome.usage
|
|
124
|
+
? undefined
|
|
125
|
+
: runtime.mergeUsageSummaries(outcome.operationResults.map((result) => {
|
|
126
|
+
if (result.operation === "subject.run") {
|
|
127
|
+
return runtime.assignUsageRole("runner", result.usage);
|
|
128
|
+
}
|
|
129
|
+
if (result.operation === "engine.run") {
|
|
130
|
+
return runtime.assignUsageRole("engine", result.usage);
|
|
131
|
+
}
|
|
132
|
+
return result.usage;
|
|
133
|
+
}));
|
|
134
|
+
const runtimeUsage = runtime.mergeUsageSummaries([outcome.usage, operationUsage]);
|
|
135
|
+
const resultUsage = runtimeUsage?.engine
|
|
136
|
+
? undefined
|
|
137
|
+
: runtime.assignUsageRole("engine", outcome.result?.usage);
|
|
138
|
+
return runtime.mergeUsageSummaries([runtimeUsage, resultUsage]);
|
|
147
139
|
}
|
|
148
140
|
function workbenchEngineTasksPath(request) {
|
|
149
141
|
const config = adapterCommandConfigRecord(request);
|
|
@@ -171,124 +163,145 @@ function workbenchEngineScoreInvocation(request) {
|
|
|
171
163
|
: adapterCommandName(score.use),
|
|
172
164
|
};
|
|
173
165
|
}
|
|
174
|
-
|
|
166
|
+
function workbenchEngineSubjectInvocation(request) {
|
|
175
167
|
const subject = request.context?.subject?.run;
|
|
176
|
-
if (!subject?.command) {
|
|
177
|
-
throw new Error("engine
|
|
178
|
-
}
|
|
179
|
-
|
|
180
|
-
|
|
181
|
-
|
|
182
|
-
|
|
183
|
-
with: (subject.with ?? {}),
|
|
184
|
-
...(subject.auth !== undefined ? { auth: subject.auth } : {}),
|
|
185
|
-
command: subject.command,
|
|
186
|
-
},
|
|
187
|
-
operation: "subject.run",
|
|
168
|
+
if (!subject?.use || !subject.command) {
|
|
169
|
+
throw new Error("Workbench engine requires context.subject.run.use and context.subject.run.command.");
|
|
170
|
+
}
|
|
171
|
+
return {
|
|
172
|
+
use: subject.use,
|
|
173
|
+
with: (subject.with ?? {}),
|
|
174
|
+
...(subject.auth !== undefined ? { auth: subject.auth } : {}),
|
|
188
175
|
command: subject.command,
|
|
189
|
-
|
|
190
|
-
|
|
191
|
-
|
|
176
|
+
};
|
|
177
|
+
}
|
|
178
|
+
function workbenchEngineGradingIsolation(request) {
|
|
179
|
+
const grading = jsonRecord(adapterCommandConfigRecord(request).grading);
|
|
180
|
+
const isolation = grading?.isolation;
|
|
181
|
+
if (isolation === undefined) {
|
|
182
|
+
return "shared";
|
|
183
|
+
}
|
|
184
|
+
if (isolation === "shared" || isolation === "separate") {
|
|
185
|
+
return isolation;
|
|
186
|
+
}
|
|
187
|
+
throw new Error("Workbench engine grading.isolation must be shared or separate.");
|
|
188
|
+
}
|
|
189
|
+
async function runWorkbenchEngineSharedGrading(request) {
|
|
190
|
+
const inputs = await workbenchEngineRuntimeInputs(request);
|
|
191
|
+
const subject = workbenchEngineSubjectInvocation(request);
|
|
192
|
+
const score = workbenchEngineScoreInvocation(request);
|
|
193
|
+
const result = await runWorkbenchRuntimeOperationSequence({
|
|
194
|
+
inputs,
|
|
195
|
+
prepare: true,
|
|
196
|
+
operations: [
|
|
197
|
+
{ label: "subject", operation: "subject.run", invocation: subject },
|
|
198
|
+
{ label: "score", operation: "engine.run", invocation: score },
|
|
199
|
+
],
|
|
192
200
|
});
|
|
193
|
-
|
|
194
|
-
await copySubjectOutputArtifacts(subjectOutput, request.paths.output);
|
|
201
|
+
assertRuntimeControlResultOk(result, "Workbench shared grading");
|
|
195
202
|
return result;
|
|
196
203
|
}
|
|
197
|
-
function
|
|
198
|
-
const
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
}
|
|
208
|
-
else if (engine?.total) {
|
|
209
|
-
usage.engine = engine.total;
|
|
210
|
-
}
|
|
211
|
-
if (subject?.optimizer) {
|
|
212
|
-
usage.optimizer = subject.optimizer;
|
|
213
|
-
}
|
|
214
|
-
return Object.keys(usage).length > 0 ? usage : undefined;
|
|
215
|
-
}
|
|
216
|
-
async function runNestedAdapterOperation(args) {
|
|
217
|
-
const internalRoot = path.join(args.parent.paths.output, ".workbench", "internal", args.outputName ?? "engine-slot", safeInternalPathSegment(args.parent.id));
|
|
218
|
-
const output = args.outputName ? path.join(internalRoot, "output") : args.parent.paths.output;
|
|
219
|
-
const result = args.outputName
|
|
220
|
-
? workbenchAdapterOperationResultPath(output)
|
|
221
|
-
: args.parent.paths.result;
|
|
222
|
-
const requestPath = path.join(internalRoot, args.requestName);
|
|
223
|
-
await fs.mkdir(path.dirname(requestPath), { recursive: true });
|
|
224
|
-
await fs.mkdir(output, { recursive: true });
|
|
225
|
-
const nestedPaths = {
|
|
226
|
-
...args.parent.paths,
|
|
227
|
-
output,
|
|
228
|
-
result,
|
|
229
|
-
};
|
|
230
|
-
if (args.visibility === "subject") {
|
|
231
|
-
delete nestedPaths.enginePrivate;
|
|
232
|
-
}
|
|
233
|
-
await fs.writeFile(requestPath, `${JSON.stringify({
|
|
234
|
-
...args.parent,
|
|
235
|
-
id: `${args.parent.id}:${args.invocation.use}:${args.operation}`,
|
|
236
|
-
operation: args.operation,
|
|
237
|
-
invocation: {
|
|
238
|
-
use: args.invocation.use,
|
|
239
|
-
with: args.invocation.with,
|
|
240
|
-
...(args.invocation.auth !== undefined ? { auth: args.invocation.auth } : {}),
|
|
204
|
+
async function runWorkbenchEngineSeparateGrading(request) {
|
|
205
|
+
const inputs = await workbenchEngineRuntimeInputs(request);
|
|
206
|
+
const subject = workbenchEngineSubjectInvocation(request);
|
|
207
|
+
const score = workbenchEngineScoreInvocation(request);
|
|
208
|
+
const runtime = await importWorkbenchRuntime();
|
|
209
|
+
const runner = await runWorkbenchRuntimeOperationSequence({
|
|
210
|
+
inputs: {
|
|
211
|
+
subject: inputs.subject,
|
|
212
|
+
case: inputs.case,
|
|
213
|
+
traces: inputs.traces,
|
|
241
214
|
},
|
|
242
|
-
|
|
243
|
-
|
|
244
|
-
|
|
245
|
-
|
|
246
|
-
|
|
247
|
-
await runAdapterShellCommand(args.command, args.parent.paths.cwd ?? args.parent.paths.workspace, {
|
|
248
|
-
WORKBENCH_ADAPTER_REQUEST: requestPath,
|
|
249
|
-
WORKBENCH_OUTPUT: output,
|
|
250
|
-
WORKBENCH_RESULT: result,
|
|
215
|
+
prepare: true,
|
|
216
|
+
collectWorkspace: true,
|
|
217
|
+
operations: [
|
|
218
|
+
{ label: "subject", operation: "subject.run", invocation: subject },
|
|
219
|
+
],
|
|
251
220
|
});
|
|
252
|
-
|
|
221
|
+
assertRuntimeControlResultOk(runner, "Workbench separate runner");
|
|
222
|
+
const grader = await runWorkbenchRuntimeOperationSequence({
|
|
223
|
+
inputs: {
|
|
224
|
+
subject: inputs.subject,
|
|
225
|
+
case: inputs.case,
|
|
226
|
+
enginePrivate: inputs.enginePrivate,
|
|
227
|
+
traces: inputs.traces,
|
|
228
|
+
workspace: runner.workspaceFiles ?? [],
|
|
229
|
+
output: runner.files.filter((file) => !runtime.isWorkbenchInternalOutputPath(file.path)),
|
|
230
|
+
},
|
|
231
|
+
prepare: false,
|
|
232
|
+
operations: [
|
|
233
|
+
{ label: "score", operation: "engine.run", invocation: score },
|
|
234
|
+
],
|
|
235
|
+
});
|
|
236
|
+
assertRuntimeControlResultOk(grader, "Workbench separate grader");
|
|
237
|
+
return {
|
|
238
|
+
...grader,
|
|
239
|
+
files: dedupeSurfaceFiles([...runner.files, ...grader.files]),
|
|
240
|
+
fileChanges: [...new Set([...runner.fileChanges, ...grader.fileChanges])].sort(),
|
|
241
|
+
usage: runtime.mergeUsageSummaries([runner.usage, grader.usage]),
|
|
242
|
+
operationResults: [...runner.operationResults, ...grader.operationResults],
|
|
243
|
+
};
|
|
253
244
|
}
|
|
254
|
-
async function
|
|
255
|
-
|
|
245
|
+
async function workbenchEngineRuntimeInputs(request) {
|
|
246
|
+
const [subject, caseFiles, enginePrivate, traces] = await Promise.all([
|
|
247
|
+
readOptionalSurfaceFiles(request.paths.subject),
|
|
248
|
+
readOptionalSurfaceFiles(request.paths.case),
|
|
249
|
+
readOptionalSurfaceFiles(request.paths.enginePrivate),
|
|
250
|
+
readOptionalSurfaceFiles(request.paths.traces),
|
|
251
|
+
]);
|
|
252
|
+
return {
|
|
253
|
+
subject,
|
|
254
|
+
case: caseFiles,
|
|
255
|
+
enginePrivate,
|
|
256
|
+
traces,
|
|
257
|
+
};
|
|
256
258
|
}
|
|
257
|
-
async function
|
|
258
|
-
|
|
259
|
-
|
|
260
|
-
|
|
261
|
-
|
|
262
|
-
|
|
263
|
-
|
|
264
|
-
continue;
|
|
265
|
-
}
|
|
266
|
-
const sourcePath = path.join(sourceRoot, relativePath);
|
|
267
|
-
const targetPath = path.join(targetRoot, relativePath);
|
|
268
|
-
if (entry.isDirectory()) {
|
|
269
|
-
await copyDirectoryEntries(sourceRoot, targetRoot, relativePath);
|
|
270
|
-
continue;
|
|
271
|
-
}
|
|
272
|
-
if (!entry.isFile()) {
|
|
273
|
-
continue;
|
|
259
|
+
async function readOptionalSurfaceFiles(root) {
|
|
260
|
+
if (!root) {
|
|
261
|
+
return [];
|
|
262
|
+
}
|
|
263
|
+
return await readSurfaceFilesRecursive(root).catch((error) => {
|
|
264
|
+
if (error.code === "ENOENT") {
|
|
265
|
+
return [];
|
|
274
266
|
}
|
|
275
|
-
|
|
276
|
-
|
|
267
|
+
throw error;
|
|
268
|
+
});
|
|
269
|
+
}
|
|
270
|
+
function assertRuntimeControlResultOk(result, label) {
|
|
271
|
+
if (result.ok) {
|
|
272
|
+
return;
|
|
273
|
+
}
|
|
274
|
+
throw new Error(`${label} failed${result.error ? `: ${result.error}` : "."}`);
|
|
275
|
+
}
|
|
276
|
+
function dedupeSurfaceFiles(files) {
|
|
277
|
+
const byPath = new Map();
|
|
278
|
+
for (const file of files) {
|
|
279
|
+
const normalized = normalizeRelativePath(file.path);
|
|
280
|
+
byPath.set(normalized, {
|
|
281
|
+
...file,
|
|
282
|
+
path: normalized,
|
|
283
|
+
});
|
|
277
284
|
}
|
|
285
|
+
return [...byPath.values()].sort((left, right) => left.path.localeCompare(right.path));
|
|
278
286
|
}
|
|
279
|
-
function
|
|
280
|
-
|
|
281
|
-
|
|
287
|
+
function remapRuntimeControlTraceFile(request, file) {
|
|
288
|
+
const normalized = normalizeRelativePath(file.path);
|
|
289
|
+
if (!normalized.startsWith(".workbench/traces/")) {
|
|
290
|
+
return { ...file, path: normalized };
|
|
282
291
|
}
|
|
283
|
-
const
|
|
284
|
-
const
|
|
285
|
-
|
|
286
|
-
|
|
287
|
-
|
|
288
|
-
|
|
289
|
-
|
|
292
|
+
const segments = normalized.split("/");
|
|
293
|
+
const rest = segments.length >= 6
|
|
294
|
+
? segments.slice(5)
|
|
295
|
+
: segments.length >= 3
|
|
296
|
+
? segments.slice(3)
|
|
297
|
+
: [];
|
|
298
|
+
if (rest.length === 0) {
|
|
299
|
+
return { ...file, path: normalized };
|
|
290
300
|
}
|
|
291
|
-
return
|
|
301
|
+
return {
|
|
302
|
+
...file,
|
|
303
|
+
path: `.workbench/traces/${request.jobId ?? request.id}/${rest.join("/")}`,
|
|
304
|
+
};
|
|
292
305
|
}
|
|
293
306
|
function safeInternalPathSegment(value) {
|
|
294
307
|
const safe = value.replace(/[^a-z0-9._-]+/giu, "_").replace(/^_+|_+$/gu, "");
|
|
@@ -296,7 +309,7 @@ function safeInternalPathSegment(value) {
|
|
|
296
309
|
}
|
|
297
310
|
async function executeCommandAdapterRequest(request) {
|
|
298
311
|
const command = requiredAdapterCommandString(request, "command");
|
|
299
|
-
await runAdapterShellCommand(command, request.paths.
|
|
312
|
+
await runAdapterShellCommand(command, request.paths.workspace);
|
|
300
313
|
if (request.operation === "engine.run") {
|
|
301
314
|
await requireCommandScoreResult(request);
|
|
302
315
|
return;
|
|
@@ -316,9 +329,9 @@ async function executeTestsEngineRequest(request) {
|
|
|
316
329
|
throw new Error(`Tests adapter cannot handle ${request.operation}.`);
|
|
317
330
|
}
|
|
318
331
|
const testsRoot = requiredRequestPath(request.paths.enginePrivate, "paths.enginePrivate");
|
|
319
|
-
const
|
|
320
|
-
|
|
321
|
-
await fs.mkdir(
|
|
332
|
+
const verifierRoot = testsVerifierOutputDir(request.paths.output);
|
|
333
|
+
await fs.rm(verifierRoot, { recursive: true, force: true }).catch(() => undefined);
|
|
334
|
+
await fs.mkdir(verifierRoot, { recursive: true });
|
|
322
335
|
const script = await firstExistingFile([
|
|
323
336
|
path.join(testsRoot, "test.sh"),
|
|
324
337
|
path.join(testsRoot, "run.sh"),
|
|
@@ -326,9 +339,11 @@ async function executeTestsEngineRequest(request) {
|
|
|
326
339
|
if (!script) {
|
|
327
340
|
throw new Error(`Tests engine requires ${path.join(testsRoot, "test.sh")}.`);
|
|
328
341
|
}
|
|
329
|
-
await runAdapterShellCommand(`sh ${shellQuote(script)}`, request.paths.
|
|
342
|
+
await runAdapterShellCommand(`sh ${shellQuote(script)}`, request.paths.workspace, {
|
|
343
|
+
WORKBENCH_TESTS_VERIFIER_DIR: verifierRoot,
|
|
344
|
+
});
|
|
330
345
|
const result = await readTestsResult({
|
|
331
|
-
|
|
346
|
+
verifierRoot,
|
|
332
347
|
caseId: request.context?.attempt?.caseId ?? "current",
|
|
333
348
|
});
|
|
334
349
|
await writeWorkbenchAdapterOperationResult(request.paths.output, {
|
|
@@ -371,7 +386,7 @@ async function writeOperationOkUnlessPresent(request) {
|
|
|
371
386
|
if (request.operation === "optimizer.improve") {
|
|
372
387
|
const patch = await createSubjectPatchFromWorkspace({
|
|
373
388
|
beforeRoot: requiredRequestPath(request.paths.subject, "paths.subject"),
|
|
374
|
-
afterRoot: request.paths.
|
|
389
|
+
afterRoot: request.paths.workspace,
|
|
375
390
|
edits: request.context?.optimizer?.edits ?? [],
|
|
376
391
|
});
|
|
377
392
|
await writeWorkbenchAdapterOperationResult(request.paths.output, {
|
|
@@ -452,8 +467,8 @@ async function readWorkbenchEngineCase(args) {
|
|
|
452
467
|
const publicPrefix = taskDirectoryPrefix(taskRecord.files, "files", args.id);
|
|
453
468
|
const testsPrefix = taskDirectoryPrefix(taskRecord.tests, "tests", args.id);
|
|
454
469
|
const solutionPrefix = taskDirectoryPrefix(taskRecord.solution, "solution", args.id);
|
|
455
|
-
const
|
|
456
|
-
const
|
|
470
|
+
const publicFiles = stripTaskDirectory(sourceFiles, publicPrefix);
|
|
471
|
+
const privateFiles = [
|
|
457
472
|
...stripTaskDirectory(sourceFiles, testsPrefix),
|
|
458
473
|
...stripTaskDirectory(sourceFiles, solutionPrefix),
|
|
459
474
|
].sort((left, right) => left.path.localeCompare(right.path));
|
|
@@ -473,8 +488,8 @@ async function readWorkbenchEngineCase(args) {
|
|
|
473
488
|
: {}),
|
|
474
489
|
},
|
|
475
490
|
files: {
|
|
476
|
-
|
|
477
|
-
|
|
491
|
+
public: publicFiles,
|
|
492
|
+
private: privateFiles,
|
|
478
493
|
source: sourceFiles,
|
|
479
494
|
},
|
|
480
495
|
};
|
|
@@ -543,11 +558,11 @@ async function fileExists(filePath) {
|
|
|
543
558
|
return fs.stat(filePath).then((stat) => stat.isFile(), () => false);
|
|
544
559
|
}
|
|
545
560
|
async function readTestsResult(args) {
|
|
546
|
-
const rewardJson = await readOptionalJson(path.join(args.
|
|
561
|
+
const rewardJson = await readOptionalJson(path.join(args.verifierRoot, "reward.json"));
|
|
547
562
|
if (rewardJson) {
|
|
548
563
|
return normalizeTestsResult(rewardJson, args.caseId);
|
|
549
564
|
}
|
|
550
|
-
const rewardText = await fs.readFile(path.join(args.
|
|
565
|
+
const rewardText = await fs.readFile(path.join(args.verifierRoot, "reward.txt"), "utf8").catch((error) => {
|
|
551
566
|
if (error.code === "ENOENT") {
|
|
552
567
|
return null;
|
|
553
568
|
}
|
|
@@ -560,7 +575,10 @@ async function readTestsResult(args) {
|
|
|
560
575
|
}
|
|
561
576
|
return normalizeTestsResult({ reward: score }, args.caseId);
|
|
562
577
|
}
|
|
563
|
-
throw new Error("Tests engine did not find reward.json or reward.txt under
|
|
578
|
+
throw new Error("Tests engine did not find reward.json or reward.txt under its verifier output directory.");
|
|
579
|
+
}
|
|
580
|
+
function testsVerifierOutputDir(outputRoot) {
|
|
581
|
+
return path.join(outputRoot, ".workbench", "internal", "verifier");
|
|
564
582
|
}
|
|
565
583
|
async function readOptionalJson(filePath) {
|
|
566
584
|
const source = await fs.readFile(filePath, "utf8").catch((error) => {
|
|
@@ -640,7 +658,7 @@ function workloadFromAdapterOperationRequest(request) {
|
|
|
640
658
|
};
|
|
641
659
|
}
|
|
642
660
|
function isBuiltInAgentAdapterId(value) {
|
|
643
|
-
return value === "codex" || value === "claude"
|
|
661
|
+
return value === "codex" || value === "claude";
|
|
644
662
|
}
|
|
645
663
|
function builtInAgentSpecFromRequest(request) {
|
|
646
664
|
const config = adapterCommandConfigRecord(request);
|
|
@@ -720,7 +738,7 @@ async function writeAgentSubjectOutput(request, workload, subject, options = {})
|
|
|
720
738
|
adapterAuthRequest: options.adapterAuthRequest,
|
|
721
739
|
adapterAuthEnv: options.adapterAuthEnv,
|
|
722
740
|
workspaceRoot: request.paths.workspace,
|
|
723
|
-
cwd: request.paths.
|
|
741
|
+
cwd: request.paths.workspace,
|
|
724
742
|
prompt: buildAgentSubjectPrompt(workload, subject),
|
|
725
743
|
traceRoot,
|
|
726
744
|
jobId: workload.job.id,
|
|
@@ -763,8 +781,10 @@ function buildAgentSubjectPrompt(workload, subject) {
|
|
|
763
781
|
return [
|
|
764
782
|
...(subject.instructions ? ["Instructions:", subject.instructions, ""] : []),
|
|
765
783
|
"Context:",
|
|
766
|
-
"- Subject files are mounted at /workspace/input/subject.",
|
|
767
|
-
"-
|
|
784
|
+
"- Subject source files are mounted at /workspace/input/subject.",
|
|
785
|
+
"- Follow any subject guidance, skill files, scripts, or configuration under /workspace/input/subject.",
|
|
786
|
+
"- The mutable working directory is /workspace.",
|
|
787
|
+
"- If the subject declares prepare.command, it has already run and may have copied files into /workspace.",
|
|
768
788
|
...(workload.case?.prompt ? ["Case:", workload.case.prompt, ""] : []),
|
|
769
789
|
"- Public case files are mounted at /workspace/input/case.",
|
|
770
790
|
"- Verifier tests are not present while you run.",
|
|
@@ -784,14 +804,14 @@ async function writeAgentSubjectRevisionOutput(request, workload, optimizer, opt
|
|
|
784
804
|
adapterAuthRequest: options.adapterAuthRequest,
|
|
785
805
|
adapterAuthEnv: options.adapterAuthEnv,
|
|
786
806
|
workspaceRoot: request.paths.workspace,
|
|
787
|
-
cwd: request.paths.
|
|
807
|
+
cwd: request.paths.workspace,
|
|
788
808
|
prompt: buildAgentOptimizerPrompt(workload),
|
|
789
809
|
traceRoot,
|
|
790
810
|
jobId: workload.job.id,
|
|
791
811
|
});
|
|
792
812
|
const subjectPatch = await createSubjectPatchFromWorkspace({
|
|
793
813
|
beforeRoot: requiredRequestPath(request.paths.subject, "paths.subject"),
|
|
794
|
-
afterRoot: request.paths.
|
|
814
|
+
afterRoot: request.paths.workspace,
|
|
795
815
|
edits: workload.optimizer.edits,
|
|
796
816
|
});
|
|
797
817
|
const changedSubjectPaths = subjectPatch.fileChanges.filter((filePath) => isSubjectEditPath(filePath, workload.optimizer.edits));
|
|
@@ -839,8 +859,10 @@ function buildAgentOptimizerPrompt(workload) {
|
|
|
839
859
|
workload.benchmark.description || workload.benchmark.name,
|
|
840
860
|
"",
|
|
841
861
|
"Context:",
|
|
842
|
-
"- Subject files are mounted at /workspace/input/subject.",
|
|
843
|
-
"-
|
|
862
|
+
"- Subject source files are mounted at /workspace/input/subject.",
|
|
863
|
+
"- Follow any subject guidance, skill files, scripts, or configuration under /workspace/input/subject.",
|
|
864
|
+
"- The mutable working directory is /workspace.",
|
|
865
|
+
"- If the subject declares prepare.command, it has already run and may have copied files into /workspace.",
|
|
844
866
|
"- Prior run traces are mounted at /workspace/input/traces.",
|
|
845
867
|
"- Use /workspace/input/traces as the source of truth for what happened in prior attempts.",
|
|
846
868
|
"- Do not mutate /workspace/input.",
|
|
@@ -849,7 +871,7 @@ function buildAgentOptimizerPrompt(workload) {
|
|
|
849
871
|
workload.optimizer.edits.map((entry) => `- ${entry}`).join("\n"),
|
|
850
872
|
"",
|
|
851
873
|
"Output:",
|
|
852
|
-
"-
|
|
874
|
+
"- Create or mutate editable subject files directly in the current working directory.",
|
|
853
875
|
"- Include at least one changed subject file covered by the optimizer edits list.",
|
|
854
876
|
].join("\n");
|
|
855
877
|
}
|
|
@@ -873,6 +895,14 @@ async function writeRubricJudgeResult(request, workload, engine, options = {}) {
|
|
|
873
895
|
engine,
|
|
874
896
|
criterionRuns,
|
|
875
897
|
});
|
|
898
|
+
await writeRubricEvidenceFiles({
|
|
899
|
+
request,
|
|
900
|
+
workload,
|
|
901
|
+
engine,
|
|
902
|
+
result,
|
|
903
|
+
criterionRuns,
|
|
904
|
+
usage,
|
|
905
|
+
});
|
|
876
906
|
await writeWorkbenchAdapterOperationResult(request.paths.output, {
|
|
877
907
|
protocol: "workbench.adapter-result.v1",
|
|
878
908
|
operation: "engine.run",
|
|
@@ -886,7 +916,7 @@ async function writeRubricJudgeResult(request, workload, engine, options = {}) {
|
|
|
886
916
|
aggregation: "weighted_mean",
|
|
887
917
|
criteria: criterionRuns.map((run) => ({
|
|
888
918
|
id: run.result.criterion_id,
|
|
889
|
-
|
|
919
|
+
traceFiles: run.traceFiles.map((file) => file.path),
|
|
890
920
|
metadata: run.metadata,
|
|
891
921
|
...(run.repair ? { repair: run.repair } : {}),
|
|
892
922
|
})),
|
|
@@ -894,8 +924,76 @@ async function writeRubricJudgeResult(request, workload, engine, options = {}) {
|
|
|
894
924
|
...(usage ? { usage } : {}),
|
|
895
925
|
});
|
|
896
926
|
}
|
|
927
|
+
async function writeRubricEvidenceFiles(args) {
|
|
928
|
+
const root = `.workbench/traces/${args.workload.job.id}/engine/rubric`;
|
|
929
|
+
const scorecard = {
|
|
930
|
+
schema: "workbench.engine.rubric.evidence.v1",
|
|
931
|
+
safeForOptimizer: true,
|
|
932
|
+
jobId: args.workload.job.id,
|
|
933
|
+
subjectId: args.workload.subjectId,
|
|
934
|
+
attemptIndex: args.workload.attemptIndex,
|
|
935
|
+
sampleIndex: args.workload.sampleIndex,
|
|
936
|
+
caseId: args.workload.caseId,
|
|
937
|
+
judge: args.engine.judge.use,
|
|
938
|
+
parallelism: args.engine.parallelism,
|
|
939
|
+
aggregation: "weighted_mean",
|
|
940
|
+
score: args.result.score,
|
|
941
|
+
metrics: args.result.metrics ?? {},
|
|
942
|
+
summary: args.result.summary ?? null,
|
|
943
|
+
criteria: args.criterionRuns.map((run) => ({
|
|
944
|
+
id: run.result.criterion_id,
|
|
945
|
+
label: run.result.label,
|
|
946
|
+
score: run.result.score,
|
|
947
|
+
pass: run.result.pass,
|
|
948
|
+
rationale: run.result.rationale ?? null,
|
|
949
|
+
errors: run.result.errors ?? [],
|
|
950
|
+
summary: run.summary ?? null,
|
|
951
|
+
metadata: safeRubricEvidenceMetadata(run.metadata),
|
|
952
|
+
repair: run.repair ?? null,
|
|
953
|
+
})),
|
|
954
|
+
...(args.usage ? { usage: args.usage } : {}),
|
|
955
|
+
};
|
|
956
|
+
await writeSurfaceFiles(args.request.paths.output, [
|
|
957
|
+
jsonSurfaceFile(`${root}/scorecard.json`, scorecard),
|
|
958
|
+
...args.criterionRuns.map((run) => jsonSurfaceFile(`${root}/criteria/${safeInternalPathSegment(run.result.criterion_id)}/result.json`, {
|
|
959
|
+
schema: "workbench.engine.rubric.criterion-evidence.v1",
|
|
960
|
+
safeForOptimizer: true,
|
|
961
|
+
criterion: args.engine.criteria.find((criterion) => criterion.id === run.result.criterion_id) ?? {
|
|
962
|
+
id: run.result.criterion_id,
|
|
963
|
+
},
|
|
964
|
+
result: run.result,
|
|
965
|
+
summary: run.summary ?? null,
|
|
966
|
+
metadata: safeRubricEvidenceMetadata(run.metadata),
|
|
967
|
+
repair: run.repair ?? null,
|
|
968
|
+
})),
|
|
969
|
+
...args.criterionRuns.flatMap((run) => run.traceFiles),
|
|
970
|
+
]);
|
|
971
|
+
}
|
|
972
|
+
function safeRubricEvidenceMetadata(metadata) {
|
|
973
|
+
const record = metadata && typeof metadata === "object" && !Array.isArray(metadata)
|
|
974
|
+
? metadata
|
|
975
|
+
: {};
|
|
976
|
+
const safe = {};
|
|
977
|
+
for (const key of ["providerId", "sessionId", "eventCount", "model"]) {
|
|
978
|
+
const value = record[key];
|
|
979
|
+
if (typeof value === "string" || typeof value === "number" || typeof value === "boolean" || value === null) {
|
|
980
|
+
safe[key] = value;
|
|
981
|
+
}
|
|
982
|
+
}
|
|
983
|
+
return Object.keys(safe).length > 0 ? safe : null;
|
|
984
|
+
}
|
|
985
|
+
function jsonSurfaceFile(pathname, value) {
|
|
986
|
+
return {
|
|
987
|
+
path: pathname,
|
|
988
|
+
kind: "text",
|
|
989
|
+
encoding: "utf8",
|
|
990
|
+
executable: false,
|
|
991
|
+
content: `${JSON.stringify(value, null, 2)}\n`,
|
|
992
|
+
};
|
|
993
|
+
}
|
|
897
994
|
async function runRubricCriterionJudge(args) {
|
|
898
995
|
const traceRoot = path.join(args.request.paths.output, ".workbench", "internal", "rubric", safeInternalPathSegment(args.criterion.id));
|
|
996
|
+
const tracePath = rubricCriterionTracePath(args.workload.job.id, args.criterion.id, "judge");
|
|
899
997
|
const agentResult = await executeBuiltInAgentTurn(args.agentExecutor, {
|
|
900
998
|
role: "engine",
|
|
901
999
|
provider: args.engine.judge,
|
|
@@ -903,9 +1001,10 @@ async function runRubricCriterionJudge(args) {
|
|
|
903
1001
|
adapterAuthRequest: args.adapterAuthRequest,
|
|
904
1002
|
adapterAuthEnv: args.adapterAuthEnv,
|
|
905
1003
|
workspaceRoot: args.request.paths.workspace,
|
|
906
|
-
cwd: args.request.paths.
|
|
1004
|
+
cwd: args.request.paths.workspace,
|
|
907
1005
|
prompt: buildRubricCriterionJudgePrompt(args.workload, args.engine, args.criterion),
|
|
908
1006
|
traceRoot: path.join(traceRoot, "judge"),
|
|
1007
|
+
tracePath,
|
|
909
1008
|
jobId: args.workload.job.id,
|
|
910
1009
|
});
|
|
911
1010
|
let usage = args.runtime.assignUsageRole("engine", agentResult.usage);
|
|
@@ -913,12 +1012,13 @@ async function runRubricCriterionJudge(args) {
|
|
|
913
1012
|
return {
|
|
914
1013
|
...normalizeRubricCriterionJudgeResult(agentResult.output, args.criterion),
|
|
915
1014
|
metadata: agentResult.metadata,
|
|
916
|
-
|
|
1015
|
+
traceFiles: publicRubricAgentTraceFiles(agentResult.traceFiles),
|
|
917
1016
|
...(usage ? { usage } : {}),
|
|
918
1017
|
};
|
|
919
1018
|
}
|
|
920
1019
|
catch (error) {
|
|
921
1020
|
const repairError = error instanceof Error ? error.message : String(error);
|
|
1021
|
+
const repairTracePath = rubricCriterionTracePath(args.workload.job.id, args.criterion.id, "repair");
|
|
922
1022
|
const repairResult = await executeBuiltInAgentTurn(args.agentExecutor, {
|
|
923
1023
|
role: "engine",
|
|
924
1024
|
provider: args.engine.judge,
|
|
@@ -926,13 +1026,14 @@ async function runRubricCriterionJudge(args) {
|
|
|
926
1026
|
adapterAuthRequest: args.adapterAuthRequest,
|
|
927
1027
|
adapterAuthEnv: args.adapterAuthEnv,
|
|
928
1028
|
workspaceRoot: args.request.paths.workspace,
|
|
929
|
-
cwd: args.request.paths.
|
|
1029
|
+
cwd: args.request.paths.workspace,
|
|
930
1030
|
prompt: buildRubricCriterionRepairPrompt({
|
|
931
1031
|
output: agentResult.output,
|
|
932
1032
|
error: repairError,
|
|
933
1033
|
criterion: args.criterion,
|
|
934
1034
|
}),
|
|
935
1035
|
traceRoot: path.join(traceRoot, "repair"),
|
|
1036
|
+
tracePath: repairTracePath,
|
|
936
1037
|
jobId: args.workload.job.id,
|
|
937
1038
|
});
|
|
938
1039
|
usage = args.runtime.mergeUsageSummaries([
|
|
@@ -949,7 +1050,10 @@ async function runRubricCriterionJudge(args) {
|
|
|
949
1050
|
originalMetadata: agentResult.metadata,
|
|
950
1051
|
},
|
|
951
1052
|
},
|
|
952
|
-
|
|
1053
|
+
traceFiles: publicRubricAgentTraceFiles([
|
|
1054
|
+
...agentResult.traceFiles,
|
|
1055
|
+
...repairResult.traceFiles,
|
|
1056
|
+
]),
|
|
953
1057
|
repair: {
|
|
954
1058
|
attempted: true,
|
|
955
1059
|
originalError: repairError,
|
|
@@ -958,6 +1062,14 @@ async function runRubricCriterionJudge(args) {
|
|
|
958
1062
|
};
|
|
959
1063
|
}
|
|
960
1064
|
}
|
|
1065
|
+
function publicRubricAgentTraceFiles(files) {
|
|
1066
|
+
return files
|
|
1067
|
+
.filter((file) => file.encoding === "utf8" && file.path.endsWith("/trace.json"))
|
|
1068
|
+
.map((file) => ({ ...file }));
|
|
1069
|
+
}
|
|
1070
|
+
function rubricCriterionTracePath(jobId, criterionId, turn) {
|
|
1071
|
+
return `.workbench/traces/${jobId}/engine/rubric/criteria/${safeInternalPathSegment(criterionId)}/${turn}`;
|
|
1072
|
+
}
|
|
961
1073
|
function buildRubricCriterionJudgePrompt(workload, engine, criterion) {
|
|
962
1074
|
requireWorkloadTask(workload, "Rubric judge");
|
|
963
1075
|
return [
|
|
@@ -981,7 +1093,7 @@ function buildRubricCriterionJudgePrompt(workload, engine, criterion) {
|
|
|
981
1093
|
score: 0.0,
|
|
982
1094
|
pass: false,
|
|
983
1095
|
rationale: "why this criterion received this score",
|
|
984
|
-
summary: "short
|
|
1096
|
+
summary: "short scoring summary",
|
|
985
1097
|
feedback: {},
|
|
986
1098
|
}, null, 2),
|
|
987
1099
|
`The only allowed criterion_id is ${criterion.id}.`,
|
|
@@ -1009,7 +1121,7 @@ function buildRubricCriterionRepairPrompt(input) {
|
|
|
1009
1121
|
score: 0.0,
|
|
1010
1122
|
pass: false,
|
|
1011
1123
|
rationale: "why this criterion received this score",
|
|
1012
|
-
summary: "short
|
|
1124
|
+
summary: "short scoring summary",
|
|
1013
1125
|
feedback: {},
|
|
1014
1126
|
}, null, 2),
|
|
1015
1127
|
"",
|
|
@@ -1026,9 +1138,6 @@ function rubricJudgeResultFromCriteria(args) {
|
|
|
1026
1138
|
throw new Error("Rubric criterion scores must aggregate to a score in the 0..1 range.");
|
|
1027
1139
|
}
|
|
1028
1140
|
const metrics = { score };
|
|
1029
|
-
for (const criterion of criteria) {
|
|
1030
|
-
metrics[`criterion__${criterion.criterion_id}`] = criterion.score;
|
|
1031
|
-
}
|
|
1032
1141
|
const caseResult = rubricJudgeCaseResult({
|
|
1033
1142
|
workload: args.workload,
|
|
1034
1143
|
score,
|
|
@@ -1196,8 +1305,6 @@ function isRuntimeWorkspacePath(filePath) {
|
|
|
1196
1305
|
normalized.startsWith("input/") ||
|
|
1197
1306
|
normalized === "output" ||
|
|
1198
1307
|
normalized.startsWith("output/") ||
|
|
1199
|
-
normalized === "logs" ||
|
|
1200
|
-
normalized.startsWith("logs/") ||
|
|
1201
1308
|
normalized === "private" ||
|
|
1202
1309
|
normalized.startsWith("private/");
|
|
1203
1310
|
}
|
|
@@ -0,0 +1,5 @@
|
|
|
1
|
+
import { sortLocalTraceRefs, type AgentReadableTraceDigest, type LocalTraceAdapter, type LocalTraceRef } from "@workbench-ai/agent-driver";
|
|
2
|
+
export declare function builtinLocalTraceAdapters(): LocalTraceAdapter[];
|
|
3
|
+
export declare function builtinLocalTraceAdapter(id: string): LocalTraceAdapter | null;
|
|
4
|
+
export { sortLocalTraceRefs, type AgentReadableTraceDigest, type LocalTraceAdapter, type LocalTraceRef, };
|
|
5
|
+
//# sourceMappingURL=local-traces.d.ts.map
|
|
@@ -0,0 +1 @@
|
|
|
1
|
+
{"version":3,"file":"local-traces.d.ts","sourceRoot":"","sources":["../src/local-traces.ts"],"names":[],"mappings":"AAAA,OAAO,EACL,kBAAkB,EAClB,KAAK,wBAAwB,EAC7B,KAAK,iBAAiB,EACtB,KAAK,aAAa,EACnB,MAAM,4BAA4B,CAAC;AASpC,wBAAgB,yBAAyB,IAAI,iBAAiB,EAAE,CAE/D;AAED,wBAAgB,wBAAwB,CAAC,EAAE,EAAE,MAAM,GAAG,iBAAiB,GAAG,IAAI,CAE7E;AAED,OAAO,EACL,kBAAkB,EAClB,KAAK,wBAAwB,EAC7B,KAAK,iBAAiB,EACtB,KAAK,aAAa,GACnB,CAAC"}
|
|
@@ -0,0 +1,14 @@
|
|
|
1
|
+
import { sortLocalTraceRefs, } from "@workbench-ai/agent-driver";
|
|
2
|
+
import { claudeLocalTraceAdapter } from "@workbench-ai/agent-driver-anthropic-claude-code";
|
|
3
|
+
import { codexLocalTraceAdapter } from "@workbench-ai/agent-driver-openai-codex";
|
|
4
|
+
const BUILT_IN_LOCAL_TRACE_ADAPTERS = [
|
|
5
|
+
codexLocalTraceAdapter,
|
|
6
|
+
claudeLocalTraceAdapter,
|
|
7
|
+
];
|
|
8
|
+
export function builtinLocalTraceAdapters() {
|
|
9
|
+
return [...BUILT_IN_LOCAL_TRACE_ADAPTERS];
|
|
10
|
+
}
|
|
11
|
+
export function builtinLocalTraceAdapter(id) {
|
|
12
|
+
return BUILT_IN_LOCAL_TRACE_ADAPTERS.find((adapter) => adapter.id === id) ?? null;
|
|
13
|
+
}
|
|
14
|
+
export { sortLocalTraceRefs, };
|
package/dist/manifests.d.ts
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
1
|
import type { WorkbenchAdapterManifest } from "@workbench-ai/workbench-protocol";
|
|
2
|
-
export type WorkbenchPublicBuiltInAdapterId = "workbench" | "codex" | "claude" | "
|
|
2
|
+
export type WorkbenchPublicBuiltInAdapterId = "workbench" | "codex" | "claude" | "command";
|
|
3
3
|
export type WorkbenchEngineHelperAdapterId = "rubric" | "tests";
|
|
4
4
|
export type WorkbenchBuiltInAdapterId = WorkbenchPublicBuiltInAdapterId | WorkbenchEngineHelperAdapterId;
|
|
5
5
|
export declare function builtinWorkbenchAdapterManifest(id: string): WorkbenchAdapterManifest | null;
|
package/dist/manifests.d.ts.map
CHANGED
|
@@ -1 +1 @@
|
|
|
1
|
-
{"version":3,"file":"manifests.d.ts","sourceRoot":"","sources":["../src/manifests.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,wBAAwB,EACzB,MAAM,kCAAkC,CAAC;AAW1C,MAAM,MAAM,+BAA+B,GACvC,WAAW,GACX,OAAO,GACP,QAAQ,GACR,
|
|
1
|
+
{"version":3,"file":"manifests.d.ts","sourceRoot":"","sources":["../src/manifests.ts"],"names":[],"mappings":"AAAA,OAAO,KAAK,EACV,wBAAwB,EACzB,MAAM,kCAAkC,CAAC;AAW1C,MAAM,MAAM,+BAA+B,GACvC,WAAW,GACX,OAAO,GACP,QAAQ,GACR,SAAS,CAAC;AAEd,MAAM,MAAM,8BAA8B,GACtC,QAAQ,GACR,OAAO,CAAC;AAEZ,MAAM,MAAM,yBAAyB,GACjC,+BAA+B,GAC/B,8BAA8B,CAAC;AA+EnC,wBAAgB,+BAA+B,CAAC,EAAE,EAAE,MAAM,GAAG,wBAAwB,GAAG,IAAI,CAI3F;AAED,wBAAgB,gCAAgC,IAAI,wBAAwB,EAAE,CAI7E;AAED,wBAAgB,2BAA2B,CAAC,EAAE,EAAE,MAAM,GAAG,EAAE,IAAI,yBAAyB,CAEvF;AAED,wBAAgB,kBAAkB,CAAC,SAAS,EAAE,MAAM,GAAG,MAAM,CAE5D"}
|
package/dist/manifests.js
CHANGED
|
@@ -3,7 +3,7 @@ const BUILT_IN_ADAPTER_MANIFESTS = Object.fromEntries(Object.entries({
|
|
|
3
3
|
workbench: defineAdapter({
|
|
4
4
|
id: "workbench",
|
|
5
5
|
engineResolve: defineEngineResolver(),
|
|
6
|
-
engineRun: defineEngineRunner(),
|
|
6
|
+
engineRun: defineEngineRunner({ executor: "host" }),
|
|
7
7
|
slots: {
|
|
8
8
|
score: adapterSlot("/score", "engine.run"),
|
|
9
9
|
},
|
|
@@ -55,14 +55,6 @@ const BUILT_IN_ADAPTER_MANIFESTS = Object.fromEntries(Object.entries({
|
|
|
55
55
|
},
|
|
56
56
|
},
|
|
57
57
|
}),
|
|
58
|
-
pi: defineAdapter({
|
|
59
|
-
id: "pi",
|
|
60
|
-
subject: defineSubject(),
|
|
61
|
-
improve: defineOptimizer(),
|
|
62
|
-
setup: [
|
|
63
|
-
"npm install --global @mariozechner/pi-coding-agent@0.70.2",
|
|
64
|
-
],
|
|
65
|
-
}),
|
|
66
58
|
command: defineAdapter({
|
|
67
59
|
id: "command",
|
|
68
60
|
subject: defineSubject(),
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@workbench-ai/workbench-built-in-adapters",
|
|
3
|
-
"version": "0.0.
|
|
3
|
+
"version": "0.0.47",
|
|
4
4
|
"type": "module",
|
|
5
5
|
"repository": {
|
|
6
6
|
"type": "git",
|
|
@@ -13,6 +13,10 @@
|
|
|
13
13
|
".": {
|
|
14
14
|
"types": "./dist/index.d.ts",
|
|
15
15
|
"default": "./dist/index.js"
|
|
16
|
+
},
|
|
17
|
+
"./local-traces": {
|
|
18
|
+
"types": "./dist/local-traces.d.ts",
|
|
19
|
+
"default": "./dist/local-traces.js"
|
|
16
20
|
}
|
|
17
21
|
},
|
|
18
22
|
"bin": {
|
|
@@ -21,8 +25,7 @@
|
|
|
21
25
|
"workbench-adapter-tests": "dist/bin/tests.js",
|
|
22
26
|
"workbench-adapter-rubric": "dist/bin/rubric.js",
|
|
23
27
|
"workbench-adapter-codex": "dist/bin/codex.js",
|
|
24
|
-
"workbench-adapter-claude": "dist/bin/claude.js"
|
|
25
|
-
"workbench-adapter-pi": "dist/bin/pi.js"
|
|
28
|
+
"workbench-adapter-claude": "dist/bin/claude.js"
|
|
26
29
|
},
|
|
27
30
|
"files": [
|
|
28
31
|
"dist"
|
|
@@ -30,12 +33,11 @@
|
|
|
30
33
|
"dependencies": {
|
|
31
34
|
"yaml": "^2.8.2",
|
|
32
35
|
"@workbench-ai/agent-driver-anthropic-claude-code": "0.0.44",
|
|
33
|
-
"@workbench-ai/
|
|
36
|
+
"@workbench-ai/workbench-contract": "0.0.47",
|
|
34
37
|
"@workbench-ai/agent-driver-openai-codex": "0.0.44",
|
|
35
|
-
"@workbench-ai/workbench-
|
|
38
|
+
"@workbench-ai/workbench-core": "0.0.47",
|
|
36
39
|
"@workbench-ai/agent-driver": "0.0.44",
|
|
37
|
-
"@workbench-ai/workbench-protocol": "0.0.
|
|
38
|
-
"@workbench-ai/workbench-core": "0.0.46"
|
|
40
|
+
"@workbench-ai/workbench-protocol": "0.0.47"
|
|
39
41
|
},
|
|
40
42
|
"devDependencies": {
|
|
41
43
|
"@types/node": "^24.3.1",
|
package/dist/bin/pi.d.ts
DELETED
package/dist/bin/pi.d.ts.map
DELETED
|
@@ -1 +0,0 @@
|
|
|
1
|
-
{"version":3,"file":"pi.d.ts","sourceRoot":"","sources":["../../src/bin/pi.ts"],"names":[],"mappings":""}
|
package/dist/bin/pi.js
DELETED