@workbench-ai/workbench-built-in-adapters 0.0.46

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,1336 @@
1
+ import { spawn } from "node:child_process";
2
+ import { promises as fs } from "node:fs";
3
+ import path from "node:path";
4
+ import { ensureWorkbenchAdapterOutputDir, readWorkbenchAdapterOperationResult, readWorkbenchAdapterOperationRequest, writeWorkbenchAdapterOperationResult, workbenchAdapterOperationResultPath, } from "@workbench-ai/workbench-protocol";
5
+ import YAML from "yaml";
6
+ import { isWorkbenchBuiltInAdapterId, adapterCommandName, } from "./manifests.js";
7
+ import { importWorkbenchRuntime } from "./runtime.js";
8
+ const TASK_CONTROL_FILE = "task.yaml";
9
+ const DEFAULT_RUBRIC_PARALLELISM = 4;
10
/**
 * Entry point for a built-in Workbench adapter command.
 *
 * Reads the adapter operation request, verifies that it targets the expected
 * built-in adapter, makes sure the output directory exists, and dispatches to
 * the handler for that adapter.
 *
 * @param {object} [args]
 * @param {string} [args.requestPath] Passed to `readWorkbenchAdapterOperationRequest`.
 * @param {string} [args.adapterId] Expected adapter id; defaults to `request.invocation.use`.
 * @param {string} [args.outputRoot] When set and different, replaces `request.paths.output`.
 * @param {*} [args.agentExecutor] Forwarded to the rubric and agent handlers.
 * @param {*} [args.adapterAuthRoot] Forwarded to the rubric and agent handlers.
 * @param {*} [args.adapterAuthRequest] Forwarded to the handlers; defaults to `request.auth`.
 * @param {*} [args.adapterAuthEnv] Forwarded to the rubric and agent handlers.
 * @returns {Promise<void>}
 * @throws {Error} When the adapter id does not match the request, is not a
 *   built-in adapter, cannot handle the requested operation, or has no handler.
 */
export async function executeWorkbenchBuiltInAdapterCommand(args = {}) {
  const request = await readWorkbenchAdapterOperationRequest(args.requestPath);
  const adapterId = args.adapterId ?? request.invocation.use;
  if (adapterId !== request.invocation.use) {
    throw new Error(`Adapter command ${adapterId} cannot execute request for ${request.invocation.use}.`);
  }
  if (!isWorkbenchBuiltInAdapterId(adapterId)) {
    throw new Error(`Unsupported built-in Workbench adapter: ${adapterId}.`);
  }
  if (args.outputRoot && args.outputRoot !== request.paths.output) {
    request.paths.output = args.outputRoot;
  }
  await ensureWorkbenchAdapterOutputDir(request);

  // Options shared by every handler that drives an agent turn.
  const agentOptions = {
    agentExecutor: args.agentExecutor,
    adapterAuthRoot: args.adapterAuthRoot,
    adapterAuthRequest: args.adapterAuthRequest ?? request.auth,
    adapterAuthEnv: args.adapterAuthEnv,
  };

  if (adapterId === "workbench") {
    await executeWorkbenchEngineRequest(request);
    return;
  }
  if (adapterId === "command") {
    await executeCommandAdapterRequest(request);
    return;
  }
  if (adapterId === "tests") {
    await executeTestsEngineRequest(request);
    return;
  }
  if (adapterId === "rubric") {
    if (request.operation !== "engine.run") {
      throw new Error(`Rubric adapter cannot handle ${request.operation}.`);
    }
    await writeRubricJudgeResult(
      request,
      workloadFromAdapterOperationRequest(request),
      builtInRubricSpecFromRequest(request),
      agentOptions,
    );
    return;
  }
  if (isBuiltInAgentAdapterId(adapterId)) {
    const workload = workloadFromAdapterOperationRequest(request);
    const agent = builtInAgentSpecFromRequest(request);
    if (request.operation === "optimizer.improve") {
      await writeAgentSubjectRevisionOutput(request, workload, agent, agentOptions);
      return;
    }
    if (request.operation === "subject.run") {
      await writeAgentSubjectOutput(request, workload, agent, agentOptions);
      return;
    }
    throw new Error(`Agent adapter ${adapterId} cannot handle ${request.operation}.`);
  }

  // A built-in id that none of the branches above recognise used to fall
  // through and resolve without writing any result; fail loudly instead.
  throw new Error(`Built-in Workbench adapter ${adapterId} has no command handler.`);
}
71
/**
 * Routes a "workbench" engine request to the handler for its operation.
 *
 * @param {object} request Adapter operation request; only `operation` is read here.
 * @returns {Promise<void>}
 * @throws {Error} When the operation is neither "engine.resolve" nor "engine.run".
 */
async function executeWorkbenchEngineRequest(request) {
  switch (request.operation) {
    case "engine.resolve":
      await executeWorkbenchEngineResolveRequest(request);
      return;
    case "engine.run":
      await executeWorkbenchEngineRunRequest(request);
      return;
    default:
      throw new Error(`Workbench engine adapter cannot handle ${request.operation}.`);
  }
}
82
/**
 * Handles "engine.resolve": locates the configured tasks directory, reads the
 * engine cases from it and writes them as the operation result.
 *
 * @param {object} request Adapter operation request.
 * @returns {Promise<void>}
 * @throws {Error} When the resolved tasks path is not a directory.
 */
async function executeWorkbenchEngineResolveRequest(request) {
  const configuredPath = workbenchEngineTasksPath(request);
  const baseDir = request.paths.cwd ?? request.paths.workspace;
  const tasksRoot = path.resolve(baseDir, configuredPath);

  // Any stat failure is treated the same as "not a directory".
  let info = null;
  try {
    info = await fs.stat(tasksRoot);
  } catch {
    info = null;
  }
  if (!info?.isDirectory()) {
    throw new Error(`Workbench engine tasks path is not a directory: ${tasksRoot}`);
  }

  const cases = await readEngineCasesFromWorkbenchTaskRoot(tasksRoot);
  const resolveResult = {
    protocol: "workbench.adapter-result.v1",
    operation: "engine.resolve",
    ok: true,
    value: { cases },
    summary: `Resolved Workbench engine cases from ${configuredPath}.`,
    feedback: {
      engineResolve: "workbench",
      path: configuredPath,
    },
  };
  await writeWorkbenchAdapterOperationResult(request.paths.output, resolveResult);
}
102
/**
 * Handles "engine.run": runs the subject with the engine-private files
 * removed, restores them, runs the nested scoring adapter, and finally folds
 * the subject's usage into the engine result when there is any usage to record.
 *
 * The statement order matters: private files are taken away before the
 * subject runs and are written back before the scorer runs.
 *
 * @param {object} request Adapter operation request.
 * @returns {Promise<void>}
 */
async function executeWorkbenchEngineRunRequest(request) {
  const privateSnapshot = await hideWorkbenchEnginePrivateFiles(request);
  const subjectResult = await runSubjectFromWorkbenchEngine(request);
  await stageWorkbenchEngineScoringInputs(request, privateSnapshot);

  const scoreInvocation = workbenchEngineScoreInvocation(request);
  await runNestedAdapterOperation({
    parent: request,
    invocation: scoreInvocation,
    operation: "engine.run",
    command: scoreInvocation.command,
    requestName: "score-request.json",
  });

  const engineResult = await readWorkbenchAdapterOperationResult(request.paths.output, "engine.run");
  const combinedUsage = mergeNestedEngineUsage(subjectResult.usage, engineResult.usage);
  if (!combinedUsage) {
    return;
  }
  // Rewrite the scorer's result with the merged usage attached.
  await writeWorkbenchAdapterOperationResult(request.paths.output, {
    ...engineResult,
    usage: combinedUsage,
  });
}
123
/**
 * Snapshots the files under `request.paths.enginePrivate` and then removes
 * that directory.
 *
 * @param {object} request Adapter operation request.
 * @returns {Promise<Array<object>>} The files that were read, or an empty
 *   array when no private path is configured or the directory does not exist.
 * @throws Any read error other than ENOENT.
 */
async function hideWorkbenchEnginePrivateFiles(request) {
  const privateRoot = request.paths.enginePrivate;
  if (!privateRoot) {
    return [];
  }

  let snapshot;
  try {
    snapshot = await readSurfaceFilesRecursive(privateRoot);
  } catch (error) {
    if (error.code !== "ENOENT") {
      throw error;
    }
    snapshot = [];
  }

  // Removal is best-effort: a failure here is intentionally ignored.
  try {
    await fs.rm(privateRoot, { recursive: true, force: true });
  } catch {
    // ignored
  }
  return snapshot;
}
136
/**
 * Prepares the directories the scorer reads and writes: recreates the
 * engine-private directory from the given snapshot and resets the
 * `verifier` folder under the logs directory.
 *
 * @param {object} request Adapter operation request.
 * @param {Array<object>} enginePrivateFiles Files to write into the private directory.
 * @returns {Promise<void>}
 */
async function stageWorkbenchEngineScoringInputs(request, enginePrivateFiles) {
  // Removes a directory (ignoring removal errors) and creates it empty again.
  const resetDirectory = async (directory) => {
    await fs.rm(directory, { recursive: true, force: true }).catch(() => undefined);
    await fs.mkdir(directory, { recursive: true });
  };

  const privateRoot = request.paths.enginePrivate;
  if (privateRoot) {
    await resetDirectory(privateRoot);
    await writeSurfaceFiles(privateRoot, enginePrivateFiles);
  }

  const logsRoot = request.paths.logs;
  if (logsRoot) {
    await resetDirectory(path.join(logsRoot, "verifier"));
  }
}
148
/**
 * Returns the configured tasks path of a workbench engine invocation.
 *
 * @param {object} request Adapter operation request.
 * @returns {string} `tasks.path` from the invocation config, or "tasks" when
 *   no `tasks` entry is configured.
 * @throws {Error} When `tasks` is configured but has no non-blank string `path`.
 */
function workbenchEngineTasksPath(request) {
  const tasks = adapterCommandConfigRecord(request).tasks;
  if (tasks === undefined) {
    return "tasks";
  }
  const configuredPath = jsonRecord(tasks).path;
  const hasUsablePath = typeof configuredPath === "string" && configuredPath.trim().length > 0;
  if (!hasUsablePath) {
    throw new Error("Workbench engine tasks must be an object with path.");
  }
  return configuredPath;
}
160
/**
 * Builds the nested invocation used to score a workbench engine run from the
 * `score` entry of the invocation config.
 *
 * @param {object} request Adapter operation request.
 * @returns {{use: string, with: object, auth?: *, command: string}} The
 *   scorer invocation; `command` falls back to `adapterCommandName(score.use)`.
 * @throws {Error} When `score.use` is missing or not a non-empty string.
 */
function workbenchEngineScoreInvocation(request) {
  const score = jsonRecord(adapterCommandConfigRecord(request).score);
  if (!score || typeof score.use !== "string" || score.use.length === 0) {
    throw new Error("Workbench engine requires invocation.with.score.use.");
  }

  const invocation = {
    use: score.use,
    with: score.with ?? {},
  };
  if (score.auth !== undefined) {
    invocation.auth = score.auth;
  }
  const hasExplicitCommand = typeof score.command === "string" && score.command.length > 0;
  invocation.command = hasExplicitCommand ? score.command : adapterCommandName(score.use);
  return invocation;
}
174
/**
 * Runs the subject described by `request.context.subject.run` as a nested
 * "subject.run" operation, copies its output artifacts into the parent
 * output directory and returns the subject's operation result.
 *
 * @param {object} request Parent "engine.run" request.
 * @returns {Promise<object>} The nested "subject.run" result.
 * @throws {Error} When the request carries no subject command.
 */
async function runSubjectFromWorkbenchEngine(request) {
  const subjectRun = request.context?.subject?.run;
  if (!subjectRun?.command) {
    throw new Error("engine.run request context.subject.run.command is required to invoke the subject.");
  }

  const invocation = {
    use: subjectRun.use,
    with: subjectRun.with ?? {},
  };
  if (subjectRun.auth !== undefined) {
    invocation.auth = subjectRun.auth;
  }
  invocation.command = subjectRun.command;

  const subjectOutput = await runNestedAdapterOperation({
    parent: request,
    invocation,
    operation: "subject.run",
    command: subjectRun.command,
    requestName: "subject-request.json",
    outputName: "subject-run",
    visibility: "subject",
  });
  const subjectResult = await readWorkbenchAdapterOperationResult(subjectOutput, "subject.run");
  await copySubjectOutputArtifacts(subjectOutput, request.paths.output);
  return subjectResult;
}
197
/**
 * Combines the usage reported by the subject run and by the scoring engine.
 *
 * - `runner` comes from `subject.runner`, falling back to `subject.total`.
 * - `engine` comes from `engine.engine`, falling back to `engine.total`.
 * - `optimizer` comes from `subject.optimizer`.
 *
 * @param {object} [subject] Usage reported by the subject run.
 * @param {object} [engine] Usage reported by the engine run.
 * @returns {object|undefined} The merged usage, or `undefined` when no field applies.
 */
function mergeNestedEngineUsage(subject, engine) {
  const merged = {};

  const runnerUsage = subject?.runner || subject?.total;
  if (runnerUsage) {
    merged.runner = runnerUsage;
  }

  const engineUsage = engine?.engine || engine?.total;
  if (engineUsage) {
    merged.engine = engineUsage;
  }

  if (subject?.optimizer) {
    merged.optimizer = subject.optimizer;
  }

  if (Object.keys(merged).length === 0) {
    return undefined;
  }
  return merged;
}
216
/**
 * Writes a derived request file for a nested adapter operation and runs the
 * adapter's shell command against it.
 *
 * When `args.outputName` is set the nested operation gets its own output
 * directory under `<parent output>/.workbench/internal/...`; otherwise it
 * shares the parent's output and result paths.
 *
 * @param {object} args
 * @param {object} args.parent Parent adapter operation request.
 * @param {object} args.invocation Nested invocation (`use`, `with`, optional `auth`).
 * @param {string} args.operation Operation name for the nested request.
 * @param {string} args.command Shell command to execute.
 * @param {string} args.requestName File name of the nested request JSON.
 * @param {string} [args.outputName] Name of the isolated output slot.
 * @param {string} [args.visibility] "subject" drops `paths.enginePrivate` from the nested request.
 * @returns {Promise<string>} The output directory used by the nested operation.
 */
async function runNestedAdapterOperation(args) {
  const { parent, invocation } = args;
  const usesOwnOutput = Boolean(args.outputName);

  const internalRoot = path.join(
    parent.paths.output,
    ".workbench",
    "internal",
    args.outputName ?? "engine-slot",
    safeInternalPathSegment(parent.id),
  );
  const output = usesOwnOutput ? path.join(internalRoot, "output") : parent.paths.output;
  const result = usesOwnOutput ? workbenchAdapterOperationResultPath(output) : parent.paths.result;
  const requestPath = path.join(internalRoot, args.requestName);

  await fs.mkdir(path.dirname(requestPath), { recursive: true });
  await fs.mkdir(output, { recursive: true });

  const nestedPaths = { ...parent.paths, output, result };
  if (args.visibility === "subject") {
    // The subject must not be told where the engine-private files live.
    delete nestedPaths.enginePrivate;
  }

  // Start from a copy of the parent request and override the nested fields.
  const nestedRequest = { ...parent };
  nestedRequest.id = `${parent.id}:${invocation.use}:${args.operation}`;
  nestedRequest.operation = args.operation;
  nestedRequest.invocation = {
    use: invocation.use,
    with: invocation.with,
    ...(invocation.auth !== undefined ? { auth: invocation.auth } : {}),
  };
  if (parent.auth !== undefined) {
    nestedRequest.auth = adapterScopedAuth(parent.auth, invocation.use);
  }
  nestedRequest.paths = nestedPaths;

  await fs.writeFile(requestPath, `${JSON.stringify(nestedRequest, null, 2)}\n`);
  await runAdapterShellCommand(args.command, parent.paths.cwd ?? parent.paths.workspace, {
    WORKBENCH_ADAPTER_REQUEST: requestPath,
    WORKBENCH_OUTPUT: output,
    WORKBENCH_RESULT: result,
  });
  return output;
}
254
/**
 * Copies the subject's output artifacts from `source` into `target`,
 * starting at the root of the source tree.
 *
 * @param {string} source Directory the subject wrote to.
 * @param {string} target Directory that receives the copies.
 * @returns {Promise<void>}
 */
async function copySubjectOutputArtifacts(source, target) {
  const fromRoot = "";
  await copyDirectoryEntries(source, target, fromRoot);
}
257
/**
 * Recursively copies regular files from `sourceRoot/relativeDir` to the same
 * relative location under `targetRoot`.
 *
 * Skipped: a root-level "workbench-result.json", anything below
 * ".workbench/internal/", and entries that are neither files nor directories.
 * A directory that cannot be listed is treated as empty.
 *
 * @param {string} sourceRoot Root of the tree being copied.
 * @param {string} targetRoot Root of the destination tree.
 * @param {string} relativeDir Directory (relative to both roots) to process.
 * @returns {Promise<void>}
 */
async function copyDirectoryEntries(sourceRoot, targetRoot, relativeDir) {
  const currentDir = path.join(sourceRoot, relativeDir);
  let dirents;
  try {
    dirents = await fs.readdir(currentDir, { withFileTypes: true });
  } catch {
    dirents = [];
  }

  for (const dirent of dirents) {
    const relativePath = path.join(relativeDir, dirent.name);
    const normalized = normalizeRelativePath(relativePath);
    const isResultFile = normalized === "workbench-result.json";
    const isInternal = normalized.startsWith(".workbench/internal/");
    if (isResultFile || isInternal) {
      continue;
    }

    const sourcePath = path.join(sourceRoot, relativePath);
    const targetPath = path.join(targetRoot, relativePath);
    if (dirent.isDirectory()) {
      await copyDirectoryEntries(sourceRoot, targetRoot, relativePath);
    } else if (dirent.isFile()) {
      await fs.mkdir(path.dirname(targetPath), { recursive: true });
      await fs.copyFile(sourcePath, targetPath);
    }
  }
}
279
/**
 * Returns a copy of `auth` in which `self` is replaced by the entry that
 * `auth.adapters` holds for `adapterId`, when such an entry exists.
 *
 * The copy is made with a JSON round trip, so the input is never mutated.
 * Only own properties of `adapters` are considered: an adapter id such as
 * "constructor" or "toString" must not pick up a member inherited from
 * `Object.prototype`.
 *
 * @param {*} auth Auth payload of the parent request.
 * @param {string} adapterId Adapter whose scoped auth should become `self`.
 * @returns {*} `auth` itself when it is not a plain object (null, primitive
 *   or array), otherwise the scoped copy.
 */
function adapterScopedAuth(auth, adapterId) {
  if (!auth || typeof auth !== "object" || Array.isArray(auth)) {
    return auth;
  }
  const record = JSON.parse(JSON.stringify(auth));
  const adapters = record.adapters;
  const isAdapterMap = Boolean(adapters) && typeof adapters === "object" && !Array.isArray(adapters);
  if (isAdapterMap && Object.prototype.hasOwnProperty.call(adapters, adapterId)) {
    const scoped = adapters[adapterId];
    if (scoped !== undefined) {
      record.self = scoped;
    }
  }
  return record;
}
293
/**
 * Turns an arbitrary id into a single, safe path segment.
 *
 * Runs of characters outside `[A-Za-z0-9._-]` become "_", and leading or
 * trailing underscores are trimmed. The result is joined into a directory
 * path by the caller, so the special segments "." and ".." are rejected too;
 * otherwise they would resolve to the parent directories instead of naming
 * a new one.
 *
 * @param {string} value Id to sanitise.
 * @returns {string} The sanitised segment, or "nested" when nothing usable remains.
 */
function safeInternalPathSegment(value) {
  const safe = value.replace(/[^a-z0-9._-]+/giu, "_").replace(/^_+|_+$/gu, "");
  if (safe === "" || safe === "." || safe === "..") {
    return "nested";
  }
  return safe;
}
297
/**
 * Runs the shell command configured for the "command" adapter and then makes
 * sure an operation result exists.
 *
 * For "engine.run" the command itself must have written a valid result; for
 * every other operation a default result is written when none is present.
 *
 * @param {object} request Adapter operation request.
 * @returns {Promise<void>}
 */
async function executeCommandAdapterRequest(request) {
  const shellCommand = requiredAdapterCommandString(request, "command");
  const workingDir = request.paths.cwd ?? request.paths.workspace;
  await runAdapterShellCommand(shellCommand, workingDir);

  const finish = request.operation === "engine.run"
    ? requireCommandScoreResult
    : writeOperationOkUnlessPresent;
  await finish(request);
}
306
/**
 * Verifies that a command engine produced a readable "engine.run" result.
 *
 * @param {object} request Adapter operation request.
 * @returns {Promise<void>}
 * @throws {Error} When the result file is missing or cannot be read as an
 *   "engine.run" result.
 */
async function requireCommandScoreResult(request) {
  const resultPath = workbenchAdapterOperationResultPath(request.paths.output);
  const hasResult = await fileExists(resultPath);
  if (!hasResult) {
    throw new Error("Command engine must write workbench-result.json for engine.run.");
  }

  const pendingRead = readWorkbenchAdapterOperationResult(request.paths.output, "engine.run");
  try {
    await pendingRead;
  } catch (error) {
    const reason = error instanceof Error ? error.message : String(error);
    throw new Error(`Command engine wrote an invalid workbench-result.json for engine.run: ${reason}`);
  }
}
314
/**
 * Handles "engine.run" for the "tests" adapter: runs `test.sh` (or `run.sh`)
 * from the engine-private directory and converts the reward file it leaves
 * under `<logs>/verifier` into the operation result.
 *
 * @param {object} request Adapter operation request.
 * @returns {Promise<void>}
 * @throws {Error} When the operation is not "engine.run", a required path is
 *   missing, or no test script exists.
 */
async function executeTestsEngineRequest(request) {
  if (request.operation !== "engine.run") {
    throw new Error(`Tests adapter cannot handle ${request.operation}.`);
  }
  const { paths } = request;
  const testsRoot = requiredRequestPath(paths.enginePrivate, "paths.enginePrivate");
  const logsRoot = requiredRequestPath(paths.logs, "paths.logs");
  await fs.mkdir(path.join(logsRoot, "verifier"), { recursive: true });

  // test.sh takes precedence over run.sh.
  const candidates = ["test.sh", "run.sh"].map((name) => path.join(testsRoot, name));
  const script = await firstExistingFile(candidates);
  if (!script) {
    throw new Error(`Tests engine requires ${candidates[0]}.`);
  }

  await runAdapterShellCommand(`sh ${shellQuote(script)}`, paths.cwd ?? paths.workspace);
  const testsResult = await readTestsResult({
    logsRoot,
    caseId: request.context?.attempt?.caseId ?? "current",
  });

  const operationResult = {
    protocol: "workbench.adapter-result.v1",
    operation: "engine.run",
    ok: true,
    value: testsResult,
  };
  if (typeof testsResult.summary === "string") {
    operationResult.summary = testsResult.summary;
  }
  operationResult.feedback = { engine: "tests" };
  await writeWorkbenchAdapterOperationResult(paths.output, operationResult);
}
345
/**
 * Runs `command` through `sh -c` with inherited stdio and waits for it to exit.
 *
 * @param {string} command Shell command line.
 * @param {string} cwd Working directory for the child process.
 * @param {object} [env] Extra environment variables layered over `process.env`.
 * @returns {Promise<void>} Resolves when the command exits with status 0.
 * @throws {Error} When the process cannot be spawned, exits with a non-zero
 *   status, or is terminated by a signal.
 */
async function runAdapterShellCommand(command, cwd, env = {}) {
  const childEnv = { ...process.env, ...env };
  await new Promise((resolve, reject) => {
    const child = spawn("sh", ["-c", command], { cwd, env: childEnv, stdio: "inherit" });
    child.on("error", reject);
    child.on("exit", (code, signal) => {
      if (code === 0) {
        resolve();
        return;
      }
      // A null exit code means the process was ended by a signal.
      const reason = code === null
        ? `Command adapter exited from signal ${signal ?? "unknown"}.`
        : `Command adapter exited with status ${code}.`;
      reject(new Error(reason));
    });
  });
}
367
/**
 * Writes a successful operation result unless one already exists in the
 * output directory.
 *
 * For "optimizer.improve" the result value is the subject patch computed
 * between `paths.subject` and the working directory.
 *
 * @param {object} request Adapter operation request.
 * @returns {Promise<void>}
 */
async function writeOperationOkUnlessPresent(request) {
  const alreadyWritten = await fileExists(workbenchAdapterOperationResultPath(request.paths.output));
  if (alreadyWritten) {
    return;
  }

  const okResult = {
    protocol: "workbench.adapter-result.v1",
    operation: request.operation,
    ok: true,
  };
  if (request.operation === "optimizer.improve") {
    okResult.value = await createSubjectPatchFromWorkspace({
      beforeRoot: requiredRequestPath(request.paths.subject, "paths.subject"),
      afterRoot: request.paths.cwd ?? request.paths.workspace,
      edits: request.context?.optimizer?.edits ?? [],
    });
  }
  await writeWorkbenchAdapterOperationResult(request.paths.output, okResult);
}
391
/**
 * Returns the first path in `files` that refers to an existing regular file.
 *
 * Candidates are checked one after another, in order; paths that cannot be
 * stat'ed are skipped.
 *
 * @param {Iterable<string>} files Candidate paths.
 * @returns {Promise<string|null>} The first match, or `null` when there is none.
 */
async function firstExistingFile(files) {
  for (const candidate of files) {
    let info;
    try {
      info = await fs.stat(candidate);
    } catch {
      info = null;
    }
    if (info?.isFile()) {
      return candidate;
    }
  }
  return null;
}
400
/**
 * Returns `value` when it is truthy and throws otherwise.
 *
 * @param {*} value Path taken from the adapter request.
 * @param {string} label Name of the field, used in the error message.
 * @returns {*} `value`, unchanged.
 * @throws {Error} When `value` is falsy.
 */
function requiredRequestPath(value, label) {
  if (value) {
    return value;
  }
  throw new Error(`Adapter request ${label} is required.`);
}
406
/**
 * Reads one engine case per task directory found under `tasksRoot`.
 * The case id is the task directory's base name.
 *
 * @param {string} tasksRoot Directory containing the task packages.
 * @returns {Promise<Array<object>>} The cases, in task-directory order.
 * @throws {Error} When no task packages are found.
 */
async function readEngineCasesFromWorkbenchTaskRoot(tasksRoot) {
  const taskDirs = await listWorkbenchTaskDirectories(tasksRoot);
  if (taskDirs.length === 0) {
    throw new Error(`Engine resolve has no Workbench task packages: ${tasksRoot}`);
  }
  const pendingCases = taskDirs.map(async (taskDir) => {
    const id = path.basename(taskDir);
    return readWorkbenchEngineCase({ taskDir, id });
  });
  return await Promise.all(pendingCases);
}
416
/**
 * Lists the immediate sub-directories of `root` that contain a task control
 * file, sorted with `localeCompare`.
 *
 * @param {string} root Tasks root directory.
 * @returns {Promise<string[]>} Paths of the task directories.
 * @throws {Error} When `root` itself contains a task control file.
 */
async function listWorkbenchTaskDirectories(root) {
  const rootIsTask = await fileExists(path.join(root, TASK_CONTROL_FILE));
  if (rootIsTask) {
    throw new Error(`Workbench engine tasks root must contain task directories, not a direct ${TASK_CONTROL_FILE}: ${root}`);
  }

  const dirents = await fs.readdir(root, { withFileTypes: true });
  const taskDirs = [];
  for (const dirent of dirents.filter((candidate) => candidate.isDirectory())) {
    const candidateDir = path.join(root, dirent.name);
    const hasControlFile = await fileExists(path.join(candidateDir, TASK_CONTROL_FILE));
    if (hasControlFile) {
      taskDirs.push(candidateDir);
    }
  }
  taskDirs.sort((left, right) => left.localeCompare(right));
  return taskDirs;
}
433
/**
 * Reads a single task package and converts it into an engine case.
 *
 * The task control file must be version 3 with a non-blank `task` string and
 * may only use the fields version, task, files, tests, solution and
 * environment. Files under the "files" directory become subject-visible;
 * files under the "tests" and "solution" directories become engine-private.
 *
 * @param {object} args
 * @param {string} args.taskDir Directory of the task package.
 * @param {string} args.id Task id, used for the case id and in error messages.
 * @returns {Promise<object>} `{ id, case, files }` describing the case.
 * @throws {Error} When the control file is missing or invalid, or the
 *   package contains files outside the declared directories.
 */
async function readWorkbenchEngineCase(args) {
  const sourceFiles = await readSurfaceFilesRecursive(args.taskDir);
  const controlFile = sourceFiles.find(
    (file) => normalizeRelativePath(file.path) === TASK_CONTROL_FILE && file.encoding === "utf8",
  );
  if (!controlFile) {
    throw new Error(`Task ${args.id} is missing ${TASK_CONTROL_FILE}.`);
  }

  const taskRecord = jsonRecord(YAML.parse(controlFile.content));
  if (taskRecord.version !== 3) {
    throw new Error(`Task ${args.id} ${TASK_CONTROL_FILE} version must be 3.`);
  }
  if (typeof taskRecord.task !== "string" || taskRecord.task.trim().length === 0) {
    throw new Error(`Task ${args.id} ${TASK_CONTROL_FILE} must include a task string.`);
  }

  const supportedFields = new Set(["version", "task", "files", "tests", "solution", "environment"]);
  const unsupportedTaskFields = Object.keys(taskRecord).filter((key) => !supportedFields.has(key));
  if (unsupportedTaskFields.length > 0) {
    const plural = unsupportedTaskFields.length === 1 ? "" : "s";
    throw new Error(`Task ${args.id} ${TASK_CONTROL_FILE} has unsupported field${plural}: ${unsupportedTaskFields.join(", ")}.`);
  }

  const publicPrefix = taskDirectoryPrefix(taskRecord.files, "files", args.id);
  const testsPrefix = taskDirectoryPrefix(taskRecord.tests, "tests", args.id);
  const solutionPrefix = taskDirectoryPrefix(taskRecord.solution, "solution", args.id);

  const subjectVisible = stripTaskDirectory(sourceFiles, publicPrefix);
  const enginePrivate = stripTaskDirectory(sourceFiles, testsPrefix)
    .concat(stripTaskDirectory(sourceFiles, solutionPrefix))
    .sort((left, right) => left.path.localeCompare(right.path));

  assertWorkbenchTaskPackageLayout(args.id, sourceFiles, [
    publicPrefix,
    testsPrefix,
    solutionPrefix,
    "environment/",
  ]);

  const caseSpec = { version: 3, prompt: taskRecord.task };
  if (taskRecord.environment !== undefined) {
    caseSpec.environment = taskRecord.environment;
  }
  return {
    id: normalizeRelativePath(args.id),
    case: caseSpec,
    files: {
      subjectVisible,
      enginePrivate,
      source: sourceFiles,
    },
  };
}
482
/**
 * Resolves the directory prefix (with trailing "/") for one section of a
 * task package.
 *
 * @param {*} value The section's config from the task control file, if any.
 * @param {string} fallback Directory name used when `value` is undefined.
 * @param {string} taskId Task id, used in the error message.
 * @returns {string} `<fallback>/` or the normalised configured path plus "/".
 * @throws {Error} When `value` is given without a non-blank string `path`.
 */
function taskDirectoryPrefix(value, fallback, taskId) {
  if (value === undefined) {
    return `${fallback}/`;
  }
  const configuredPath = jsonRecord(value).path;
  const isUsable = typeof configuredPath === "string" && configuredPath.trim().length > 0;
  if (!isUsable) {
    throw new Error(`Task ${taskId} ${TASK_CONTROL_FILE} path config must include a path string.`);
  }
  return `${normalizeRelativePath(configuredPath)}/`;
}
492
/**
 * Ensures every file of a task package is either the task control file or
 * lives under one of the allowed directory prefixes.
 *
 * @param {string} taskId Task id, used in the error message.
 * @param {Array<{path: string}>} files Files of the task package.
 * @param {string[]} allowedPrefixes Directory prefixes that may contain files.
 * @returns {void}
 * @throws {Error} Listing every file that falls outside the allowed layout.
 */
function assertWorkbenchTaskPackageLayout(taskId, files, allowedPrefixes) {
  const isAllowed = (filePath) => filePath === TASK_CONTROL_FILE
    || allowedPrefixes.some((prefix) => filePath.startsWith(prefix));

  const invalid = [];
  for (const file of files) {
    const filePath = normalizeRelativePath(file.path);
    if (!isAllowed(filePath)) {
      invalid.push(filePath);
    }
  }
  if (invalid.length === 0) {
    return;
  }
  const plural = invalid.length === 1 ? "" : "s";
  throw new Error(`Task ${taskId} contains unsupported file${plural} outside task.yaml or declared task directories: ${invalid.join(", ")}`);
}
501
/**
 * Selects the files whose normalised path starts with `prefix` and returns
 * copies of them with that prefix removed, sorted by the shortened path.
 *
 * @param {Array<{path: string}>} files Files of the task package.
 * @param {string} prefix Directory prefix, including the trailing "/".
 * @returns {Array<object>} New file records; the inputs are not modified.
 */
function stripTaskDirectory(files, prefix) {
  const stripped = [];
  for (const file of files) {
    const normalized = normalizeRelativePath(file.path);
    if (normalized.startsWith(prefix)) {
      stripped.push({ ...file, path: normalized.slice(prefix.length) });
    }
  }
  return stripped.sort((left, right) => left.path.localeCompare(right.path));
}
510
/**
 * Reads every regular file below `root` and returns the file records sorted
 * by relative path.
 *
 * @param {string} root Directory to read.
 * @returns {Promise<Array<object>>} Sorted file records.
 */
async function readSurfaceFilesRecursive(root) {
  const collected = [];
  await readSurfaceFilesInto(root, "", collected);
  collected.sort((left, right) => left.path.localeCompare(right.path));
  return collected;
}
515
/**
 * Walks `root/relativeDir` depth-first and appends one record per regular
 * file to `result`.
 *
 * A file whose bytes survive a UTF-8 decode/encode round trip is stored as
 * text; anything else is stored base64-encoded as binary. `executable` is
 * true when any execute bit is set in the file mode.
 *
 * @param {string} root Root directory of the walk.
 * @param {string} relativeDir Directory, relative to `root`, to process.
 * @param {Array<object>} result Array that receives the file records.
 * @returns {Promise<void>}
 */
async function readSurfaceFilesInto(root, relativeDir, result) {
  const dirents = await fs.readdir(path.join(root, relativeDir), { withFileTypes: true });
  for (const dirent of dirents) {
    const relativePath = normalizeRelativePath(path.join(relativeDir, dirent.name));
    const absolutePath = path.join(root, relativePath);
    if (dirent.isDirectory()) {
      await readSurfaceFilesInto(root, relativePath, result);
      continue;
    }
    if (!dirent.isFile()) {
      continue;
    }

    const [body, stat] = await Promise.all([
      fs.readFile(absolutePath),
      fs.stat(absolutePath),
    ]);
    const text = body.toString("utf8");
    // Invalid UTF-8 does not re-encode to the same bytes.
    const isUtf8 = Buffer.compare(Buffer.from(text, "utf8"), body) === 0;
    const payload = isUtf8
      ? { kind: "text", encoding: "utf8", content: text }
      : { kind: "binary", encoding: "base64", content: body.toString("base64") };
    result.push({
      path: relativePath,
      ...payload,
      executable: (stat.mode & 0o111) !== 0,
    });
  }
}
542
/**
 * Reports whether `filePath` refers to an existing regular file.
 * Any stat failure is reported as `false`.
 *
 * @param {string} filePath Path to check.
 * @returns {Promise<boolean>}
 */
async function fileExists(filePath) {
  try {
    const info = await fs.stat(filePath);
    return info.isFile();
  } catch {
    return false;
  }
}
545
/**
 * Reads the reward a tests run left under `<logsRoot>/verifier`.
 *
 * `reward.json` is preferred; otherwise `reward.txt` is parsed with
 * `Number.parseFloat`.
 *
 * @param {object} args
 * @param {string} args.logsRoot Logs directory of the request.
 * @param {string} args.caseId Case id recorded in the normalised result.
 * @returns {Promise<object>} The normalised tests result.
 * @throws {Error} When neither file exists or `reward.txt` is not a finite number.
 */
async function readTestsResult(args) {
  const verifierDir = path.join(args.logsRoot, "verifier");

  const rewardJson = await readOptionalJson(path.join(verifierDir, "reward.json"));
  if (rewardJson) {
    return normalizeTestsResult(rewardJson, args.caseId);
  }

  let rewardText;
  try {
    rewardText = await fs.readFile(path.join(verifierDir, "reward.txt"), "utf8");
  } catch (error) {
    if (error.code !== "ENOENT") {
      throw error;
    }
    rewardText = null;
  }
  if (rewardText === null) {
    throw new Error("Tests engine did not find reward.json or reward.txt under the request logs verifier directory.");
  }

  const score = Number.parseFloat(rewardText.trim());
  if (!Number.isFinite(score)) {
    throw new Error("Tests engine reward.txt must contain a finite numeric reward.");
  }
  return normalizeTestsResult({ reward: score }, args.caseId);
}
565
/**
 * Reads and parses a JSON file that is allowed to be absent.
 *
 * @param {string} filePath File to read.
 * @returns {Promise<object|null>} The parsed object, or `null` when the file
 *   does not exist.
 * @throws {Error} When the file cannot be read for another reason, is not
 *   valid JSON, or does not contain a JSON object.
 */
async function readOptionalJson(filePath) {
  let source;
  try {
    source = await fs.readFile(filePath, "utf8");
  } catch (error) {
    if (error.code === "ENOENT") {
      return null;
    }
    throw error;
  }

  const parsed = JSON.parse(source);
  const isPlainObject = Boolean(parsed) && typeof parsed === "object" && !Array.isArray(parsed);
  if (!isPlainObject) {
    throw new Error(`${filePath} must contain a JSON object.`);
  }
  return parsed;
}
581
/**
 * Converts a reward record into the tests engine's result value.
 *
 * The score is `record.score` when that is a number, otherwise
 * `record.reward`. The original record is kept under `feedback.reward`.
 *
 * @param {object} record Reward record.
 * @param {string} caseId Id of the case the reward belongs to.
 * @returns {object} `{ score, metrics, cases, summary?, feedback }`.
 * @throws {Error} When neither field yields a finite number.
 */
function normalizeTestsResult(record, caseId) {
  let rawScore;
  if (typeof record.score === "number") {
    rawScore = record.score;
  } else if (typeof record.reward === "number") {
    rawScore = record.reward;
  }
  if (rawScore === undefined || !Number.isFinite(rawScore)) {
    throw new Error("Tests engine reward must include a finite numeric score or reward.");
  }

  const metrics = normalizeTestsMetrics(record, rawScore);
  const normalized = {
    score: rawScore,
    metrics,
    cases: [{ id: caseId, status: "completed", metrics }],
  };
  if (typeof record.summary === "string") {
    normalized.summary = record.summary;
  }
  normalized.feedback = { reward: record };
  return normalized;
}
605
/**
 * Collects the finite numeric metrics of a reward record.
 *
 * Metrics are taken from `record.metrics` when that is a plain object,
 * otherwise from the record's own fields. A `reward` entry is an alias for
 * `score`; it is only used when the same source has no finite numeric
 * `score` of its own, so the reported score no longer depends on the order
 * in which the two keys happen to appear.
 *
 * @param {object} record Reward record.
 * @param {number} score Score already chosen for the result; used as the
 *   initial `score` metric.
 * @returns {Object<string, number>} The metrics, always including `score`.
 */
function normalizeTestsMetrics(record, score) {
  const metrics = { score };
  const hasMetricsObject = Boolean(record.metrics)
    && typeof record.metrics === "object"
    && !Array.isArray(record.metrics);
  const source = hasMetricsObject ? record.metrics : record;
  const hasExplicitScore = typeof source.score === "number" && Number.isFinite(source.score);

  for (const [key, value] of Object.entries(source)) {
    if (typeof value !== "number" || !Number.isFinite(value)) {
      continue;
    }
    if (key === "reward") {
      // The alias must never override an explicit score.
      if (!hasExplicitScore) {
        metrics.score = value;
      }
      continue;
    }
    metrics[key] = value;
  }
  return metrics;
}
617
/**
 * Wraps `value` in single quotes for use as one shell word; every embedded
 * single quote is written as `'\''`.
 *
 * @param {string} value Text to quote.
 * @returns {string} The single-quoted text.
 */
function shellQuote(value) {
  const escaped = value.split("'").join("'\\''");
  return `'${escaped}'`;
}
620
/**
 * Builds the workload description handed to agent and rubric handlers from
 * an adapter operation request, filling in defaults for missing context.
 *
 * @param {object} request Adapter operation request.
 * @returns {object} Workload with job, benchmark, subject, optimizer and
 *   attempt information; `case` is included only when the request context
 *   has a truthy case prompt.
 */
function workloadFromAdapterOperationRequest(request) {
  const context = request.context ?? {};
  const attempt = context.attempt ?? {};
  const { benchmark, subject, optimizer } = context;

  const workload = {
    job: { id: request.jobId ?? request.id },
    benchmark: {
      name: benchmark?.name ?? "",
      description: benchmark?.description ?? "",
    },
    subject: { path: subject?.path ?? "" },
    optimizer: { edits: optimizer?.edits ?? [] },
    subjectId: subject?.id ?? "",
    attemptIndex: attempt.attemptIndex ?? 0,
    sampleIndex: attempt.sampleIndex ?? 0,
    caseId: attempt.caseId ?? "",
  };
  if (context.case?.prompt) {
    workload.case = { prompt: context.case.prompt };
  }
  return workload;
}
642
/**
 * Reports whether `value` names one of the built-in agent adapters
 * ("codex", "claude" or "pi").
 *
 * @param {*} value Adapter id to test.
 * @returns {boolean}
 */
function isBuiltInAgentAdapterId(value) {
  return ["codex", "claude", "pi"].includes(value);
}
645
/**
 * Builds the agent spec for a built-in agent adapter from the request.
 *
 * @param {object} request Adapter operation request.
 * @returns {{agent: object, instructions?: string}} The agent provider plus
 *   the configured instructions when they are a non-empty string.
 */
function builtInAgentSpecFromRequest(request) {
  const config = adapterCommandConfigRecord(request);
  const spec = { agent: agentProviderFromAdapterCommandRequest(request) };
  const hasInstructions = typeof config.instructions === "string" && config.instructions.length > 0;
  if (hasInstructions) {
    spec.instructions = config.instructions;
  }
  return spec;
}
654
/**
 * Builds the rubric spec (judge, optional instructions, parallelism and
 * criteria) for the "rubric" adapter from the request.
 *
 * @param {object} request Adapter operation request.
 * @returns {{judge: object, instructions?: string, parallelism: *, criteria: *}}
 */
function builtInRubricSpecFromRequest(request) {
  const config = adapterCommandConfigRecord(request);
  const criteria = rubricCriteria(config.criteria, "adapter.with.criteria");

  const spec = { judge: rubricJudgeProviderFromAdapterCommandRequest(request) };
  const hasInstructions = typeof config.instructions === "string" && config.instructions.length > 0;
  if (hasInstructions) {
    spec.instructions = config.instructions;
  }
  spec.parallelism = rubricParallelism(config.parallelism, criteria.length);
  spec.criteria = criteria;
  return spec;
}
666
/**
 * Describes the agent provider selected by the request: the adapter id plus
 * the `model` and `effort` settings when they are non-empty strings.
 *
 * @param {object} request Adapter operation request.
 * @returns {{use: string, model?: string, effort?: string}}
 */
function agentProviderFromAdapterCommandRequest(request) {
  const config = adapterCommandConfigRecord(request);
  const provider = { use: request.invocation.use };
  for (const key of ["model", "effort"]) {
    const setting = config[key];
    if (typeof setting === "string" && setting.length > 0) {
      provider[key] = setting;
    }
  }
  return provider;
}
678
/**
 * Describes the judge provider configured under `judge` in the invocation
 * config: its adapter id plus the `model` and `effort` settings from
 * `judge.with` when they are non-empty strings.
 *
 * @param {object} request Adapter operation request.
 * @returns {{use: string, model?: string, effort?: string}}
 * @throws {Error} When `judge.use` is missing or not a non-empty string.
 */
function rubricJudgeProviderFromAdapterCommandRequest(request) {
  const judge = jsonRecord(adapterCommandConfigRecord(request).judge);
  const hasUse = typeof judge?.use === "string" && judge.use.length > 0;
  if (!hasUse) {
    throw new Error("Rubric adapter requires adapter.with.judge.use.");
  }

  const config = jsonRecord(judge?.with) ?? {};
  const provider = { use: judge.use };
  for (const key of ["model", "effort"]) {
    const setting = config[key];
    if (typeof setting === "string" && setting.length > 0) {
      provider[key] = setting;
    }
  }
  return provider;
}
697
/**
 * Returns the invocation's `with` config passed through `jsonRecord`.
 *
 * @param {object} request Adapter operation request.
 * @returns {*} Whatever `jsonRecord` returns for `request.invocation.with`.
 */
function adapterCommandConfigRecord(request) {
  const rawConfig = request.invocation.with;
  return jsonRecord(rawConfig);
}
700
/**
 * Reads a mandatory string setting from the invocation config.
 *
 * @param {object} request Adapter operation request.
 * @param {string} key Name of the setting under `invocation.with`.
 * @returns {string} The configured, non-empty string.
 * @throws {Error} When the setting is missing, not a string, or empty.
 */
function requiredAdapterCommandString(request, key) {
  const setting = adapterCommandConfigRecord(request)[key];
  const isUsable = typeof setting === "string" && setting.length > 0;
  if (isUsable) {
    return setting;
  }
  throw new Error(`Adapter ${request.invocation.use} requires invocation.with.${key}.`);
}
707
/**
 * Executes one agent turn, loading the agent-turn module on demand.
 *
 * @param {*} executor Executor to use; the module's default executor is used
 *   when this is null or undefined.
 * @param {object} request Agent turn request.
 * @returns {Promise<*>} The result of `executeWorkbenchAgentTurn`.
 */
async function executeBuiltInAgentTurn(executor, request) {
  const {
    defaultWorkbenchAgentTurnExecutor: fallbackExecutor,
    executeWorkbenchAgentTurn: runTurn,
  } = await import("./agent-turn.js");
  const chosenExecutor = executor ?? fallbackExecutor;
  return await runTurn(chosenExecutor, request);
}
711
/**
 * Completes a "subject.run" operation by running an agent turn in the runner
 * role, then writing the agent's summary, its trace files and the operation
 * result into the output directory.
 *
 * @param {object} request Adapter operation request.
 * @param {object} workload Workload built from the request.
 * @param {object} subject Agent spec (`agent`, optional `instructions`).
 * @param {object} [options] `agentExecutor`, `adapterAuthRoot`,
 *   `adapterAuthRequest` and `adapterAuthEnv`, forwarded to the agent turn.
 * @returns {Promise<void>}
 * @throws {Error} When the request is not a "subject.run" operation.
 */
async function writeAgentSubjectOutput(request, workload, subject, options = {}) {
  if (request.operation !== "subject.run") {
    throw new Error("Agent subject results can only complete subject.run operations.");
  }
  const outputRoot = request.paths.output;
  const traceRoot = path.join(outputRoot, ".workbench", "internal", "agent-subject");

  const agentResult = await executeBuiltInAgentTurn(options.agentExecutor, {
    role: "runner",
    provider: subject.agent,
    adapterAuthRoot: options.adapterAuthRoot,
    adapterAuthRequest: options.adapterAuthRequest,
    adapterAuthEnv: options.adapterAuthEnv,
    workspaceRoot: request.paths.workspace,
    cwd: request.paths.cwd ?? request.paths.workspace,
    prompt: buildAgentSubjectPrompt(workload, subject),
    traceRoot,
    jobId: workload.job.id,
  });

  // Human-readable summary of the agent's output.
  const summaryPath = path.join(outputRoot, "subject-summary.md");
  await fs.mkdir(path.dirname(summaryPath), { recursive: true });
  await fs.writeFile(summaryPath, agentResult.output);

  // Structured trace of this subject run, stored next to the agent's own traces.
  const traceBody = {
    kind: "agent_subject",
    provider: subject.agent.use,
    subjectId: workload.subjectId,
    attemptIndex: workload.attemptIndex,
    sampleIndex: workload.sampleIndex,
    summary: agentResult.output,
    metadata: agentResult.metadata,
  };
  const trace = {
    path: `.workbench/traces/${workload.job.id}/subject.json`,
    kind: "text",
    encoding: "utf8",
    executable: false,
    content: `${JSON.stringify(traceBody, null, 2)}\n`,
  };
  await writeSurfaceFiles(outputRoot, [trace, ...agentResult.traceFiles]);

  const runtime = await importWorkbenchRuntime();
  const usage = runtime.assignUsageRole("runner", agentResult.usage);

  const operationResult = {
    protocol: "workbench.adapter-result.v1",
    operation: "subject.run",
    ok: true,
  };
  if (agentResult.output) {
    operationResult.summary = agentResult.output;
  }
  operationResult.feedback = {
    subject: "agent",
    agent: subject.agent.use,
    metadata: agentResult.metadata,
  };
  if (usage) {
    operationResult.usage = usage;
  }
  await writeWorkbenchAdapterOperationResult(outputRoot, operationResult);
}
762
/**
 * Assembles the prompt for an agent subject run: optional instructions,
 * fixed context notes, and the case prompt when the workload has one.
 *
 * @param {object} workload Workload; `case.prompt` is included when truthy.
 * @param {object} subject Agent spec; `instructions` is included when truthy.
 * @returns {string} The prompt lines joined with "\n".
 */
function buildAgentSubjectPrompt(workload, subject) {
  const lines = [];
  if (subject.instructions) {
    lines.push("Instructions:", subject.instructions, "");
  }
  lines.push(
    "Context:",
    "- Subject files are mounted at /workspace/input/subject.",
    "- Subject files are also present in the task working directory.",
  );
  const casePrompt = workload.case?.prompt;
  if (casePrompt) {
    lines.push("Case:", casePrompt, "");
  }
  lines.push(
    "- Public case files are mounted at /workspace/input/case.",
    "- Verifier tests are not present while you run.",
    "- Mutate the current working directory to complete the task.",
    "- You may write inspection artifacts under /workspace/output.",
  );
  return lines.join("\n");
}
775
/**
 * Completes an `optimizer.improve` request by running the optimizer agent,
 * diffing the working directory against the original subject files, and
 * writing the trace files plus the adapter operation result.
 *
 * @param {object} request - Adapter operation request; `operation` and `paths` are read.
 * @param {object} workload - Workload; `job.id`, `subjectId`, `attemptIndex` and `optimizer.edits` are read.
 * @param {object} optimizer - Optimizer configuration; `optimizer.agent` is passed as the agent provider.
 * @param {object} options - Agent executor and adapter auth settings forwarded to the agent turn.
 * @returns {Promise<void>}
 * @throws {Error} When the request is not `optimizer.improve`, or when the agent
 *   changed no subject file covered by the optimizer edits.
 */
async function writeAgentSubjectRevisionOutput(request, workload, optimizer, options) {
    if (request.operation !== "optimizer.improve") {
        throw new Error("Agent subject revision results can only complete optimizer.improve operations.");
    }
    const { paths } = request;
    const workingRoot = paths.cwd ?? paths.workspace;
    const agentTurn = {
        role: "optimizer",
        provider: optimizer.agent,
        adapterAuthRoot: options.adapterAuthRoot,
        adapterAuthRequest: options.adapterAuthRequest,
        adapterAuthEnv: options.adapterAuthEnv,
        workspaceRoot: paths.workspace,
        cwd: workingRoot,
        prompt: buildAgentOptimizerPrompt(workload),
        traceRoot: path.join(paths.output, ".workbench", "internal", "agent-optimizer"),
        jobId: workload.job.id,
    };
    const agentResult = await executeBuiltInAgentTurn(options.agentExecutor, agentTurn);

    // Diff the agent-mutated working directory against the pristine subject files.
    const beforeRoot = requiredRequestPath(paths.subject, "paths.subject");
    const editablePaths = workload.optimizer.edits;
    const subjectPatch = await createSubjectPatchFromWorkspace({
        beforeRoot,
        afterRoot: workingRoot,
        edits: editablePaths,
    });
    const changedSubjectPaths = subjectPatch.fileChanges.filter((filePath) => isSubjectEditPath(filePath, editablePaths));
    if (changedSubjectPaths.length === 0) {
        throw new Error("Agent improve adapter completed without changing a subject file covered by optimizer edits.");
    }

    const traceBody = {
        kind: "agent_optimizer",
        provider: optimizer.agent.use,
        subjectId: workload.subjectId,
        attemptIndex: workload.attemptIndex,
        changedPaths: changedSubjectPaths,
        summary: agentResult.output,
        metadata: agentResult.metadata,
    };
    const trace = {
        path: `.workbench/traces/${workload.job.id}/optimizer.json`,
        kind: "text",
        encoding: "utf8",
        executable: false,
        content: `${JSON.stringify(traceBody, null, 2)}\n`,
    };
    await writeSurfaceFiles(paths.output, [trace, ...agentResult.traceFiles]);

    const runtime = await importWorkbenchRuntime();
    const usage = runtime.assignUsageRole("optimizer", agentResult.usage);

    // Keys are added in the same order the result has always been written in.
    const operationResult = {
        protocol: "workbench.adapter-result.v1",
        operation: "optimizer.improve",
        ok: true,
        value: {
            ...subjectPatch,
            fileChanges: changedSubjectPaths,
        },
    };
    if (agentResult.output) {
        operationResult.summary = agentResult.output;
    }
    operationResult.feedback = {
        optimizer: optimizer.agent.use,
        changedPaths: changedSubjectPaths,
        metadata: agentResult.metadata,
    };
    if (usage) {
        operationResult.usage = usage;
    }
    await writeWorkbenchAdapterOperationResult(paths.output, operationResult);
}
836
/**
 * Builds the prompt for the optimizer agent.
 *
 * The prompt has four sections (Benchmark, Context, Editable subject paths,
 * Output) separated by a single blank line.
 *
 * @param {object} workload - Workload; `benchmark.description`, `benchmark.name`
 *   and `optimizer.edits` are read. The name is used when the description is falsy.
 * @returns {string} The prompt text.
 */
function buildAgentOptimizerPrompt(workload) {
    const { benchmark, optimizer } = workload;
    const editableList = optimizer.edits.map((entry) => `- ${entry}`).join("\n");
    const sections = [
        [
            "Benchmark:",
            benchmark.description || benchmark.name,
        ],
        [
            "Context:",
            "- Subject files are mounted at /workspace/input/subject.",
            "- Subject files are also present in the current working directory.",
            "- Prior run traces are mounted at /workspace/input/traces.",
            "- Use /workspace/input/traces as the source of truth for what happened in prior attempts.",
            "- Do not mutate /workspace/input.",
        ],
        [
            "Editable subject paths:",
            editableList,
        ],
        [
            "Output:",
            "- Mutate the editable subject files directly in the current working directory.",
            "- Include at least one changed subject file covered by the optimizer edits list.",
        ],
    ];
    // Joining sections with a blank line reproduces the "" separators between them.
    return sections.map((section) => section.join("\n")).join("\n\n");
}
856
/**
 * Judges every rubric criterion (bounded by `engine.parallelism`), aggregates
 * the per-criterion results, and writes the `engine.run` adapter result.
 *
 * @param {object} request - Adapter operation request; `paths.output` receives the result.
 * @param {object} workload - Workload forwarded to each criterion judge.
 * @param {object} engine - Rubric engine config; `criteria`, `parallelism` and `judge.use` are read.
 * @param {object} [options] - Agent executor and adapter auth settings forwarded to each judge run.
 * @returns {Promise<void>}
 */
async function writeRubricJudgeResult(request, workload, engine, options = {}) {
    const { agentExecutor, adapterAuthRoot, adapterAuthRequest, adapterAuthEnv } = options;
    const runtime = await importWorkbenchRuntime();

    const judgeCriterion = (criterion) => runRubricCriterionJudge({
        request,
        workload,
        engine,
        criterion,
        agentExecutor,
        adapterAuthRoot,
        adapterAuthRequest,
        adapterAuthEnv,
        runtime,
    });
    const criterionRuns = await mapWithConcurrency(engine.criteria, engine.parallelism, judgeCriterion);

    const usage = runtime.mergeUsageSummaries(criterionRuns.map((run) => run.usage));
    const result = rubricJudgeResultFromCriteria({
        workload,
        engine,
        criterionRuns,
    });

    // Per-criterion bookkeeping reported alongside the aggregate value.
    const criteriaFeedback = criterionRuns.map((run) => {
        const entry = {
            id: run.result.criterion_id,
            traceRoot: run.traceRoot,
            metadata: run.metadata,
        };
        if (run.repair) {
            entry.repair = run.repair;
        }
        return entry;
    });

    const operationResult = {
        protocol: "workbench.adapter-result.v1",
        operation: "engine.run",
        ok: true,
        value: result,
    };
    if (typeof result.summary === "string") {
        operationResult.summary = result.summary;
    }
    operationResult.feedback = {
        rubric: "criterion-fanout",
        judge: engine.judge.use,
        parallelism: engine.parallelism,
        aggregation: "weighted_mean",
        criteria: criteriaFeedback,
    };
    if (usage) {
        operationResult.usage = usage;
    }
    await writeWorkbenchAdapterOperationResult(request.paths.output, operationResult);
}
897
/**
 * Runs the judge agent for a single rubric criterion and normalizes its output.
 *
 * When the first response cannot be normalized, one "repair" turn is run that
 * asks the judge to convert its previous response into valid JSON. A failure
 * to normalize the repaired response propagates to the caller.
 *
 * @param {object} args - `request`, `workload`, `engine`, `criterion`, `runtime`,
 *   `agentExecutor` and the adapter auth settings.
 * @returns {Promise<object>} The normalized criterion result plus `metadata`,
 *   `traceRoot`, optional `repair` details and optional `usage`.
 */
async function runRubricCriterionJudge(args) {
    const { request, workload, engine, criterion, runtime } = args;
    const traceRoot = path.join(request.paths.output, ".workbench", "internal", "rubric", safeInternalPathSegment(criterion.id));

    // Both the judge turn and the repair turn differ only in prompt and trace folder.
    const runJudgeTurn = (prompt, traceFolder) => executeBuiltInAgentTurn(args.agentExecutor, {
        role: "engine",
        provider: engine.judge,
        adapterAuthRoot: args.adapterAuthRoot,
        adapterAuthRequest: args.adapterAuthRequest,
        adapterAuthEnv: args.adapterAuthEnv,
        workspaceRoot: request.paths.workspace,
        cwd: request.paths.cwd ?? request.paths.workspace,
        prompt,
        traceRoot: path.join(traceRoot, traceFolder),
        jobId: workload.job.id,
    });

    const agentResult = await runJudgeTurn(buildRubricCriterionJudgePrompt(workload, engine, criterion), "judge");
    const judgeUsage = runtime.assignUsageRole("engine", agentResult.usage);

    let firstPass;
    let repairError;
    try {
        firstPass = normalizeRubricCriterionJudgeResult(agentResult.output, criterion);
    }
    catch (error) {
        repairError = error instanceof Error ? error.message : String(error);
    }
    if (firstPass !== undefined) {
        return {
            ...firstPass,
            metadata: agentResult.metadata,
            traceRoot,
            ...(judgeUsage ? { usage: judgeUsage } : {}),
        };
    }

    const repairPrompt = buildRubricCriterionRepairPrompt({
        output: agentResult.output,
        error: repairError,
        criterion,
    });
    const repairResult = await runJudgeTurn(repairPrompt, "repair");
    const combinedUsage = runtime.mergeUsageSummaries([
        judgeUsage,
        runtime.assignUsageRole("engine", repairResult.usage),
    ]);
    const repaired = normalizeRubricCriterionJudgeResult(repairResult.output, criterion);
    return {
        ...repaired,
        metadata: {
            ...repairResult.metadata,
            repair: {
                attempted: true,
                originalError: repairError,
                originalMetadata: agentResult.metadata,
            },
        },
        traceRoot,
        repair: {
            attempted: true,
            originalError: repairError,
        },
        ...(combinedUsage ? { usage: combinedUsage } : {}),
    };
}
961
/**
 * Builds the judge prompt for scoring exactly one rubric criterion.
 *
 * @param {object} workload - Workload; must carry `case` (checked via requireWorkloadTask).
 * @param {object} engine - Rubric engine config; only `engine.instructions` is read.
 * @param {object} criterion - Criterion to score; serialized into the prompt.
 * @returns {string} The prompt text.
 * @throws {Error} When the workload has no case.
 */
function buildRubricCriterionJudgePrompt(workload, engine, criterion) {
    requireWorkloadTask(workload, "Rubric judge");
    const exampleResponse = {
        criterion_id: criterion.id,
        score: 0.0,
        pass: false,
        rationale: "why this criterion received this score",
        summary: "short grading summary",
        feedback: {},
    };
    const lines = [];
    if (engine.instructions) {
        lines.push("Instructions:", engine.instructions, "");
    }
    if (workload.case?.prompt) {
        lines.push("Case:", workload.case.prompt, "");
    }
    lines.push(
        "Criterion:",
        JSON.stringify(criterion, null, 2),
        "",
        "Context:",
        "- The subject already ran in this same working directory.",
        "- Subject outputs are available in the current working directory.",
        "- Public case files are mounted at /workspace/input/case.",
        "- Verifier-private files are mounted at /workspace/private/engine when the task provides them.",
        "- Score only from the current working directory, public case files, verifier-private files, and the criterion above.",
        "",
        "Output:",
        "Return only a JSON object. Do not wrap it in Markdown.",
        "The JSON object must score exactly this one criterion. Use this shape:",
        JSON.stringify(exampleResponse, null, 2),
        `The only allowed criterion_id is ${criterion.id}.`,
        "The rationale must be non-empty and specific to this criterion.",
    );
    return lines.join("\n");
}
991
/**
 * Builds the follow-up prompt asking the judge to convert a rejected response
 * into one valid JSON object for the given criterion.
 *
 * @param {object} input - `output` (the rejected response text), `error` (the
 *   parser error message) and `criterion` (the criterion being scored).
 * @returns {string} The repair prompt text.
 */
function buildRubricCriterionRepairPrompt(input) {
    const { criterion, error, output } = input;
    const requiredShape = {
        criterion_id: criterion.id,
        score: 0.0,
        pass: false,
        rationale: "why this criterion received this score",
        summary: "short grading summary",
        feedback: {},
    };
    const lines = [];
    lines.push("The previous Workbench rubric criterion judge response was rejected by the result parser.", "");
    lines.push(`Parser error: ${error}`, "");
    lines.push(
        "Convert the previous response into one valid JSON object. Return only JSON, with no Markdown.",
        "Preserve the prior score, rationale, and feedback whenever they are present.",
        "If the previous response uses clear qualitative scoring, convert only these terms: perfect/full pass/pass = 1, fail/no credit = 0, partial = 0.5.",
        "If the required score is still not recoverable from the previous response, use score 0, pass false, and rationale \"The judge response did not provide a recoverable score and rationale for this criterion.\"",
        "Do not invent file paths, log paths, or extra criterion ids.",
        "",
    );
    lines.push("Criterion:", JSON.stringify(criterion, null, 2), "");
    lines.push("Required JSON shape:", JSON.stringify(requiredShape, null, 2), "");
    lines.push(`The only allowed criterion_id is ${criterion.id}.`, "");
    // The rejected response goes last so the judge sees it right after the instructions.
    lines.push("Previous response:", output);
    return lines.join("\n");
}
1022
/**
 * Aggregates per-criterion judge runs into the rubric engine result.
 *
 * @param {object} args - `workload`, `engine` (reads `criteria`, `judge.use`,
 *   `parallelism`) and `criterionRuns` (the outputs of the criterion judge runs).
 * @returns {object} `{ score, metrics, summary, cases, feedback }` where `score`
 *   is the weighted mean of the criterion scores.
 * @throws {Error} When the aggregated score is not a number in the 0..1 range.
 */
function rubricJudgeResultFromCriteria(args) {
    const { workload, engine, criterionRuns } = args;
    const criteria = criterionRuns.map((run) => run.result);
    const score = weightedCriteriaScore(criteria, engine.criteria);
    if (!isBoundedScore(score)) {
        throw new Error("Rubric criterion scores must aggregate to a score in the 0..1 range.");
    }

    // Aggregate score first, then one metric per criterion.
    const metrics = { score };
    criteria.forEach((criterion) => {
        metrics[`criterion__${criterion.criterion_id}`] = criterion.score;
    });

    const caseResult = rubricJudgeCaseResult({
        workload,
        score,
        criteria,
    });
    const passed = criteria.reduce((count, criterion) => (criterion.pass ? count + 1 : count), 0);

    const criteriaFeedback = criterionRuns.map((run) => {
        const entry = {
            id: run.result.criterion_id,
            score: run.result.score,
            pass: run.result.pass,
        };
        if (run.summary) {
            entry.summary = run.summary;
        }
        if (run.feedback !== undefined) {
            entry.feedback = run.feedback;
        }
        entry.metadata = run.metadata;
        if (run.repair) {
            entry.repair = run.repair;
        }
        return entry;
    });

    return {
        score,
        metrics,
        summary: `Rubric judged ${criteria.length} criteria (${passed} passed).`,
        cases: [caseResult],
        feedback: {
            judge: engine.judge.use,
            rubric: {
                parallelism: engine.parallelism,
                aggregation: "weighted_mean",
                criteria: criteriaFeedback,
            },
        },
    };
}
1061
/**
 * Parses raw judge output and normalizes it into a criterion result.
 *
 * @param {string} output - Raw text returned by the judge agent.
 * @param {object} criterion - The criterion that was judged.
 * @returns {object} `{ result }` plus `summary` (when the parsed object has a
 *   string summary) and `feedback` (when the parsed object defines one).
 * @throws {Error} When the output is not a JSON object or fails validation.
 */
function normalizeRubricCriterionJudgeResult(output, criterion) {
    const parsed = parseAgentJsonObject(output, "Rubric judge");
    const result = normalizeRubricCriterionObject(parsed, criterion);
    if (!isBoundedScore(result.score)) {
        throw new Error("Rubric criterion judge output must include a score in the 0..1 range.");
    }
    const normalized = { result };
    if (typeof parsed.summary === "string") {
        normalized.summary = parsed.summary;
    }
    if (parsed.feedback !== undefined) {
        normalized.feedback = parsed.feedback;
    }
    return normalized;
}
1074
/**
 * Builds the single completed case entry reported by the rubric judge.
 *
 * @param {object} args - `workload` (reads `caseId`), `score` and `criteria`.
 * @returns {object} `{ id, status: "completed", metrics: { score }, criteria }`.
 */
function rubricJudgeCaseResult(args) {
    const { workload, score, criteria } = args;
    const metrics = { score };
    return {
        id: workload.caseId,
        status: "completed",
        metrics,
        criteria,
    };
}
1082
/**
 * Returns the first non-blank string found under the keys `rationale`,
 * `feedback`, `reason`, `explanation` (checked in that order), trimmed.
 *
 * @param {object} record - Parsed judge output.
 * @returns {string|undefined} The trimmed rationale, or undefined when none qualifies.
 */
function readCriterionRationale(record) {
    const isNonBlankString = (value) => typeof value === "string" && value.trim().length > 0;
    const matchingKey = ["rationale", "feedback", "reason", "explanation"]
        .find((key) => isNonBlankString(record[key]));
    return matchingKey === undefined ? undefined : record[matchingKey].trim();
}
1091
/**
 * Validates a parsed judge object against the expected criterion and returns
 * the normalized criterion result.
 *
 * @param {object} record - Parsed judge output.
 * @param {object} criterion - Expected criterion; `criterion.id` must match `record.criterion_id`.
 * @returns {object} `{ criterion_id, label, score, pass, rationale }`. `label`
 *   falls back to the criterion id; `pass` falls back to `score >= 0.5` when
 *   the record carries no boolean `pass`.
 * @throws {Error} On a mismatched criterion id, an out-of-range score, or a missing rationale.
 */
function normalizeRubricCriterionObject(record, criterion) {
    const expectedId = criterion.id;
    const reportedId = typeof record.criterion_id === "string" ? record.criterion_id : "";
    if (reportedId !== expectedId) {
        throw new Error(`Rubric criterion judge output must use criterion_id ${expectedId}.`);
    }
    const { score } = record;
    if (!isBoundedScore(score)) {
        throw new Error(`Rubric criterion ${expectedId} output must include a score in the 0..1 range.`);
    }
    const rationale = readCriterionRationale(record);
    if (!rationale) {
        throw new Error(`Rubric criterion ${expectedId} output must include a non-empty rationale.`);
    }
    const hasLabel = typeof record.label === "string" && record.label.length > 0;
    const hasExplicitPass = typeof record.pass === "boolean";
    return {
        criterion_id: expectedId,
        label: hasLabel ? record.label : expectedId,
        score,
        pass: hasExplicitPass ? record.pass : score >= 0.5,
        rationale,
    };
}
1113
/**
 * Validates the rubric criteria list from the spec and returns a normalized copy.
 *
 * @param {unknown} value - Candidate criteria list.
 * @param {string} label - Spec path used in error messages.
 * @returns {Array<object>} Entries of `{ id, description }`, with `weight`
 *   included only when the source entry has a numeric weight.
 * @throws {Error} When `value` is not an array, or an entry lacks a non-empty
 *   string id or description, or repeats an id.
 */
function rubricCriteria(value, label) {
    if (!Array.isArray(value)) {
        throw new Error(`${label} must be an array.`);
    }
    const knownIds = new Set();
    return value.map((entry, index) => {
        const { id, description, weight } = jsonRecord(entry);
        const where = `${label}[${index}]`;
        if (typeof id !== "string" || id.length === 0) {
            throw new Error(`Spec must include ${where}.id.`);
        }
        if (knownIds.has(id)) {
            throw new Error(`${where}.id duplicates another rubric criterion id.`);
        }
        knownIds.add(id);
        if (typeof description !== "string" || description.length === 0) {
            throw new Error(`Spec must include ${where}.description.`);
        }
        const normalized = { id, description };
        if (typeof weight === "number") {
            normalized.weight = weight;
        }
        return normalized;
    });
}
1139
/**
 * Resolves how many criterion judges may run at once.
 *
 * @param {number|undefined} value - Configured parallelism; undefined selects
 *   DEFAULT_RUBRIC_PARALLELISM.
 * @param {number} criterionCount - Number of rubric criteria.
 * @returns {number} 1 when there are no criteria; otherwise the requested (or
 *   default) parallelism capped at `criterionCount`.
 * @throws {Error} When `value` is defined but not a positive integer.
 */
function rubricParallelism(value, criterionCount) {
    if (criterionCount <= 0) {
        return 1;
    }
    const isPositiveInteger = typeof value === "number" && Number.isInteger(value) && value > 0;
    if (value !== undefined && !isPositiveInteger) {
        throw new Error("adapter.with.parallelism must be a positive integer.");
    }
    const requested = value === undefined ? DEFAULT_RUBRIC_PARALLELISM : value;
    return Math.min(requested, criterionCount);
}
1151
/**
 * Maps `inputs` through an async `mapper` with at most `concurrency` calls in
 * flight, preserving input order in the returned array.
 *
 * A `concurrency` that does not coerce to a number (undefined, NaN, ...) falls
 * back to 1. Previously such a value produced a NaN worker count, so no worker
 * was started and a sparse array was returned without the mapper ever running.
 *
 * @param {Array} inputs - Items to map.
 * @param {number} concurrency - Maximum number of concurrent mapper calls;
 *   clamped to the range 1..inputs.length.
 * @param {(input: any, index: number) => any} mapper - Called once per item.
 * @returns {Promise<Array>} Results in the same order as `inputs`.
 *   Rejects with the first mapper rejection.
 */
async function mapWithConcurrency(inputs, concurrency, mapper) {
    const results = new Array(inputs.length);
    const requested = Number(concurrency);
    const limit = Number.isNaN(requested)
        ? 1
        : Math.max(1, Math.floor(Math.min(requested, inputs.length || 1)));
    let nextIndex = 0;
    // Each worker claims the next unclaimed index; claiming is synchronous, so
    // no two workers ever process the same item.
    const worker = async () => {
        while (nextIndex < inputs.length) {
            const index = nextIndex;
            nextIndex += 1;
            results[index] = await mapper(inputs[index], index);
        }
    };
    await Promise.all(Array.from({ length: limit }, () => worker()));
    return results;
}
1165
/**
 * Asserts that the workload carries a case.
 *
 * @param {object} workload - Workload to check; only `workload.case` is read.
 * @param {string} label - Prefix used in the error message.
 * @throws {Error} When `workload.case` is falsy.
 */
function requireWorkloadTask(workload, label) {
    if (workload.case) {
        return;
    }
    throw new Error(`${label} workload is missing case text.`);
}
1170
/**
 * Computes which editable subject files differ between two directory trees.
 *
 * Only files present under `afterRoot` are considered, so files that exist
 * only under `beforeRoot` are not reported.
 *
 * @param {object} args - `beforeRoot` (original subject tree), `afterRoot`
 *   (tree after the agent ran) and `edits` (editable subject paths).
 * @returns {Promise<object>} `{ files, fileChanges }`: the changed files sorted
 *   by path, and their paths.
 */
async function createSubjectPatchFromWorkspace(args) {
    const originals = new Map();
    for (const file of await readSurfaceFilesRecursive(args.beforeRoot)) {
        originals.set(normalizeRelativePath(file.path), file);
    }
    const candidates = (await readSurfaceFilesRecursive(args.afterRoot))
        .map((file) => ({ ...file, path: normalizeRelativePath(file.path) }));
    const changedFiles = [];
    for (const candidate of candidates) {
        if (!isSubjectEditPath(candidate.path, args.edits)) {
            continue;
        }
        if (isRuntimeWorkspacePath(candidate.path)) {
            continue;
        }
        if (sameSurfaceFile(originals.get(candidate.path), candidate)) {
            continue;
        }
        changedFiles.push(candidate);
    }
    changedFiles.sort((left, right) => left.path.localeCompare(right.path));
    return {
        files: changedFiles,
        fileChanges: changedFiles.map((file) => file.path),
    };
}
1184
/**
 * Tells whether two surface file records are identical in kind, encoding,
 * content and executable flag.
 *
 * @param {object|undefined} left - Original file record; may be missing.
 * @param {object} right - File record to compare against.
 * @returns {boolean} False when `left` is missing or any compared field differs.
 */
function sameSurfaceFile(left, right) {
    if (!left) {
        return false;
    }
    const comparedFields = ["kind", "encoding", "content", "executable"];
    return comparedFields.every((field) => left[field] === right[field]);
}
1191
/**
 * Tells whether a path is, or lies under, one of the top-level folders
 * `.workbench`, `input`, `output`, `logs` or `private`.
 *
 * @param {string} filePath - Path to test; normalized before comparison.
 * @returns {boolean} True when the path falls inside one of those folders.
 */
function isRuntimeWorkspacePath(filePath) {
    const normalized = normalizeRelativePath(filePath);
    const reservedRoots = [".workbench", "input", "output", "logs", "private"];
    return reservedRoots.some((root) => normalized === root || normalized.startsWith(`${root}/`));
}
1204
/**
 * Writes surface file records beneath `root`, creating parent directories.
 *
 * Every target is resolved and checked before anything is written: a record
 * whose path leaves `root` (for example through `..` segments, which
 * normalizeRelativePath keeps) is rejected instead of being written outside
 * the output directory.
 *
 * @param {string} root - Directory that must contain every written file.
 * @param {Iterable<object>} files - Records with `path`, `content`, `encoding`
 *   ("base64", otherwise treated as UTF-8) and `executable`.
 * @returns {Promise<void>}
 * @throws {Error} When a record's path does not resolve to a file inside `root`.
 */
async function writeSurfaceFiles(root, files) {
    const resolvedRoot = path.resolve(root);
    const planned = [];
    for (const file of files) {
        const target = path.resolve(resolvedRoot, normalizeRelativePath(file.path));
        const relative = path.relative(resolvedRoot, target);
        const outsideRoot = relative === ""
            || relative === ".."
            || relative.startsWith(`..${path.sep}`)
            || path.isAbsolute(relative);
        if (outsideRoot) {
            throw new Error(`Surface file path ${JSON.stringify(file.path)} must resolve to a file inside the output root.`);
        }
        planned.push({ file, target });
    }
    for (const { file, target } of planned) {
        await fs.mkdir(path.dirname(target), { recursive: true });
        const body = file.encoding === "base64"
            ? Buffer.from(file.content, "base64")
            : Buffer.from(file.content, "utf8");
        await fs.writeFile(target, body);
        if (file.executable) {
            // Best effort: a failed chmod is deliberately ignored.
            await fs.chmod(target, 0o755).catch(() => undefined);
        }
    }
}
1217
/**
 * Tells whether a file path equals, or lies beneath, one of the editable
 * subject paths.
 *
 * @param {string} filePath - Path to test; normalized before comparison.
 * @param {string[]} edits - Editable paths; normalized, with trailing slashes removed.
 * @returns {boolean} True when at least one edit entry covers the path.
 */
function isSubjectEditPath(filePath, edits) {
    const candidate = normalizeRelativePath(filePath);
    const coversCandidate = (entry) => {
        const editRoot = normalizeRelativePath(entry).replace(/\/+$/u, "");
        if (candidate === editRoot) {
            return true;
        }
        return candidate.startsWith(`${editRoot}/`);
    };
    return edits.some(coversCandidate);
}
1224
/**
 * Normalizes a path to forward-slash form without leading, trailing or
 * repeated separators. `.` and `..` segments are kept as they are.
 *
 * @param {string} filePath - Path using `/` or `\` separators.
 * @returns {string} The non-empty segments joined with `/`.
 */
function normalizeRelativePath(filePath) {
    const segments = filePath.split(/[\\/]/u);
    const kept = segments.filter((segment) => segment.length > 0);
    return kept.join("/");
}
1228
/**
 * Extracts and parses the JSON object embedded in agent output.
 *
 * The text between the first "{" and the last "}" is parsed, so prose or
 * Markdown fences around the object are tolerated.
 *
 * @param {string} output - Raw agent output.
 * @param {string} label - Prefix used in error messages (for example "Rubric judge").
 * @returns {object} The parsed, non-array JSON object.
 * @throws {Error} When `output` is not a string, contains no brace-delimited
 *   span, fails to parse (the parser error is attached as `cause`), or parses
 *   to something other than a plain object.
 */
function parseAgentJsonObject(output, label) {
    // A missing or non-string output gets the same labelled error as unparsable
    // text instead of an unlabelled TypeError from `.trim()`.
    if (typeof output !== "string") {
        throw new Error(`${label} output must be a JSON object.`);
    }
    const trimmed = output.trim();
    const start = trimmed.indexOf("{");
    const end = trimmed.lastIndexOf("}");
    if (start < 0 || end < start) {
        throw new Error(`${label} output must be a JSON object.`);
    }
    let parsed;
    const jsonText = trimmed.slice(start, end + 1);
    try {
        parsed = parseAgentJsonText(jsonText);
    }
    catch (error) {
        const detail = error instanceof Error ? error.message : String(error);
        throw new Error(`${label} output must parse as a JSON object: ${detail}.`, { cause: error });
    }
    if (!parsed || typeof parsed !== "object" || Array.isArray(parsed)) {
        throw new Error(`${label} output must be a JSON object.`);
    }
    return parsed;
}
1248
/**
 * Parses JSON text, retrying once with invalid string escapes repaired.
 *
 * @param {string} jsonText - Candidate JSON text.
 * @returns {*} The parsed value.
 * @throws {SyntaxError} The error from the first parse attempt when neither
 *   the original nor the repaired text parses.
 */
function parseAgentJsonText(jsonText) {
    let originalError;
    try {
        return JSON.parse(jsonText);
    }
    catch (error) {
        originalError = error;
    }
    const repaired = repairInvalidJsonStringEscapes(jsonText);
    if (repaired !== jsonText) {
        try {
            return JSON.parse(repaired);
        }
        catch {
            // Preserve the original parse error; it points at the model output.
        }
    }
    throw originalError;
}
1265
/**
 * Doubles backslashes that start an invalid escape sequence inside JSON string
 * literals, leaving everything else untouched.
 *
 * A backslash followed by `u` counts as valid only when four hexadecimal
 * digits follow. Previously every `\u` was accepted, so output such as
 * `"C:\users"` was left unrepaired and still failed to parse.
 *
 * @param {string} jsonText - JSON-like text, possibly with bad escapes.
 * @returns {string} The text with invalid escapes turned into literal
 *   backslashes; identical to the input when nothing needed repair.
 */
function repairInvalidJsonStringEscapes(jsonText) {
    let repaired = "";
    let inString = false;
    let escaped = false;
    // All characters this scanner reacts to are ASCII, so walking UTF-16 code
    // units copies any other character through unchanged.
    for (let index = 0; index < jsonText.length; index += 1) {
        const char = jsonText[index];
        if (!inString) {
            repaired += char;
            if (char === "\"") {
                inString = true;
            }
            continue;
        }
        if (escaped) {
            const isValidEscape = char === "u"
                ? /^[0-9a-fA-F]{4}$/u.test(jsonText.slice(index + 1, index + 5))
                : isJsonEscapeCharacter(char);
            // The pending backslash is already in `repaired`; adding a second one
            // turns the pair into an escaped literal backslash.
            repaired += isValidEscape ? char : `\\${char}`;
            escaped = false;
            continue;
        }
        repaired += char;
        if (char === "\\") {
            escaped = true;
            continue;
        }
        if (char === "\"") {
            inString = false;
        }
    }
    if (escaped) {
        // The text ended right after a backslash: make it a literal backslash.
        repaired += "\\";
    }
    return repaired;
}
/**
 * Tells whether a character may follow a backslash in a JSON string escape.
 * `u` is reported as valid here; the hex digits that must follow it are not
 * checked by this function.
 *
 * @param {string} char - The single character after the backslash.
 * @returns {boolean} True for `"`, `\`, `/`, `b`, `f`, `n`, `r`, `t` and `u`.
 */
function isJsonEscapeCharacter(char) {
    return char === "\""
        || char === "\\"
        || char === "/"
        || char === "b"
        || char === "f"
        || char === "n"
        || char === "r"
        || char === "t"
        || char === "u";
}
1307
/**
 * Tells whether a value is a finite number within the inclusive range 0..1.
 *
 * @param {unknown} value - Candidate score.
 * @returns {boolean} True only for finite numbers between 0 and 1 inclusive.
 */
function isBoundedScore(value) {
    if (typeof value !== "number" || !Number.isFinite(value)) {
        return false;
    }
    return value >= 0 && value <= 1;
}
1310
/**
 * Computes the weighted mean of criterion scores, rounded to six decimals.
 *
 * @param {Array<object>} criteria - Judged criteria with `criterion_id` and `score`.
 * @param {Array<object>} specCriteria - Spec criteria with `id` and optional
 *   `weight`; a missing weight, or a criterion absent from the spec, counts as 1.
 * @returns {number|undefined} The rounded weighted mean, or undefined when
 *   there are no criteria or the total weight is not positive.
 */
function weightedCriteriaScore(criteria, specCriteria) {
    if (criteria.length === 0) {
        return undefined;
    }
    const weightById = new Map();
    for (const spec of specCriteria) {
        weightById.set(spec.id, spec.weight ?? 1);
    }
    const totals = criteria.reduce((sums, criterion) => {
        const weight = weightById.get(criterion.criterion_id) ?? 1;
        return {
            weighted: sums.weighted + criterion.score * weight,
            weight: sums.weight + weight,
        };
    }, { weighted: 0, weight: 0 });
    if (!(totals.weight > 0)) {
        return undefined;
    }
    return Number((totals.weighted / totals.weight).toFixed(6));
}
1324
/**
 * Returns `value` when it is a non-null, non-array object; otherwise returns
 * a fresh empty object.
 *
 * @param {unknown} value - Candidate record.
 * @returns {object} The same object, or `{}`.
 */
function jsonRecord(value) {
    const isRecord = Boolean(value) && typeof value === "object" && !Array.isArray(value);
    if (isRecord) {
        return value;
    }
    return {};
}
1329
/**
 * Recursively checks that a value consists only of null, strings, numbers,
 * booleans, arrays and objects.
 *
 * Numbers are accepted without a finiteness check, and any non-null object is
 * accepted as long as all of its own enumerable values pass.
 *
 * @param {unknown} value - Value to inspect.
 * @returns {boolean} False when the value is, or contains, undefined, a
 *   function, a symbol or a bigint.
 */
function isJsonPayload(value) {
    if (value === null) {
        return true;
    }
    switch (typeof value) {
        case "string":
        case "number":
        case "boolean":
            return true;
        case "object": {
            const members = Array.isArray(value) ? value : Object.values(value);
            return members.every(isJsonPayload);
        }
        default:
            return false;
    }
}