@forwardimpact/libeval 0.1.35 → 0.1.38

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
  *
4
4
  * Phases per (task, runIndex):
5
5
  * 1. WorkdirManager.start → seed CWD + run pre-flight probe
6
- * 2. AgentRunner (bare; design Decision 14) → produce trace + submission
6
+ * 2. Supervisor relay (agent + supervisor) → produce traces + submission
7
7
  * 3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
8
8
  * 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
9
9
  * 5. WorkdirManager.teardown → process-group cleanup
@@ -15,15 +15,12 @@
15
15
  */
16
16
 
17
17
  import { createReadStream, createWriteStream } from "node:fs";
18
- import { access, constants, mkdir, readFile } from "node:fs/promises";
18
+ import { access, constants, mkdir, readFile, unlink } from "node:fs/promises";
19
19
  import { createInterface } from "node:readline";
20
20
  import { join, resolve as resolvePath } from "node:path";
21
21
 
22
- import { createAgentRunner } from "../agent-runner.js";
23
- import { composeProfilePrompt } from "../profile-prompt.js";
24
22
  import { createRedactor } from "../redaction.js";
25
- import { AGENT_SYSTEM_PROMPT } from "../supervisor.js";
26
- import { createTraceCollector } from "../trace-collector.js";
23
+ import { createSupervisor } from "../supervisor.js";
27
24
  import { installApm } from "./apm-installer.js";
28
25
  import { runJudge } from "./judge.js";
29
26
  import { validateResultRecord } from "./result.js";
@@ -40,7 +37,9 @@ export class BenchmarkRunner {
40
37
  * @param {import("./task-family.js").TaskFamily | string} opts.family
41
38
  * @param {number} opts.runs - Runs per task (≥ 1).
42
39
  * @param {string} opts.output - Run-output directory.
43
- * @param {string} opts.model
40
+ * @param {string} opts.agentModel
41
+ * @param {string} opts.supervisorModel
42
+ * @param {string} opts.judgeModel
44
43
  * @param {{agent?: string, judge?: string}} [opts.profiles]
45
44
  * @param {Function} opts.query - SDK query (injected for testability).
46
45
  * @param {number} [opts.maxTurns] - Agent-under-test turn budget.
@@ -60,7 +59,9 @@ export class BenchmarkRunner {
60
59
  family,
61
60
  runs,
62
61
  output,
63
- model,
62
+ agentModel,
63
+ supervisorModel,
64
+ judgeModel,
64
65
  profiles,
65
66
  query,
66
67
  maxTurns,
@@ -74,12 +75,16 @@ export class BenchmarkRunner {
74
75
  if (!Number.isInteger(runs) || runs < 1)
75
76
  throw new Error("runs must be an integer ≥ 1");
76
77
  if (!output) throw new Error("output is required");
77
- if (!model) throw new Error("model is required");
78
+ if (!agentModel) throw new Error("agentModel is required");
79
+ if (!supervisorModel) throw new Error("supervisorModel is required");
80
+ if (!judgeModel) throw new Error("judgeModel is required");
78
81
  if (!query) throw new Error("query is required");
79
82
  this.familyInput = family;
80
83
  this.runs = runs;
81
84
  this.output = output;
82
- this.model = model;
85
+ this.agentModel = agentModel;
86
+ this.supervisorModel = supervisorModel;
87
+ this.judgeModel = judgeModel;
83
88
  this.profiles = {
84
89
  agent: profiles?.agent ?? null,
85
90
  judge: profiles?.judge ?? null,
@@ -103,14 +108,21 @@ export class BenchmarkRunner {
103
108
  : this.familyInput;
104
109
 
105
110
  await mkdir(this.output, { recursive: true });
106
- const { stagingDir, skillSetHash } = await installApm(family, this.output);
111
+ const { stagingDir, skillSetHash, judgeProfilesDir } = await installApm(
112
+ family,
113
+ this.output,
114
+ );
107
115
 
108
116
  const tasks = family.tasks();
109
117
  for (const task of tasks) {
110
118
  await assertPreflightExecutable(task);
111
119
  }
112
120
  if (this.profiles.judge) {
113
- await assertJudgeProfileStaged(family, stagingDir, this.profiles.judge);
121
+ await assertJudgeProfileStaged(
122
+ family,
123
+ judgeProfilesDir,
124
+ this.profiles.judge,
125
+ );
114
126
  }
115
127
 
116
128
  const wm = createWorkdirManager({
@@ -130,6 +142,7 @@ export class BenchmarkRunner {
130
142
  task,
131
143
  runIndex,
132
144
  skillSetHash,
145
+ judgeProfilesDir,
133
146
  );
134
147
  await writeRecord(resultsStream, record);
135
148
  yield record;
@@ -140,7 +153,7 @@ export class BenchmarkRunner {
140
153
  }
141
154
  }
142
155
 
143
- async #runOne(family, wm, task, runIndex, skillSetHash) {
156
+ async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
144
157
  const t0 = Date.now();
145
158
  const workdir = await wm.start(task, runIndex);
146
159
  try {
@@ -165,11 +178,23 @@ export class BenchmarkRunner {
165
178
  port: workdir.port,
166
179
  runDir: workdir.runDir,
167
180
  });
168
- const judgeVerdict = await this._runJudgeHook(task, workdir, scoring, {
169
- query: this.query,
170
- model: this.model,
171
- judgeProfile: this.profiles.judge ?? undefined,
172
- });
181
+ const judgeContext = await this.#buildJudgeContext(
182
+ task,
183
+ workdir,
184
+ skillSetHash,
185
+ );
186
+ const judgeVerdict = await this._runJudgeHook(
187
+ task,
188
+ workdir,
189
+ scoring,
190
+ {
191
+ query: this.query,
192
+ model: this.judgeModel,
193
+ judgeProfile: this.profiles.judge ?? undefined,
194
+ profilesDir: judgeProfilesDir,
195
+ },
196
+ judgeContext,
197
+ );
173
198
  const record = {
174
199
  taskId: task.id,
175
200
  runIndex,
@@ -183,13 +208,18 @@ export class BenchmarkRunner {
183
208
  costUsd,
184
209
  turns,
185
210
  agentTracePath: workdir.agentTracePath,
211
+ supervisorTracePath: workdir.supervisorTracePath,
186
212
  judgeTracePath: workdir.judgeTracePath,
187
213
  profiles: {
188
214
  agent: this.profiles.agent,
189
215
  supervisor: null,
190
216
  judge: this.profiles.judge,
191
217
  },
192
- model: this.model,
218
+ model: {
219
+ agent: this.agentModel,
220
+ supervisor: this.supervisorModel,
221
+ judge: this.judgeModel,
222
+ },
193
223
  skillSetHash,
194
224
  familyRevision: family.familyRevision,
195
225
  durationMs: Date.now() - t0,
@@ -225,57 +255,60 @@ export class BenchmarkRunner {
225
255
  }
226
256
 
227
257
  /**
228
- * Run the agent-under-test as a bare AgentRunner (design Decision 14).
229
- * Recover cost/turns/submission from the trace by replaying it into a
230
- * fresh TraceCollector the bare runner writes a single NDJSON stream
231
- * with one terminal `result` event.
232
- *
233
- * Inspects both thrown errors AND the resolved `{success, aborted, error}`
234
- * shape returned by `AgentRunner.run()` (agent-runner.js:69, 166–194):
235
- * the SDK iterator catches its own errors and resolves with `success:
236
- * false`, so a try/catch alone would silently treat a failed session as
237
- * a successful one (plan Step 8.5.c).
258
+ * Run the agent-under-test via a Supervisor relay. The supervisor writes
259
+ * a combined tagged NDJSON trace; after the session we split it into
260
+ * agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
238
261
  */
239
262
  async #runAgent(task, workdir) {
240
- const agentTraceStream = createWriteStream(workdir.agentTracePath);
241
- const systemPrompt = this.profiles.agent
242
- ? composeProfilePrompt(this.profiles.agent, {
243
- profilesDir: resolvePath(workdir.cwd, ".claude/agents"),
244
- trailer: AGENT_SYSTEM_PROMPT,
245
- })
246
- : undefined;
247
- const runner = createAgentRunner({
248
- cwd: workdir.cwd,
263
+ const combinedPath = join(workdir.runDir, ".combined.ndjson");
264
+ const combinedStream = createWriteStream(combinedPath);
265
+ const supervisor = createSupervisor({
266
+ supervisorCwd: workdir.cwd,
267
+ agentCwd: workdir.cwd,
249
268
  query: this.query,
250
- output: agentTraceStream,
251
- model: this.model,
269
+ output: combinedStream,
270
+ agentModel: this.agentModel,
271
+ supervisorModel: this.supervisorModel,
252
272
  maxTurns: this.maxTurns ?? 50,
253
273
  allowedTools: BASE_TOOLS,
254
- settingSources: ["project"],
255
- systemPrompt,
274
+ ...(this.profiles.agent && { agentProfile: this.profiles.agent }),
256
275
  redactor: createRedactor(),
257
276
  });
258
277
  const instructions = await readFile(task.paths.instructions, "utf8");
259
278
  let agentError = null;
260
279
  try {
261
- const result = await runner.run(instructions);
262
- if (!result.success) {
263
- agentError = {
264
- message:
265
- result.error?.message ??
266
- (result.aborted ? "aborted" : "agent did not succeed"),
267
- aborted: result.aborted ?? false,
268
- };
280
+ const result = await supervisor.run(instructions);
281
+ if (!result.success && !result.concluded) {
282
+ agentError = { message: "supervisor did not succeed", aborted: false };
269
283
  }
270
284
  } catch (e) {
271
285
  agentError = { message: e.message ?? String(e), aborted: false };
272
286
  } finally {
273
- await new Promise((r) => agentTraceStream.end(r));
287
+ await new Promise((r) => combinedStream.end(r));
274
288
  }
275
- const summary = await readAgentSummary(workdir.agentTracePath);
289
+ const summary = await splitAndSummarize(
290
+ combinedPath,
291
+ workdir.agentTracePath,
292
+ workdir.supervisorTracePath,
293
+ );
294
+ await unlink(combinedPath).catch(() => {});
276
295
  return { ...summary, agentError };
277
296
  }
278
297
 
298
+ async #buildJudgeContext(task, workdir, skillSetHash) {
299
+ const agentInstructions = await readFile(task.paths.instructions, "utf8");
300
+ let agentProfile = "";
301
+ if (this.profiles.agent) {
302
+ const profilePath = resolvePath(
303
+ workdir.cwd,
304
+ ".claude/agents",
305
+ `${this.profiles.agent}.md`,
306
+ );
307
+ agentProfile = await readFile(profilePath, "utf8").catch(() => "");
308
+ }
309
+ return { agentInstructions, agentProfile, skillSetHash };
310
+ }
311
+
279
312
  #buildPreflightFailureRecord({
280
313
  task,
281
314
  runIndex,
@@ -296,11 +329,16 @@ export class BenchmarkRunner {
296
329
  supervisor: null,
297
330
  judge: this.profiles.judge,
298
331
  },
299
- model: this.model,
332
+ model: {
333
+ agent: this.agentModel,
334
+ supervisor: this.supervisorModel,
335
+ judge: this.judgeModel,
336
+ },
300
337
  skillSetHash,
301
338
  familyRevision,
302
339
  durationMs,
303
340
  agentTracePath: workdir.agentTracePath,
341
+ supervisorTracePath: workdir.supervisorTracePath,
304
342
  judgeTracePath: workdir.judgeTracePath,
305
343
  };
306
344
  }
@@ -341,7 +379,7 @@ async function writeRecord(stream, record) {
341
379
  * is missing or non-executable, before any agent session starts."
342
380
  */
343
381
  async function assertPreflightExecutable(task) {
344
- const path = join(task.paths.workdir, "scripts", "preflight.sh");
382
+ const path = join(task.paths.hooks, "preflight.sh");
345
383
  try {
346
384
  await access(path, constants.X_OK);
347
385
  } catch (e) {
@@ -352,35 +390,67 @@ async function assertPreflightExecutable(task) {
352
390
  }
353
391
 
354
392
  /**
355
- * Replay the bare AgentRunner trace into a fresh TraceCollector to recover
356
- * cost, turn count, and the final assistant text block (the submission).
393
+ * Split the combined supervisor trace into agent and supervisor files, and
394
+ * extract cost, turn count, and submission in a single pass. Agent-source
395
+ * events go to `agentPath`; supervisor and orchestrator events go to
396
+ * `supervisorPath`.
357
397
  */
358
- async function readAgentSummary(tracePath) {
359
- const collector = createTraceCollector();
360
- const stream = createReadStream(tracePath);
361
- const rl = createInterface({ input: stream, crlfDelay: Infinity });
362
- for await (const line of rl) collector.addLine(line);
363
- const json = collector.toJSON();
364
- const summary = json.summary ?? {};
365
- return {
366
- costUsd:
367
- typeof summary.totalCostUsd === "number" ? summary.totalCostUsd : 0,
368
- turns: typeof summary.numTurns === "number" ? summary.numTurns : 0,
369
- submission: lastAssistantText(json),
370
- };
398
+ // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
399
+ async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
400
+ const agentStream = createWriteStream(agentPath);
401
+ const supStream = createWriteStream(supervisorPath);
402
+ const rl = createInterface({
403
+ input: createReadStream(combinedPath),
404
+ crlfDelay: Infinity,
405
+ });
406
+ let agentCost = 0;
407
+ let supervisorCost = 0;
408
+ let turns = 0;
409
+ let submission = "";
410
+ for await (const line of rl) {
411
+ if (!line.trim()) continue;
412
+ let event;
413
+ try {
414
+ event = JSON.parse(line);
415
+ } catch {
416
+ continue;
417
+ }
418
+ const target = event.source === "agent" ? agentStream : supStream;
419
+ target.write(line + "\n");
420
+ const inner = event.event;
421
+ if (!inner) continue;
422
+ if (event.source === "agent") {
423
+ if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
424
+ agentCost = inner.total_cost_usd;
425
+ }
426
+ if (inner.type === "assistant") {
427
+ const text = extractText(inner);
428
+ if (text) submission = text;
429
+ }
430
+ }
431
+ if (event.source === "supervisor") {
432
+ if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
433
+ supervisorCost = inner.total_cost_usd;
434
+ }
435
+ }
436
+ if (event.source === "orchestrator" && inner.type === "summary") {
437
+ turns = inner.turns ?? 0;
438
+ }
439
+ }
440
+ await Promise.all([
441
+ new Promise((r) => agentStream.end(r)),
442
+ new Promise((r) => supStream.end(r)),
443
+ ]);
444
+ return { costUsd: agentCost + supervisorCost, turns, submission };
371
445
  }
372
446
 
373
- function lastAssistantText(json) {
374
- const turns = json.turns ?? [];
375
- for (let i = turns.length - 1; i >= 0; i--) {
376
- const turn = turns[i];
377
- if (turn.role !== "assistant") continue;
378
- const content = turn.content ?? [];
379
- for (let j = content.length - 1; j >= 0; j--) {
380
- if (content[j].type === "text" && content[j].text) return content[j].text;
381
- }
447
+ function extractText(inner) {
448
+ const content = inner.message?.content ?? inner.content;
449
+ if (!Array.isArray(content)) return null;
450
+ for (let i = content.length - 1; i >= 0; i--) {
451
+ if (content[i].type === "text" && content[i].text) return content[i].text;
382
452
  }
383
- return "";
453
+ return null;
384
454
  }
385
455
 
386
456
  /**
@@ -1,10 +1,7 @@
1
1
  /**
2
- * Scorer — runs `<task.paths.scoring>/run.sh` from the template path against
2
+ * Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
3
3
  * the post-run agent CWD. The exit code is authoritative for the verdict;
4
4
  * structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
5
- *
6
- * Scoring scripts are never copied into the agent CWD — they live only in the
7
- * task template (design Decision 3).
8
5
  */
9
6
 
10
7
  import { spawn } from "node:child_process";
@@ -32,7 +29,7 @@ import { join } from "node:path";
32
29
  */
33
30
  export function runScoring(task, ctx) {
34
31
  return new Promise((res, rej) => {
35
- const script = join(task.paths.scoring, "run.sh");
32
+ const script = join(task.paths.hooks, "score.sh");
36
33
  const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
37
34
 
38
35
  // Bun's child_process pipe setup for fd >= 3 is racy under load (it
@@ -4,13 +4,14 @@
4
4
  * apm.lock.yaml
5
5
  * .claude/ # pre-staged skills + agents (P1)
6
6
  * tasks/<task_name>/
7
- * instructions.md
7
+ * agent.task.md
8
8
  * supervisor.task.md # preserved for v2; not read in v1
9
9
  * judge.task.md
10
+ * hooks/ # harness-only; never copied to agent CWD
11
+ * preflight.sh
12
+ * score.sh
10
13
  * specs/ # copied into agent CWD
11
- * workdir/ # copied into agent CWD (excludes scripts/)
12
- * scripts/preflight.sh
13
- * scoring/ # template-only; never copied
14
+ * workdir/ # copied into agent CWD
14
15
  *
15
16
  * Local paths or git URLs are both accepted; git URLs are shallow-cloned into
16
17
  * a temp dir and `familyRevision` becomes `git:<sha>` of HEAD at clone time.
@@ -53,13 +54,11 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
53
54
  familyRevision = "sha256:" + (await canonicalTreeHash(rootPath));
54
55
  }
55
56
 
56
- const apmLockBytes = await readApmLockBytes(rootPath);
57
57
  const tasks = await discoverTasks(rootPath);
58
58
 
59
59
  return {
60
60
  rootPath,
61
61
  familyRevision,
62
- apmLockBytes,
63
62
  tasks() {
64
63
  return tasks;
65
64
  },
@@ -67,58 +66,27 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
67
66
  }
68
67
 
69
68
  /**
70
- * Assert that `<stagingDir>/.claude/agents/<judgeProfile>.md` exists. Called
71
- * from `BenchmarkRunner.run()` so a missing judge profile fails the family
69
+ * Assert that `<judgeProfilesDir>/<judgeProfile>.md` exists. Called from
70
+ * `BenchmarkRunner.run()` so a missing judge profile fails the family
72
71
  * install before any agent session starts.
73
72
  * @param {TaskFamily} _family
74
- * @param {string} stagingDir
73
+ * @param {string} judgeProfilesDir
75
74
  * @param {string} judgeProfile
76
75
  * @returns {Promise<void>}
77
76
  */
78
77
  export async function assertJudgeProfileStaged(
79
78
  _family,
80
- stagingDir,
79
+ judgeProfilesDir,
81
80
  judgeProfile,
82
81
  ) {
83
- const candidate = join(stagingDir, ".claude", "agents", `${judgeProfile}.md`);
82
+ const candidate = join(judgeProfilesDir, `${judgeProfile}.md`);
84
83
  try {
85
84
  await access(candidate);
86
85
  } catch {
87
- throw new Error(
88
- `judge profile not staged: ${candidate} (createSupervisor resolves profiles relative to <supervisorCwd>/.claude/agents)`,
89
- );
86
+ throw new Error(`judge profile not staged: ${candidate}`);
90
87
  }
91
88
  }
92
89
 
93
- async function readApmLockBytes(rootPath) {
94
- const lockPath = join(rootPath, "apm.lock.yaml");
95
- try {
96
- const raw = await readFile(lockPath);
97
- return normalizeLf(raw);
98
- } catch (e) {
99
- if (e.code === "ENOENT") {
100
- throw new Error(
101
- `task family missing apm.lock.yaml at ${lockPath} (matches libpack stager.js:126; .yml is not accepted)`,
102
- );
103
- }
104
- throw e;
105
- }
106
- }
107
-
108
- /**
109
- * Replace CRLF with LF so cross-OS authored lockfiles hash identically.
110
- * @param {Buffer} buf
111
- * @returns {Buffer}
112
- */
113
- function normalizeLf(buf) {
114
- const out = [];
115
- for (let i = 0; i < buf.length; i++) {
116
- if (buf[i] === 0x0d && i + 1 < buf.length && buf[i + 1] === 0x0a) continue;
117
- out.push(buf[i]);
118
- }
119
- return Buffer.from(out);
120
- }
121
-
122
90
  async function discoverTasks(rootPath) {
123
91
  const tasksRoot = join(rootPath, "tasks");
124
92
  const tasks = [];
@@ -135,12 +103,12 @@ async function discoverTasks(rootPath) {
135
103
  tasks.push({
136
104
  id: entry.name,
137
105
  paths: {
138
- instructions: join(taskDir, "instructions.md"),
106
+ instructions: join(taskDir, "agent.task.md"),
139
107
  supervisor: join(taskDir, "supervisor.task.md"),
140
108
  judge: join(taskDir, "judge.task.md"),
109
+ hooks: join(taskDir, "hooks"),
141
110
  specs: join(taskDir, "specs"),
142
111
  workdir: join(taskDir, "workdir"),
143
- scoring: join(taskDir, "scoring"),
144
112
  },
145
113
  });
146
114
  }
@@ -242,13 +210,12 @@ function run(cmd, args) {
242
210
  /**
243
211
  * @typedef {object} Task
244
212
  * @property {string} id - Task name (directory name under tasks/)
245
- * @property {{instructions: string, supervisor: string, judge: string, specs: string, workdir: string, scoring: string}} paths
213
+ * @property {{instructions: string, supervisor: string, judge: string, hooks: string, specs: string, workdir: string}} paths
246
214
  */
247
215
 
248
216
  /**
249
217
  * @typedef {object} TaskFamily
250
218
  * @property {string} rootPath
251
219
  * @property {string} familyRevision - `git:<sha>` or `sha256:<hex>`
252
- * @property {Buffer} apmLockBytes - LF-normalised
253
220
  * @property {() => Task[]} tasks
254
221
  */
@@ -11,9 +11,8 @@ import { spawn } from "node:child_process";
11
11
  import { cp, mkdir } from "node:fs/promises";
12
12
  import { createServer } from "node:net";
13
13
  import { connect } from "node:net";
14
- import { join, sep } from "node:path";
14
+ import { join } from "node:path";
15
15
 
16
- const PREFLIGHT_REL = join("workdir", "scripts");
17
16
  const DEFAULT_TERM_GRACE_MS = 5_000;
18
17
 
19
18
  /**
@@ -24,6 +23,7 @@ const DEFAULT_TERM_GRACE_MS = 5_000;
24
23
  * @property {number} pgid - Process-group id captured from the preflight child.
25
24
  * @property {*} scaffold - Reserved per design § Components; v1 sets null.
26
25
  * @property {string} agentTracePath
26
+ * @property {string} supervisorTracePath
27
27
  * @property {string} judgeTracePath
28
28
  * @property {{phase: string, message: string, exitCode: number}} [preflightError]
29
29
  */
@@ -55,9 +55,8 @@ export class WorkdirManager {
55
55
  const cwd = join(runDir, "cwd");
56
56
  await mkdir(cwd, { recursive: true });
57
57
 
58
- await cp(task.paths.workdir, cwd, {
59
- recursive: true,
60
- filter: (src) => !src.endsWith(sep + PREFLIGHT_REL),
58
+ await cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
59
+ if (e.code !== "ENOENT") throw e;
61
60
  });
62
61
  await cp(task.paths.specs, join(cwd, "specs"), {
63
62
  recursive: true,
@@ -70,9 +69,10 @@ export class WorkdirManager {
70
69
 
71
70
  const port = await allocatePort();
72
71
  const agentTracePath = join(runDir, "agent.ndjson");
72
+ const supervisorTracePath = join(runDir, "supervisor.ndjson");
73
73
  const judgeTracePath = join(runDir, "judge.ndjson");
74
74
 
75
- const preflightScript = join(task.paths.workdir, "scripts", "preflight.sh");
75
+ const preflightScript = join(task.paths.hooks, "preflight.sh");
76
76
  const preflight = await runPreflight(preflightScript, cwd, port);
77
77
 
78
78
  return {
@@ -82,6 +82,7 @@ export class WorkdirManager {
82
82
  pgid: preflight.pgid,
83
83
  scaffold: null,
84
84
  agentTracePath,
85
+ supervisorTracePath,
85
86
  judgeTracePath,
86
87
  ...(preflight.error && { preflightError: preflight.error }),
87
88
  };