@forwardimpact/libeval 0.1.36 → 0.1.39

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
  *
4
4
  * Phases per (task, runIndex):
5
5
  * 1. WorkdirManager.start → seed CWD + run pre-flight probe
6
- * 2. AgentRunner (bare; design Decision 14) → produce trace + submission
6
+ * 2. Supervisor relay (agent + supervisor) → produce traces + submission
7
7
  * 3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
8
8
  * 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
9
9
  * 5. WorkdirManager.teardown → process-group cleanup
@@ -15,15 +15,12 @@
15
15
  */
16
16
 
17
17
  import { createReadStream, createWriteStream } from "node:fs";
18
- import { access, constants, mkdir, readFile } from "node:fs/promises";
18
+ import { mkdir, readFile, unlink } from "node:fs/promises";
19
19
  import { createInterface } from "node:readline";
20
20
  import { join, resolve as resolvePath } from "node:path";
21
21
 
22
- import { createAgentRunner } from "../agent-runner.js";
23
- import { composeProfilePrompt } from "../profile-prompt.js";
24
- import { createRedactor } from "../redaction.js";
25
- import { AGENT_SYSTEM_PROMPT } from "../supervisor.js";
26
- import { createTraceCollector } from "../trace-collector.js";
22
+ import { DEFAULT_ENV_ALLOWLIST, createRedactor } from "../redaction.js";
23
+ import { createSupervisor } from "../supervisor.js";
27
24
  import { installApm } from "./apm-installer.js";
28
25
  import { runJudge } from "./judge.js";
29
26
  import { validateResultRecord } from "./result.js";
@@ -31,7 +28,16 @@ import { runScoring } from "./scorer.js";
31
28
  import { assertJudgeProfileStaged, loadTaskFamily } from "./task-family.js";
32
29
  import { createWorkdirManager } from "./workdir.js";
33
30
 
34
- const BASE_TOOLS = ["Bash", "Read", "Glob", "Grep", "Write", "Edit"];
31
+ const BASE_TOOLS = [
32
+ "Bash",
33
+ "Read",
34
+ "Glob",
35
+ "Grep",
36
+ "Write",
37
+ "Edit",
38
+ "Agent",
39
+ "TodoWrite",
40
+ ];
35
41
 
36
42
  /** Sole orchestrator for a task-family benchmark run. */
37
43
  export class BenchmarkRunner {
@@ -40,9 +46,12 @@ export class BenchmarkRunner {
40
46
  * @param {import("./task-family.js").TaskFamily | string} opts.family
41
47
  * @param {number} opts.runs - Runs per task (≥ 1).
42
48
  * @param {string} opts.output - Run-output directory.
43
- * @param {string} opts.model
49
+ * @param {string} opts.agentModel
50
+ * @param {string} opts.supervisorModel
51
+ * @param {string} opts.judgeModel
44
52
  * @param {{agent?: string, judge?: string}} [opts.profiles]
45
53
  * @param {Function} opts.query - SDK query (injected for testability).
54
+ * @param {string[]} [opts.allowedTools] - Agent tool allowlist (default: BASE_TOOLS).
46
55
  * @param {number} [opts.maxTurns] - Agent-under-test turn budget.
47
56
  * @param {number} [opts.termGraceMs] - SIGTERM→SIGKILL grace (ms) for the per-task process group.
48
57
  * @param {Function} [opts.runAgent] - Test seam: replaces the agent-under-test
@@ -60,9 +69,12 @@ export class BenchmarkRunner {
60
69
  family,
61
70
  runs,
62
71
  output,
63
- model,
72
+ agentModel,
73
+ supervisorModel,
74
+ judgeModel,
64
75
  profiles,
65
76
  query,
77
+ allowedTools,
66
78
  maxTurns,
67
79
  termGraceMs,
68
80
  // Test seams — default to the real implementations.
@@ -74,12 +86,15 @@ export class BenchmarkRunner {
74
86
  if (!Number.isInteger(runs) || runs < 1)
75
87
  throw new Error("runs must be an integer ≥ 1");
76
88
  if (!output) throw new Error("output is required");
77
- if (!model) throw new Error("model is required");
89
+ if (!agentModel) throw new Error("agentModel is required");
78
90
  if (!query) throw new Error("query is required");
79
91
  this.familyInput = family;
80
92
  this.runs = runs;
81
93
  this.output = output;
82
- this.model = model;
94
+ this.agentModel = agentModel;
95
+ this.supervisorModel = supervisorModel;
96
+ this.judgeModel = judgeModel;
97
+ this.allowedTools = allowedTools ?? BASE_TOOLS;
83
98
  this.profiles = {
84
99
  agent: profiles?.agent ?? null,
85
100
  judge: profiles?.judge ?? null,
@@ -103,20 +118,25 @@ export class BenchmarkRunner {
103
118
  : this.familyInput;
104
119
 
105
120
  await mkdir(this.output, { recursive: true });
106
- const { stagingDir, skillSetHash } = await installApm(family, this.output);
121
+ const { stagingDir, skillSetHash, judgeProfilesDir } = await installApm(
122
+ family,
123
+ this.output,
124
+ );
107
125
 
108
126
  const tasks = family.tasks();
109
- for (const task of tasks) {
110
- await assertPreflightExecutable(task);
111
- }
112
127
  if (this.profiles.judge) {
113
- await assertJudgeProfileStaged(family, stagingDir, this.profiles.judge);
128
+ await assertJudgeProfileStaged(
129
+ family,
130
+ judgeProfilesDir,
131
+ this.profiles.judge,
132
+ );
114
133
  }
115
134
 
116
135
  const wm = createWorkdirManager({
117
136
  stagingDir,
118
137
  runOutputDir: this.output,
119
138
  termGraceMs: this.termGraceMs,
139
+ familyRootPath: family.rootPath,
120
140
  });
121
141
 
122
142
  const resultsPath = join(this.output, "results.jsonl");
@@ -130,6 +150,7 @@ export class BenchmarkRunner {
130
150
  task,
131
151
  runIndex,
132
152
  skillSetHash,
153
+ judgeProfilesDir,
133
154
  );
134
155
  await writeRecord(resultsStream, record);
135
156
  yield record;
@@ -140,7 +161,7 @@ export class BenchmarkRunner {
140
161
  }
141
162
  }
142
163
 
143
- async #runOne(family, wm, task, runIndex, skillSetHash) {
164
+ async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
144
165
  const t0 = Date.now();
145
166
  const workdir = await wm.start(task, runIndex);
146
167
  try {
@@ -165,42 +186,53 @@ export class BenchmarkRunner {
165
186
  port: workdir.port,
166
187
  runDir: workdir.runDir,
167
188
  });
168
- const judgeContext = await this.#buildJudgeContext(
169
- task,
170
- workdir,
171
- skillSetHash,
172
- );
173
- const judgeVerdict = await this._runJudgeHook(
174
- task,
175
- workdir,
176
- scoring,
177
- {
178
- query: this.query,
179
- model: this.model,
180
- judgeProfile: this.profiles.judge ?? undefined,
181
- },
182
- judgeContext,
183
- );
189
+ let judgeVerdict = null;
190
+ if (task.paths.judge) {
191
+ const judgeContext = await this.#buildJudgeContext(
192
+ task,
193
+ workdir,
194
+ skillSetHash,
195
+ );
196
+ judgeVerdict = await this._runJudgeHook(
197
+ task,
198
+ workdir,
199
+ scoring,
200
+ {
201
+ query: this.query,
202
+ model: this.judgeModel,
203
+ judgeProfile: this.profiles.judge ?? undefined,
204
+ profilesDir: judgeProfilesDir,
205
+ },
206
+ judgeContext,
207
+ );
208
+ }
209
+ const verdict =
210
+ scoring.verdict === "pass" &&
211
+ (judgeVerdict === null || judgeVerdict.verdict === "pass")
212
+ ? "pass"
213
+ : "fail";
184
214
  const record = {
185
215
  taskId: task.id,
186
216
  runIndex,
187
- verdict:
188
- scoring.verdict === "pass" && judgeVerdict.verdict === "pass"
189
- ? "pass"
190
- : "fail",
217
+ verdict,
191
218
  scoring,
192
219
  submission,
193
- judgeVerdict,
220
+ ...(judgeVerdict && { judgeVerdict }),
194
221
  costUsd,
195
222
  turns,
196
223
  agentTracePath: workdir.agentTracePath,
224
+ supervisorTracePath: workdir.supervisorTracePath,
197
225
  judgeTracePath: workdir.judgeTracePath,
198
226
  profiles: {
199
227
  agent: this.profiles.agent,
200
228
  supervisor: null,
201
229
  judge: this.profiles.judge,
202
230
  },
203
- model: this.model,
231
+ model: {
232
+ agent: this.agentModel,
233
+ supervisor: this.supervisorModel,
234
+ judge: this.judgeModel,
235
+ },
204
236
  skillSetHash,
205
237
  familyRevision: family.familyRevision,
206
238
  durationMs: Date.now() - t0,
@@ -236,54 +268,49 @@ export class BenchmarkRunner {
236
268
  }
237
269
 
238
270
  /**
239
- * Run the agent-under-test as a bare AgentRunner (design Decision 14).
240
- * Recover cost/turns/submission from the trace by replaying it into a
241
- * fresh TraceCollector the bare runner writes a single NDJSON stream
242
- * with one terminal `result` event.
243
- *
244
- * Inspects both thrown errors AND the resolved `{success, aborted, error}`
245
- * shape returned by `AgentRunner.run()` (agent-runner.js:69, 166–194):
246
- * the SDK iterator catches its own errors and resolves with `success:
247
- * false`, so a try/catch alone would silently treat a failed session as
248
- * a successful one (plan Step 8.5.c).
271
+ * Run the agent-under-test via a Supervisor relay. The supervisor writes
272
+ * a combined tagged NDJSON trace; after the session we split it into
273
+ * agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
249
274
  */
250
275
  async #runAgent(task, workdir) {
251
- const agentTraceStream = createWriteStream(workdir.agentTracePath);
252
- const systemPrompt = this.profiles.agent
253
- ? composeProfilePrompt(this.profiles.agent, {
254
- profilesDir: resolvePath(workdir.cwd, ".claude/agents"),
255
- trailer: AGENT_SYSTEM_PROMPT,
256
- })
257
- : undefined;
258
- const runner = createAgentRunner({
259
- cwd: workdir.cwd,
276
+ const combinedPath = join(workdir.runDir, ".combined.ndjson");
277
+ const combinedStream = createWriteStream(combinedPath);
278
+ const supervisorInstructions = task.paths.supervisor
279
+ ? await readFile(task.paths.supervisor, "utf8").catch(() => null)
280
+ : null;
281
+ const supervisor = createSupervisor({
282
+ supervisorCwd: workdir.cwd,
283
+ agentCwd: workdir.cwd,
260
284
  query: this.query,
261
- output: agentTraceStream,
262
- model: this.model,
285
+ output: combinedStream,
286
+ agentModel: this.agentModel,
287
+ supervisorModel: this.supervisorModel,
263
288
  maxTurns: this.maxTurns ?? 50,
264
- allowedTools: BASE_TOOLS,
265
- settingSources: ["project"],
266
- systemPrompt,
267
- redactor: createRedactor(),
289
+ allowedTools: this.allowedTools,
290
+ ...(this.profiles.agent && { agentProfile: this.profiles.agent }),
291
+ ...(supervisorInstructions && { taskAmend: supervisorInstructions }),
292
+ redactor: createRedactor({
293
+ allowlist: [...DEFAULT_ENV_ALLOWLIST, ...(workdir.envNames ?? [])],
294
+ }),
268
295
  });
269
296
  const instructions = await readFile(task.paths.instructions, "utf8");
270
297
  let agentError = null;
271
298
  try {
272
- const result = await runner.run(instructions);
273
- if (!result.success) {
274
- agentError = {
275
- message:
276
- result.error?.message ??
277
- (result.aborted ? "aborted" : "agent did not succeed"),
278
- aborted: result.aborted ?? false,
279
- };
299
+ const result = await supervisor.run(instructions);
300
+ if (!result.success && !result.concluded) {
301
+ agentError = { message: "supervisor did not succeed", aborted: false };
280
302
  }
281
303
  } catch (e) {
282
304
  agentError = { message: e.message ?? String(e), aborted: false };
283
305
  } finally {
284
- await new Promise((r) => agentTraceStream.end(r));
306
+ await new Promise((r) => combinedStream.end(r));
285
307
  }
286
- const summary = await readAgentSummary(workdir.agentTracePath);
308
+ const summary = await splitAndSummarize(
309
+ combinedPath,
310
+ workdir.agentTracePath,
311
+ workdir.supervisorTracePath,
312
+ );
313
+ await unlink(combinedPath).catch(() => {});
287
314
  return { ...summary, agentError };
288
315
  }
289
316
 
@@ -321,11 +348,16 @@ export class BenchmarkRunner {
321
348
  supervisor: null,
322
349
  judge: this.profiles.judge,
323
350
  },
324
- model: this.model,
351
+ model: {
352
+ agent: this.agentModel,
353
+ supervisor: this.supervisorModel,
354
+ judge: this.judgeModel,
355
+ },
325
356
  skillSetHash,
326
357
  familyRevision,
327
358
  durationMs,
328
359
  agentTracePath: workdir.agentTracePath,
360
+ supervisorTracePath: workdir.supervisorTracePath,
329
361
  judgeTracePath: workdir.judgeTracePath,
330
362
  };
331
363
  }
@@ -360,52 +392,67 @@ async function writeRecord(stream, record) {
360
392
  }
361
393
 
362
394
  /**
363
- * Pre-flight install gate. Throws synchronously if any task's preflight
364
- * script is missing or not executable design § Pre-flight contract:
365
- * "The harness fails the family at install if any task's preflight script
366
- * is missing or non-executable, before any agent session starts."
395
+ * Split the combined supervisor trace into agent and supervisor files, and
396
+ * extract cost, turn count, and submission in a single pass. Agent-source
397
+ * events go to `agentPath`; supervisor and orchestrator events go to
398
+ * `supervisorPath`.
367
399
  */
368
- async function assertPreflightExecutable(task) {
369
- const path = join(task.paths.workdir, "scripts", "preflight.sh");
370
- try {
371
- await access(path, constants.X_OK);
372
- } catch (e) {
373
- throw new Error(
374
- `task ${task.id}: preflight script not executable at ${path} (${e.code ?? e.message})`,
375
- );
400
+ // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
401
+ async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
402
+ const agentStream = createWriteStream(agentPath);
403
+ const supStream = createWriteStream(supervisorPath);
404
+ const rl = createInterface({
405
+ input: createReadStream(combinedPath),
406
+ crlfDelay: Infinity,
407
+ });
408
+ let agentCost = 0;
409
+ let supervisorCost = 0;
410
+ let turns = 0;
411
+ let submission = "";
412
+ for await (const line of rl) {
413
+ if (!line.trim()) continue;
414
+ let event;
415
+ try {
416
+ event = JSON.parse(line);
417
+ } catch {
418
+ continue;
419
+ }
420
+ const target = event.source === "agent" ? agentStream : supStream;
421
+ target.write(line + "\n");
422
+ const inner = event.event;
423
+ if (!inner) continue;
424
+ if (event.source === "agent") {
425
+ if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
426
+ agentCost = inner.total_cost_usd;
427
+ }
428
+ if (inner.type === "assistant") {
429
+ const text = extractText(inner);
430
+ if (text) submission = text;
431
+ }
432
+ }
433
+ if (event.source === "supervisor") {
434
+ if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
435
+ supervisorCost = inner.total_cost_usd;
436
+ }
437
+ }
438
+ if (event.source === "orchestrator" && inner.type === "summary") {
439
+ turns = inner.turns ?? 0;
440
+ }
376
441
  }
442
+ await Promise.all([
443
+ new Promise((r) => agentStream.end(r)),
444
+ new Promise((r) => supStream.end(r)),
445
+ ]);
446
+ return { costUsd: agentCost + supervisorCost, turns, submission };
377
447
  }
378
448
 
379
- /**
380
- * Replay the bare AgentRunner trace into a fresh TraceCollector to recover
381
- * cost, turn count, and the final assistant text block (the submission).
382
- */
383
- async function readAgentSummary(tracePath) {
384
- const collector = createTraceCollector();
385
- const stream = createReadStream(tracePath);
386
- const rl = createInterface({ input: stream, crlfDelay: Infinity });
387
- for await (const line of rl) collector.addLine(line);
388
- const json = collector.toJSON();
389
- const summary = json.summary ?? {};
390
- return {
391
- costUsd:
392
- typeof summary.totalCostUsd === "number" ? summary.totalCostUsd : 0,
393
- turns: typeof summary.numTurns === "number" ? summary.numTurns : 0,
394
- submission: lastAssistantText(json),
395
- };
396
- }
397
-
398
- function lastAssistantText(json) {
399
- const turns = json.turns ?? [];
400
- for (let i = turns.length - 1; i >= 0; i--) {
401
- const turn = turns[i];
402
- if (turn.role !== "assistant") continue;
403
- const content = turn.content ?? [];
404
- for (let j = content.length - 1; j >= 0; j--) {
405
- if (content[j].type === "text" && content[j].text) return content[j].text;
406
- }
449
+ function extractText(inner) {
450
+ const content = inner.message?.content ?? inner.content;
451
+ if (!Array.isArray(content)) return null;
452
+ for (let i = content.length - 1; i >= 0; i--) {
453
+ if (content[i].type === "text" && content[i].text) return content[i].text;
407
454
  }
408
- return "";
455
+ return null;
409
456
  }
410
457
 
411
458
  /**
@@ -1,10 +1,7 @@
1
1
  /**
2
- * Scorer — runs `<task.paths.scoring>/run.sh` from the template path against
2
+ * Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
3
3
  * the post-run agent CWD. The exit code is authoritative for the verdict;
4
4
  * structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
5
- *
6
- * Scoring scripts are never copied into the agent CWD — they live only in the
7
- * task template (design Decision 3).
8
5
  */
9
6
 
10
7
  import { spawn } from "node:child_process";
@@ -31,8 +28,11 @@ import { join } from "node:path";
31
28
  * @returns {Promise<ScoringResult>}
32
29
  */
33
30
  export function runScoring(task, ctx) {
31
+ if (!task.paths.score) {
32
+ return Promise.resolve({ verdict: "pass", details: [], exitCode: 0 });
33
+ }
34
34
  return new Promise((res, rej) => {
35
- const script = join(task.paths.scoring, "run.sh");
35
+ const script = task.paths.score;
36
36
  const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
37
37
 
38
38
  // Bun's child_process pipe setup for fd >= 3 is racy under load (it
@@ -4,13 +4,14 @@
4
4
  * apm.lock.yaml
5
5
  * .claude/ # pre-staged skills + agents (P1)
6
6
  * tasks/<task_name>/
7
- * instructions.md
8
- * supervisor.task.md # preserved for v2; not read in v1
7
+ * agent.task.md
8
+ * supervisor.task.md # optional; appended to the task as supervisor context
9
9
  * judge.task.md
10
+ * hooks/ # harness-only; never copied to agent CWD
11
+ * preflight.sh
12
+ * score.sh
10
13
  * specs/ # copied into agent CWD
11
- * workdir/ # copied into agent CWD (excludes scripts/)
12
- * scripts/preflight.sh
13
- * scoring/ # template-only; never copied
14
+ * workdir/ # copied into agent CWD
14
15
  *
15
16
  * Local paths or git URLs are both accepted; git URLs are shallow-cloned into
16
17
  * a temp dir and `familyRevision` becomes `git:<sha>` of HEAD at clone time.
@@ -22,6 +23,7 @@ import { spawn } from "node:child_process";
22
23
  import { createHash } from "node:crypto";
23
24
  import {
24
25
  access,
26
+ constants,
25
27
  lstat,
26
28
  mkdtemp,
27
29
  readdir,
@@ -53,13 +55,11 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
53
55
  familyRevision = "sha256:" + (await canonicalTreeHash(rootPath));
54
56
  }
55
57
 
56
- const apmLockBytes = await readApmLockBytes(rootPath);
57
58
  const tasks = await discoverTasks(rootPath);
58
59
 
59
60
  return {
60
61
  rootPath,
61
62
  familyRevision,
62
- apmLockBytes,
63
63
  tasks() {
64
64
  return tasks;
65
65
  },
@@ -67,58 +67,27 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
67
67
  }
68
68
 
69
69
  /**
70
- * Assert that `<stagingDir>/.claude/agents/<judgeProfile>.md` exists. Called
71
- * from `BenchmarkRunner.run()` so a missing judge profile fails the family
70
+ * Assert that `<judgeProfilesDir>/<judgeProfile>.md` exists. Called from
71
+ * `BenchmarkRunner.run()` so a missing judge profile fails the family
72
72
  * install before any agent session starts.
73
73
  * @param {TaskFamily} _family
74
- * @param {string} stagingDir
74
+ * @param {string} judgeProfilesDir
75
75
  * @param {string} judgeProfile
76
76
  * @returns {Promise<void>}
77
77
  */
78
78
  export async function assertJudgeProfileStaged(
79
79
  _family,
80
- stagingDir,
80
+ judgeProfilesDir,
81
81
  judgeProfile,
82
82
  ) {
83
- const candidate = join(stagingDir, ".claude", "agents", `${judgeProfile}.md`);
83
+ const candidate = join(judgeProfilesDir, `${judgeProfile}.md`);
84
84
  try {
85
85
  await access(candidate);
86
86
  } catch {
87
- throw new Error(
88
- `judge profile not staged: ${candidate} (createSupervisor resolves profiles relative to <supervisorCwd>/.claude/agents)`,
89
- );
87
+ throw new Error(`judge profile not staged: ${candidate}`);
90
88
  }
91
89
  }
92
90
 
93
- async function readApmLockBytes(rootPath) {
94
- const lockPath = join(rootPath, "apm.lock.yaml");
95
- try {
96
- const raw = await readFile(lockPath);
97
- return normalizeLf(raw);
98
- } catch (e) {
99
- if (e.code === "ENOENT") {
100
- throw new Error(
101
- `task family missing apm.lock.yaml at ${lockPath} (matches libpack stager.js:126; .yml is not accepted)`,
102
- );
103
- }
104
- throw e;
105
- }
106
- }
107
-
108
- /**
109
- * Replace CRLF with LF so cross-OS authored lockfiles hash identically.
110
- * @param {Buffer} buf
111
- * @returns {Buffer}
112
- */
113
- function normalizeLf(buf) {
114
- const out = [];
115
- for (let i = 0; i < buf.length; i++) {
116
- if (buf[i] === 0x0d && i + 1 < buf.length && buf[i + 1] === 0x0a) continue;
117
- out.push(buf[i]);
118
- }
119
- return Buffer.from(out);
120
- }
121
-
122
91
  async function discoverTasks(rootPath) {
123
92
  const tasksRoot = join(rootPath, "tasks");
124
93
  const tasks = [];
@@ -132,15 +101,22 @@ async function discoverTasks(rootPath) {
132
101
  for (const entry of entries) {
133
102
  if (!entry.isDirectory()) continue;
134
103
  const taskDir = join(tasksRoot, entry.name);
104
+ const supervisorPath = join(taskDir, "supervisor.task.md");
105
+ const judgePath = join(taskDir, "judge.task.md");
106
+ const preflightPath = join(taskDir, "hooks", "preflight.sh");
107
+ const scorePath = join(taskDir, "hooks", "score.sh");
135
108
  tasks.push({
136
109
  id: entry.name,
137
110
  paths: {
138
- instructions: join(taskDir, "instructions.md"),
139
- supervisor: join(taskDir, "supervisor.task.md"),
140
- judge: join(taskDir, "judge.task.md"),
111
+ taskDir,
112
+ instructions: join(taskDir, "agent.task.md"),
113
+ supervisor: (await fileExists(supervisorPath)) ? supervisorPath : null,
114
+ judge: (await fileExists(judgePath)) ? judgePath : null,
115
+ hooks: join(taskDir, "hooks"),
116
+ preflight: (await fileExecutable(preflightPath)) ? preflightPath : null,
117
+ score: (await fileExecutable(scorePath)) ? scorePath : null,
141
118
  specs: join(taskDir, "specs"),
142
119
  workdir: join(taskDir, "workdir"),
143
- scoring: join(taskDir, "scoring"),
144
120
  },
145
121
  });
146
122
  }
@@ -148,6 +124,24 @@ async function discoverTasks(rootPath) {
148
124
  return tasks;
149
125
  }
150
126
 
127
+ async function fileExists(path) {
128
+ try {
129
+ await access(path);
130
+ return true;
131
+ } catch {
132
+ return false;
133
+ }
134
+ }
135
+
136
+ async function fileExecutable(path) {
137
+ try {
138
+ await access(path, constants.X_OK);
139
+ return true;
140
+ } catch {
141
+ return false;
142
+ }
143
+ }
144
+
151
145
  /**
152
146
  * Canonical-tree hash per design § Family revision algorithm:
153
147
  * list regular files (excluding .git/, node_modules/)
@@ -242,13 +236,12 @@ function run(cmd, args) {
242
236
  /**
243
237
  * @typedef {object} Task
244
238
  * @property {string} id - Task name (directory name under tasks/)
245
- * @property {{instructions: string, supervisor: string, judge: string, specs: string, workdir: string, scoring: string}} paths
239
+ * @property {{taskDir: string, instructions: string, supervisor: string|null, judge: string|null, hooks: string, preflight: string|null, score: string|null, specs: string, workdir: string}} paths
246
240
  */
247
241
 
248
242
  /**
249
243
  * @typedef {object} TaskFamily
250
244
  * @property {string} rootPath
251
245
  * @property {string} familyRevision - `git:<sha>` or `sha256:<hex>`
252
- * @property {Buffer} apmLockBytes - LF-normalised
253
246
  * @property {() => Task[]} tasks
254
247
  */