@forwardimpact/libeval 0.1.36 → 0.1.38

This diff shows the changes between two publicly available versions of this package, as released to one of the supported registries. It is provided for informational purposes only and reflects the package contents as they appear in their respective public registries.
@@ -3,7 +3,7 @@
3
3
  *
4
4
  * Phases per (task, runIndex):
5
5
  * 1. WorkdirManager.start → seed CWD + run pre-flight probe
6
- * 2. AgentRunner (bare; design Decision 14) → produce trace + submission
6
+ * 2. Supervisor relay (agent + supervisor) → produce traces + submission
7
7
  * 3. Scorer.runScoring → exit-code-driven verdict via fd-3 NDJSON
8
8
  * 4. Judge.runJudge → Conclude-driven verdict mapped to pass/fail
9
9
  * 5. WorkdirManager.teardown → process-group cleanup
@@ -15,15 +15,12 @@
15
15
  */
16
16
 
17
17
  import { createReadStream, createWriteStream } from "node:fs";
18
- import { access, constants, mkdir, readFile } from "node:fs/promises";
18
+ import { access, constants, mkdir, readFile, unlink } from "node:fs/promises";
19
19
  import { createInterface } from "node:readline";
20
20
  import { join, resolve as resolvePath } from "node:path";
21
21
 
22
- import { createAgentRunner } from "../agent-runner.js";
23
- import { composeProfilePrompt } from "../profile-prompt.js";
24
22
  import { createRedactor } from "../redaction.js";
25
- import { AGENT_SYSTEM_PROMPT } from "../supervisor.js";
26
- import { createTraceCollector } from "../trace-collector.js";
23
+ import { createSupervisor } from "../supervisor.js";
27
24
  import { installApm } from "./apm-installer.js";
28
25
  import { runJudge } from "./judge.js";
29
26
  import { validateResultRecord } from "./result.js";
@@ -40,7 +37,9 @@ export class BenchmarkRunner {
40
37
  * @param {import("./task-family.js").TaskFamily | string} opts.family
41
38
  * @param {number} opts.runs - Runs per task (≥ 1).
42
39
  * @param {string} opts.output - Run-output directory.
43
- * @param {string} opts.model
40
+ * @param {string} opts.agentModel
41
+ * @param {string} opts.supervisorModel
42
+ * @param {string} opts.judgeModel
44
43
  * @param {{agent?: string, judge?: string}} [opts.profiles]
45
44
  * @param {Function} opts.query - SDK query (injected for testability).
46
45
  * @param {number} [opts.maxTurns] - Agent-under-test turn budget.
@@ -60,7 +59,9 @@ export class BenchmarkRunner {
60
59
  family,
61
60
  runs,
62
61
  output,
63
- model,
62
+ agentModel,
63
+ supervisorModel,
64
+ judgeModel,
64
65
  profiles,
65
66
  query,
66
67
  maxTurns,
@@ -74,12 +75,16 @@ export class BenchmarkRunner {
74
75
  if (!Number.isInteger(runs) || runs < 1)
75
76
  throw new Error("runs must be an integer ≥ 1");
76
77
  if (!output) throw new Error("output is required");
77
- if (!model) throw new Error("model is required");
78
+ if (!agentModel) throw new Error("agentModel is required");
79
+ if (!supervisorModel) throw new Error("supervisorModel is required");
80
+ if (!judgeModel) throw new Error("judgeModel is required");
78
81
  if (!query) throw new Error("query is required");
79
82
  this.familyInput = family;
80
83
  this.runs = runs;
81
84
  this.output = output;
82
- this.model = model;
85
+ this.agentModel = agentModel;
86
+ this.supervisorModel = supervisorModel;
87
+ this.judgeModel = judgeModel;
83
88
  this.profiles = {
84
89
  agent: profiles?.agent ?? null,
85
90
  judge: profiles?.judge ?? null,
@@ -103,14 +108,21 @@ export class BenchmarkRunner {
103
108
  : this.familyInput;
104
109
 
105
110
  await mkdir(this.output, { recursive: true });
106
- const { stagingDir, skillSetHash } = await installApm(family, this.output);
111
+ const { stagingDir, skillSetHash, judgeProfilesDir } = await installApm(
112
+ family,
113
+ this.output,
114
+ );
107
115
 
108
116
  const tasks = family.tasks();
109
117
  for (const task of tasks) {
110
118
  await assertPreflightExecutable(task);
111
119
  }
112
120
  if (this.profiles.judge) {
113
- await assertJudgeProfileStaged(family, stagingDir, this.profiles.judge);
121
+ await assertJudgeProfileStaged(
122
+ family,
123
+ judgeProfilesDir,
124
+ this.profiles.judge,
125
+ );
114
126
  }
115
127
 
116
128
  const wm = createWorkdirManager({
@@ -130,6 +142,7 @@ export class BenchmarkRunner {
130
142
  task,
131
143
  runIndex,
132
144
  skillSetHash,
145
+ judgeProfilesDir,
133
146
  );
134
147
  await writeRecord(resultsStream, record);
135
148
  yield record;
@@ -140,7 +153,7 @@ export class BenchmarkRunner {
140
153
  }
141
154
  }
142
155
 
143
- async #runOne(family, wm, task, runIndex, skillSetHash) {
156
+ async #runOne(family, wm, task, runIndex, skillSetHash, judgeProfilesDir) {
144
157
  const t0 = Date.now();
145
158
  const workdir = await wm.start(task, runIndex);
146
159
  try {
@@ -176,8 +189,9 @@ export class BenchmarkRunner {
176
189
  scoring,
177
190
  {
178
191
  query: this.query,
179
- model: this.model,
192
+ model: this.judgeModel,
180
193
  judgeProfile: this.profiles.judge ?? undefined,
194
+ profilesDir: judgeProfilesDir,
181
195
  },
182
196
  judgeContext,
183
197
  );
@@ -194,13 +208,18 @@ export class BenchmarkRunner {
194
208
  costUsd,
195
209
  turns,
196
210
  agentTracePath: workdir.agentTracePath,
211
+ supervisorTracePath: workdir.supervisorTracePath,
197
212
  judgeTracePath: workdir.judgeTracePath,
198
213
  profiles: {
199
214
  agent: this.profiles.agent,
200
215
  supervisor: null,
201
216
  judge: this.profiles.judge,
202
217
  },
203
- model: this.model,
218
+ model: {
219
+ agent: this.agentModel,
220
+ supervisor: this.supervisorModel,
221
+ judge: this.judgeModel,
222
+ },
204
223
  skillSetHash,
205
224
  familyRevision: family.familyRevision,
206
225
  durationMs: Date.now() - t0,
@@ -236,54 +255,43 @@ export class BenchmarkRunner {
236
255
  }
237
256
 
238
257
  /**
239
- * Run the agent-under-test as a bare AgentRunner (design Decision 14).
240
- * Recover cost/turns/submission from the trace by replaying it into a
241
- * fresh TraceCollector the bare runner writes a single NDJSON stream
242
- * with one terminal `result` event.
243
- *
244
- * Inspects both thrown errors AND the resolved `{success, aborted, error}`
245
- * shape returned by `AgentRunner.run()` (agent-runner.js:69, 166–194):
246
- * the SDK iterator catches its own errors and resolves with `success:
247
- * false`, so a try/catch alone would silently treat a failed session as
248
- * a successful one (plan Step 8.5.c).
258
+ * Run the agent-under-test via a Supervisor relay. The supervisor writes
259
+ * a combined tagged NDJSON trace; after the session we split it into
260
+ * agent.ndjson and supervisor.ndjson and extract cost/turns/submission.
249
261
  */
250
262
  async #runAgent(task, workdir) {
251
- const agentTraceStream = createWriteStream(workdir.agentTracePath);
252
- const systemPrompt = this.profiles.agent
253
- ? composeProfilePrompt(this.profiles.agent, {
254
- profilesDir: resolvePath(workdir.cwd, ".claude/agents"),
255
- trailer: AGENT_SYSTEM_PROMPT,
256
- })
257
- : undefined;
258
- const runner = createAgentRunner({
259
- cwd: workdir.cwd,
263
+ const combinedPath = join(workdir.runDir, ".combined.ndjson");
264
+ const combinedStream = createWriteStream(combinedPath);
265
+ const supervisor = createSupervisor({
266
+ supervisorCwd: workdir.cwd,
267
+ agentCwd: workdir.cwd,
260
268
  query: this.query,
261
- output: agentTraceStream,
262
- model: this.model,
269
+ output: combinedStream,
270
+ agentModel: this.agentModel,
271
+ supervisorModel: this.supervisorModel,
263
272
  maxTurns: this.maxTurns ?? 50,
264
273
  allowedTools: BASE_TOOLS,
265
- settingSources: ["project"],
266
- systemPrompt,
274
+ ...(this.profiles.agent && { agentProfile: this.profiles.agent }),
267
275
  redactor: createRedactor(),
268
276
  });
269
277
  const instructions = await readFile(task.paths.instructions, "utf8");
270
278
  let agentError = null;
271
279
  try {
272
- const result = await runner.run(instructions);
273
- if (!result.success) {
274
- agentError = {
275
- message:
276
- result.error?.message ??
277
- (result.aborted ? "aborted" : "agent did not succeed"),
278
- aborted: result.aborted ?? false,
279
- };
280
+ const result = await supervisor.run(instructions);
281
+ if (!result.success && !result.concluded) {
282
+ agentError = { message: "supervisor did not succeed", aborted: false };
280
283
  }
281
284
  } catch (e) {
282
285
  agentError = { message: e.message ?? String(e), aborted: false };
283
286
  } finally {
284
- await new Promise((r) => agentTraceStream.end(r));
287
+ await new Promise((r) => combinedStream.end(r));
285
288
  }
286
- const summary = await readAgentSummary(workdir.agentTracePath);
289
+ const summary = await splitAndSummarize(
290
+ combinedPath,
291
+ workdir.agentTracePath,
292
+ workdir.supervisorTracePath,
293
+ );
294
+ await unlink(combinedPath).catch(() => {});
287
295
  return { ...summary, agentError };
288
296
  }
289
297
 
@@ -321,11 +329,16 @@ export class BenchmarkRunner {
321
329
  supervisor: null,
322
330
  judge: this.profiles.judge,
323
331
  },
324
- model: this.model,
332
+ model: {
333
+ agent: this.agentModel,
334
+ supervisor: this.supervisorModel,
335
+ judge: this.judgeModel,
336
+ },
325
337
  skillSetHash,
326
338
  familyRevision,
327
339
  durationMs,
328
340
  agentTracePath: workdir.agentTracePath,
341
+ supervisorTracePath: workdir.supervisorTracePath,
329
342
  judgeTracePath: workdir.judgeTracePath,
330
343
  };
331
344
  }
@@ -366,7 +379,7 @@ async function writeRecord(stream, record) {
366
379
  * is missing or non-executable, before any agent session starts."
367
380
  */
368
381
  async function assertPreflightExecutable(task) {
369
- const path = join(task.paths.workdir, "scripts", "preflight.sh");
382
+ const path = join(task.paths.hooks, "preflight.sh");
370
383
  try {
371
384
  await access(path, constants.X_OK);
372
385
  } catch (e) {
@@ -377,35 +390,67 @@ async function assertPreflightExecutable(task) {
377
390
  }
378
391
 
379
392
  /**
380
- * Replay the bare AgentRunner trace into a fresh TraceCollector to recover
381
- * cost, turn count, and the final assistant text block (the submission).
393
+ * Split the combined supervisor trace into agent and supervisor files, and
394
+ * extract cost, turn count, and submission in a single pass. Agent-source
395
+ * events go to `agentPath`; supervisor and orchestrator events go to
396
+ * `supervisorPath`.
382
397
  */
383
- async function readAgentSummary(tracePath) {
384
- const collector = createTraceCollector();
385
- const stream = createReadStream(tracePath);
386
- const rl = createInterface({ input: stream, crlfDelay: Infinity });
387
- for await (const line of rl) collector.addLine(line);
388
- const json = collector.toJSON();
389
- const summary = json.summary ?? {};
390
- return {
391
- costUsd:
392
- typeof summary.totalCostUsd === "number" ? summary.totalCostUsd : 0,
393
- turns: typeof summary.numTurns === "number" ? summary.numTurns : 0,
394
- submission: lastAssistantText(json),
395
- };
398
+ // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: stream-splitting state machine
399
+ async function splitAndSummarize(combinedPath, agentPath, supervisorPath) {
400
+ const agentStream = createWriteStream(agentPath);
401
+ const supStream = createWriteStream(supervisorPath);
402
+ const rl = createInterface({
403
+ input: createReadStream(combinedPath),
404
+ crlfDelay: Infinity,
405
+ });
406
+ let agentCost = 0;
407
+ let supervisorCost = 0;
408
+ let turns = 0;
409
+ let submission = "";
410
+ for await (const line of rl) {
411
+ if (!line.trim()) continue;
412
+ let event;
413
+ try {
414
+ event = JSON.parse(line);
415
+ } catch {
416
+ continue;
417
+ }
418
+ const target = event.source === "agent" ? agentStream : supStream;
419
+ target.write(line + "\n");
420
+ const inner = event.event;
421
+ if (!inner) continue;
422
+ if (event.source === "agent") {
423
+ if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
424
+ agentCost = inner.total_cost_usd;
425
+ }
426
+ if (inner.type === "assistant") {
427
+ const text = extractText(inner);
428
+ if (text) submission = text;
429
+ }
430
+ }
431
+ if (event.source === "supervisor") {
432
+ if (inner.type === "result" && typeof inner.total_cost_usd === "number") {
433
+ supervisorCost = inner.total_cost_usd;
434
+ }
435
+ }
436
+ if (event.source === "orchestrator" && inner.type === "summary") {
437
+ turns = inner.turns ?? 0;
438
+ }
439
+ }
440
+ await Promise.all([
441
+ new Promise((r) => agentStream.end(r)),
442
+ new Promise((r) => supStream.end(r)),
443
+ ]);
444
+ return { costUsd: agentCost + supervisorCost, turns, submission };
396
445
  }
397
446
 
398
- function lastAssistantText(json) {
399
- const turns = json.turns ?? [];
400
- for (let i = turns.length - 1; i >= 0; i--) {
401
- const turn = turns[i];
402
- if (turn.role !== "assistant") continue;
403
- const content = turn.content ?? [];
404
- for (let j = content.length - 1; j >= 0; j--) {
405
- if (content[j].type === "text" && content[j].text) return content[j].text;
406
- }
447
+ function extractText(inner) {
448
+ const content = inner.message?.content ?? inner.content;
449
+ if (!Array.isArray(content)) return null;
450
+ for (let i = content.length - 1; i >= 0; i--) {
451
+ if (content[i].type === "text" && content[i].text) return content[i].text;
407
452
  }
408
- return "";
453
+ return null;
409
454
  }
410
455
 
411
456
  /**
@@ -1,10 +1,7 @@
1
1
  /**
2
- * Scorer — runs `<task.paths.scoring>/run.sh` from the template path against
2
+ * Scorer — runs `<task.paths.hooks>/score.sh` from the template path against
3
3
  * the post-run agent CWD. The exit code is authoritative for the verdict;
4
4
  * structured per-test rows arrive on fd 3 (`$RESULTS_FD=3`) as NDJSON.
5
- *
6
- * Scoring scripts are never copied into the agent CWD — they live only in the
7
- * task template (design Decision 3).
8
5
  */
9
6
 
10
7
  import { spawn } from "node:child_process";
@@ -32,7 +29,7 @@ import { join } from "node:path";
32
29
  */
33
30
  export function runScoring(task, ctx) {
34
31
  return new Promise((res, rej) => {
35
- const script = join(task.paths.scoring, "run.sh");
32
+ const script = join(task.paths.hooks, "score.sh");
36
33
  const stderrLog = createWriteStream(join(ctx.runDir, "scoring.stderr.log"));
37
34
 
38
35
  // Bun's child_process pipe setup for fd >= 3 is racy under load (it
@@ -4,13 +4,14 @@
4
4
  * apm.lock.yaml
5
5
  * .claude/ # pre-staged skills + agents (P1)
6
6
  * tasks/<task_name>/
7
- * instructions.md
7
+ * agent.task.md
8
8
  * supervisor.task.md # preserved for v2; not read in v1
9
9
  * judge.task.md
10
+ * hooks/ # harness-only; never copied to agent CWD
11
+ * preflight.sh
12
+ * score.sh
10
13
  * specs/ # copied into agent CWD
11
- * workdir/ # copied into agent CWD (excludes scripts/)
12
- * scripts/preflight.sh
13
- * scoring/ # template-only; never copied
14
+ * workdir/ # copied into agent CWD
14
15
  *
15
16
  * Local paths or git URLs are both accepted; git URLs are shallow-cloned into
16
17
  * a temp dir and `familyRevision` becomes `git:<sha>` of HEAD at clone time.
@@ -53,13 +54,11 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
53
54
  familyRevision = "sha256:" + (await canonicalTreeHash(rootPath));
54
55
  }
55
56
 
56
- const apmLockBytes = await readApmLockBytes(rootPath);
57
57
  const tasks = await discoverTasks(rootPath);
58
58
 
59
59
  return {
60
60
  rootPath,
61
61
  familyRevision,
62
- apmLockBytes,
63
62
  tasks() {
64
63
  return tasks;
65
64
  },
@@ -67,58 +66,27 @@ export async function loadTaskFamily(rootPathOrGitUrl) {
67
66
  }
68
67
 
69
68
  /**
70
- * Assert that `<stagingDir>/.claude/agents/<judgeProfile>.md` exists. Called
71
- * from `BenchmarkRunner.run()` so a missing judge profile fails the family
69
+ * Assert that `<judgeProfilesDir>/<judgeProfile>.md` exists. Called from
70
+ * `BenchmarkRunner.run()` so a missing judge profile fails the family
72
71
  * install before any agent session starts.
73
72
  * @param {TaskFamily} _family
74
- * @param {string} stagingDir
73
+ * @param {string} judgeProfilesDir
75
74
  * @param {string} judgeProfile
76
75
  * @returns {Promise<void>}
77
76
  */
78
77
  export async function assertJudgeProfileStaged(
79
78
  _family,
80
- stagingDir,
79
+ judgeProfilesDir,
81
80
  judgeProfile,
82
81
  ) {
83
- const candidate = join(stagingDir, ".claude", "agents", `${judgeProfile}.md`);
82
+ const candidate = join(judgeProfilesDir, `${judgeProfile}.md`);
84
83
  try {
85
84
  await access(candidate);
86
85
  } catch {
87
- throw new Error(
88
- `judge profile not staged: ${candidate} (createSupervisor resolves profiles relative to <supervisorCwd>/.claude/agents)`,
89
- );
86
+ throw new Error(`judge profile not staged: ${candidate}`);
90
87
  }
91
88
  }
92
89
 
93
- async function readApmLockBytes(rootPath) {
94
- const lockPath = join(rootPath, "apm.lock.yaml");
95
- try {
96
- const raw = await readFile(lockPath);
97
- return normalizeLf(raw);
98
- } catch (e) {
99
- if (e.code === "ENOENT") {
100
- throw new Error(
101
- `task family missing apm.lock.yaml at ${lockPath} (matches libpack stager.js:126; .yml is not accepted)`,
102
- );
103
- }
104
- throw e;
105
- }
106
- }
107
-
108
- /**
109
- * Replace CRLF with LF so cross-OS authored lockfiles hash identically.
110
- * @param {Buffer} buf
111
- * @returns {Buffer}
112
- */
113
- function normalizeLf(buf) {
114
- const out = [];
115
- for (let i = 0; i < buf.length; i++) {
116
- if (buf[i] === 0x0d && i + 1 < buf.length && buf[i + 1] === 0x0a) continue;
117
- out.push(buf[i]);
118
- }
119
- return Buffer.from(out);
120
- }
121
-
122
90
  async function discoverTasks(rootPath) {
123
91
  const tasksRoot = join(rootPath, "tasks");
124
92
  const tasks = [];
@@ -135,12 +103,12 @@ async function discoverTasks(rootPath) {
135
103
  tasks.push({
136
104
  id: entry.name,
137
105
  paths: {
138
- instructions: join(taskDir, "instructions.md"),
106
+ instructions: join(taskDir, "agent.task.md"),
139
107
  supervisor: join(taskDir, "supervisor.task.md"),
140
108
  judge: join(taskDir, "judge.task.md"),
109
+ hooks: join(taskDir, "hooks"),
141
110
  specs: join(taskDir, "specs"),
142
111
  workdir: join(taskDir, "workdir"),
143
- scoring: join(taskDir, "scoring"),
144
112
  },
145
113
  });
146
114
  }
@@ -242,13 +210,12 @@ function run(cmd, args) {
242
210
  /**
243
211
  * @typedef {object} Task
244
212
  * @property {string} id - Task name (directory name under tasks/)
245
- * @property {{instructions: string, supervisor: string, judge: string, specs: string, workdir: string, scoring: string}} paths
213
+ * @property {{instructions: string, supervisor: string, judge: string, hooks: string, specs: string, workdir: string}} paths
246
214
  */
247
215
 
248
216
  /**
249
217
  * @typedef {object} TaskFamily
250
218
  * @property {string} rootPath
251
219
  * @property {string} familyRevision - `git:<sha>` or `sha256:<hex>`
252
- * @property {Buffer} apmLockBytes - LF-normalised
253
220
  * @property {() => Task[]} tasks
254
221
  */
@@ -11,9 +11,8 @@ import { spawn } from "node:child_process";
11
11
  import { cp, mkdir } from "node:fs/promises";
12
12
  import { createServer } from "node:net";
13
13
  import { connect } from "node:net";
14
- import { join, sep } from "node:path";
14
+ import { join } from "node:path";
15
15
 
16
- const PREFLIGHT_REL = join("workdir", "scripts");
17
16
  const DEFAULT_TERM_GRACE_MS = 5_000;
18
17
 
19
18
  /**
@@ -24,6 +23,7 @@ const DEFAULT_TERM_GRACE_MS = 5_000;
24
23
  * @property {number} pgid - Process-group id captured from the preflight child.
25
24
  * @property {*} scaffold - Reserved per design § Components; v1 sets null.
26
25
  * @property {string} agentTracePath
26
+ * @property {string} supervisorTracePath
27
27
  * @property {string} judgeTracePath
28
28
  * @property {{phase: string, message: string, exitCode: number}} [preflightError]
29
29
  */
@@ -55,9 +55,8 @@ export class WorkdirManager {
55
55
  const cwd = join(runDir, "cwd");
56
56
  await mkdir(cwd, { recursive: true });
57
57
 
58
- await cp(task.paths.workdir, cwd, {
59
- recursive: true,
60
- filter: (src) => !src.endsWith(sep + PREFLIGHT_REL),
58
+ await cp(task.paths.workdir, cwd, { recursive: true }).catch((e) => {
59
+ if (e.code !== "ENOENT") throw e;
61
60
  });
62
61
  await cp(task.paths.specs, join(cwd, "specs"), {
63
62
  recursive: true,
@@ -70,9 +69,10 @@ export class WorkdirManager {
70
69
 
71
70
  const port = await allocatePort();
72
71
  const agentTracePath = join(runDir, "agent.ndjson");
72
+ const supervisorTracePath = join(runDir, "supervisor.ndjson");
73
73
  const judgeTracePath = join(runDir, "judge.ndjson");
74
74
 
75
- const preflightScript = join(task.paths.workdir, "scripts", "preflight.sh");
75
+ const preflightScript = join(task.paths.hooks, "preflight.sh");
76
76
  const preflight = await runPreflight(preflightScript, cwd, port);
77
77
 
78
78
  return {
@@ -82,6 +82,7 @@ export class WorkdirManager {
82
82
  pgid: preflight.pgid,
83
83
  scaffold: null,
84
84
  agentTracePath,
85
+ supervisorTracePath,
85
86
  judgeTracePath,
86
87
  ...(preflight.error && { preflightError: preflight.error }),
87
88
  };
@@ -0,0 +1,145 @@
1
+ import { existsSync, readFileSync } from "node:fs";
2
+ import { basename } from "node:path";
3
+ import jmespath from "jmespath";
4
+
5
+ /**
6
+ * Evaluate an assertion and return the structured result.
7
+ * @param {object} values - { grep?: string, query?: string, exists?: boolean, not?: boolean, message?: string }
8
+ * @param {string[]} args - [testName, file]
9
+ * @returns {{ test: string, pass: boolean, message?: string }}
10
+ */
11
+ // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: assertion dispatch by type
12
+ export function evaluateAssertion(values, args) {
13
+ const testName = args[0];
14
+ if (!testName) throw new Error("assert: missing test name");
15
+
16
+ const file = args[1];
17
+ const modes = [
18
+ values.grep,
19
+ values.query,
20
+ values.exists,
21
+ values["cites-job"],
22
+ ].filter((v) => v !== undefined && v !== false);
23
+ if (modes.length === 0) {
24
+ throw new Error(
25
+ "assert: specify one of --grep, --query, --exists, or --cites-job",
26
+ );
27
+ }
28
+ if (modes.length > 1) {
29
+ throw new Error(
30
+ "assert: specify only one of --grep, --query, --exists, or --cites-job",
31
+ );
32
+ }
33
+
34
+ let result;
35
+ if (values.exists) {
36
+ if (!file) throw new Error("assert: missing file argument");
37
+ result = assertExists(file);
38
+ } else if (values.grep) {
39
+ if (!file) throw new Error("assert: missing file argument for --grep");
40
+ result = assertGrep(values.grep, file);
41
+ } else if (values["cites-job"]) {
42
+ if (!file) throw new Error("assert: missing file argument for --cites-job");
43
+ result = assertCitesJob(values["cites-job"], file);
44
+ } else {
45
+ if (!file) throw new Error("assert: missing file argument for --query");
46
+ result = assertQuery(values.query, file);
47
+ }
48
+
49
+ if (values.not) {
50
+ result.pass = !result.pass;
51
+ if (result.pass) {
52
+ delete result.message;
53
+ } else {
54
+ result.message =
55
+ result.message ?? `inverted assertion failed for ${basename(file)}`;
56
+ }
57
+ }
58
+
59
+ if (!result.pass && values.message) {
60
+ result.message = values.message;
61
+ }
62
+
63
+ const output = { test: testName, pass: result.pass };
64
+ if (result.message) output.message = result.message;
65
+ return output;
66
+ }
67
+
68
+ /**
69
+ * Run an assertion, write JSON to stdout, and set process.exitCode on failure.
70
+ * @param {object} values
71
+ * @param {string[]} args
72
+ */
73
+ export async function runAssertCommand(values, args) {
74
+ const result = evaluateAssertion(values, args);
75
+ process.stdout.write(JSON.stringify(result) + "\n");
76
+ if (!result.pass) process.exitCode = 1;
77
+ }
78
+
79
+ function assertExists(file) {
80
+ if (existsSync(file)) return { pass: true };
81
+ return { pass: false, message: `${file} not found` };
82
+ }
83
+
84
+ function assertGrep(pattern, file) {
85
+ const content = readFileSync(file, "utf8");
86
+ const re = new RegExp(pattern, "im");
87
+ if (re.test(content)) return { pass: true };
88
+ return {
89
+ pass: false,
90
+ message: `pattern "${pattern}" not found in ${basename(file)}`,
91
+ };
92
+ }
93
+
94
+ function assertQuery(expression, file) {
95
+ const content = readFileSync(file, "utf8");
96
+ const data = parseJsonOrNdjson(content);
97
+ const result = jmespath.search(data, expression);
98
+ const truthy =
99
+ result !== null &&
100
+ result !== undefined &&
101
+ result !== false &&
102
+ (Array.isArray(result) ? result.length > 0 : true);
103
+ if (truthy) return { pass: true };
104
+ return {
105
+ pass: false,
106
+ message: `query returned ${JSON.stringify(result)}`,
107
+ };
108
+ }
109
+
110
+ const JOB_TAG_RE = /<job\s+user="([^"]*)"\s+goal="([^"]*)">/;
111
+
112
+ function assertCitesJob(jobFile, file) {
113
+ const jobContent = readFileSync(jobFile, "utf8");
114
+ const match = JOB_TAG_RE.exec(jobContent);
115
+ if (!match) {
116
+ return {
117
+ pass: false,
118
+ message: `no <job> tag found in ${basename(jobFile)}`,
119
+ };
120
+ }
121
+ const citation = `${match[1]}: ${match[2]}`;
122
+ const content = readFileSync(file, "utf8");
123
+ if (content.includes(citation)) return { pass: true };
124
+ return { pass: false, message: `missing "${citation}"` };
125
+ }
126
+
127
+ function parseJsonOrNdjson(content) {
128
+ try {
129
+ return JSON.parse(content);
130
+ } catch {
131
+ // Fall through to NDJSON
132
+ }
133
+ const lines = [];
134
+ for (const raw of content.split("\n")) {
135
+ const trimmed = raw.trim();
136
+ if (!trimmed) continue;
137
+ try {
138
+ lines.push(JSON.parse(trimmed));
139
+ } catch {
140
+ // skip unparseable lines
141
+ }
142
+ }
143
+ if (lines.length === 0) throw new Error("assert: no valid JSON in file");
144
+ return lines;
145
+ }
@@ -13,8 +13,7 @@ import { aggregate, renderTextReport } from "../benchmark/report.js";
13
13
  * @param {string[]} _args
14
14
  */
15
15
  export async function runBenchmarkReportCommand(values, _args) {
16
- const inputDir = values.input;
17
- if (!inputDir) throw new Error("--input is required");
16
+ const inputDir = values.input ?? "benchmark-runs";
18
17
  const kRaw = values.k ?? "1,3,5";
19
18
  const kValues = kRaw.split(",").map((t) => {
20
19
  const n = Number.parseInt(t.trim(), 10);