@forwardimpact/libeval 0.1.39 → 0.1.41

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects those versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.39",
3
+ "version": "0.1.41",
4
4
  "description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
5
5
  "keywords": [
6
6
  "eval",
@@ -77,22 +77,7 @@ export class AgentRunner {
77
77
  try {
78
78
  const iterator = this.query({
79
79
  prompt: effectiveTask,
80
- options: {
81
- cwd: this.cwd,
82
- allowedTools: this.allowedTools,
83
- maxTurns:
84
- this.maxTurns === 0 ? Number.MAX_SAFE_INTEGER : this.maxTurns,
85
- model: this.model,
86
- permissionMode: PERMISSION_MODE,
87
- allowDangerouslySkipPermissions: true,
88
- settingSources: this.settingSources,
89
- abortController,
90
- ...(this.disallowedTools.length > 0 && {
91
- disallowedTools: this.disallowedTools,
92
- }),
93
- ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
94
- ...(this.mcpServers && { mcpServers: this.mcpServers }),
95
- },
80
+ options: this.#callOptions(abortController),
96
81
  });
97
82
  return await this.#consumeQuery(iterator);
98
83
  } finally {
@@ -112,12 +97,8 @@ export class AgentRunner {
112
97
  const iterator = this.query({
113
98
  prompt,
114
99
  options: {
100
+ ...this.#callOptions(abortController),
115
101
  resume: this.sessionId,
116
- model: this.model,
117
- permissionMode: PERMISSION_MODE,
118
- allowDangerouslySkipPermissions: true,
119
- abortController,
120
- ...(this.mcpServers && { mcpServers: this.mcpServers }),
121
102
  },
122
103
  });
123
104
  return await this.#consumeQuery(iterator);
@@ -126,6 +107,37 @@ export class AgentRunner {
126
107
  }
127
108
  }
128
109
 
110
+ /**
111
+ * Build the options passed to every SDK query() call. Shared by run() and
112
+ * resume() so the agent's configuration — cwd, tools, prompt, setting
113
+ * sources, turn budget — is identical across the session's lifetime. Only
114
+ * resume() layers `resume: this.sessionId` on top.
115
+ *
116
+ * SDK options are call-attached, not session-attached: the resumed call
117
+ * loads the prior conversation but otherwise uses whatever options this
118
+ * call passes. Omitting tool/prompt/setting options on resume causes the
119
+ * agent to silently lose its restrictions and persona between turns.
120
+ * @param {AbortController} abortController
121
+ * @returns {object}
122
+ */
123
+ #callOptions(abortController) {
124
+ return {
125
+ cwd: this.cwd,
126
+ allowedTools: this.allowedTools,
127
+ maxTurns: this.maxTurns === 0 ? Number.MAX_SAFE_INTEGER : this.maxTurns,
128
+ model: this.model,
129
+ permissionMode: PERMISSION_MODE,
130
+ allowDangerouslySkipPermissions: true,
131
+ settingSources: this.settingSources,
132
+ abortController,
133
+ ...(this.disallowedTools.length > 0 && {
134
+ disallowedTools: this.disallowedTools,
135
+ }),
136
+ ...(this.systemPrompt && { systemPrompt: this.systemPrompt }),
137
+ ...(this.mcpServers && { mcpServers: this.mcpServers }),
138
+ };
139
+ }
140
+
129
141
  /**
130
142
  * Shared consumer for both `run()` and `resume()`. Iterates the SDK query
131
143
  * iterator, mirroring every line to the output stream / buffer / onLine
@@ -3,71 +3,109 @@
3
3
  * materialise skills and agents, copies the resulting `.claude/` into a
4
4
  * staging directory, and computes the manifest fingerprint from the lockfile.
5
5
  * Per-task copy happens later in WorkdirManager.
6
+ *
7
+ * The class takes a `spawn` seam so tests can substitute a fake child process
8
+ * without ever shelling out to a real `apm` binary. See `createApmInstaller`
9
+ * for the real-dependency wiring; `installApm` is a thin free-function wrapper
10
+ * for callers that don't need to inject anything.
6
11
  */
7
12
 
8
- import { spawn } from "node:child_process";
13
+ import { spawn as nodeSpawn } from "node:child_process";
9
14
  import { createHash } from "node:crypto";
10
15
  import { access, cp, mkdir, readFile, rm } from "node:fs/promises";
11
16
  import { join } from "node:path";
12
17
 
13
- /**
14
- * @param {import("./task-family.js").TaskFamily} family
15
- * @param {string} outputDir - The benchmark run's output directory.
16
- * @returns {Promise<{stagingDir: string, skillSetHash: string}>}
17
- */
18
- export async function installApm(family, outputDir) {
19
- const stagingDir = join(outputDir, ".apm-staging");
20
- const stagedClaude = join(stagingDir, ".claude");
21
- const sourceClaude = join(family.rootPath, ".claude");
22
- const apmYml = join(family.rootPath, "apm.yml");
18
+ /** Installs apm and stages `.claude/` for a task family. */
19
+ export class ApmInstaller {
20
+ /**
21
+ * @param {object} [deps]
22
+ * @param {typeof nodeSpawn} [deps.spawn] - Spawn seam (defaults to
23
+ * `node:child_process` spawn). Tests inject a fake to avoid shelling out.
24
+ */
25
+ constructor({ spawn } = {}) {
26
+ this.spawn = spawn ?? nodeSpawn;
27
+ }
23
28
 
24
- const hasApm = await access(apmYml)
25
- .then(() => true)
26
- .catch(() => false);
29
+ /**
30
+ * @param {import("./task-family.js").TaskFamily} family
31
+ * @param {string} outputDir - The benchmark run's output directory.
32
+ * @returns {Promise<{stagingDir: string, skillSetHash: string, judgeProfilesDir: string}>}
33
+ */
34
+ async install(family, outputDir) {
35
+ const stagingDir = join(outputDir, ".apm-staging");
36
+ const stagedClaude = join(stagingDir, ".claude");
37
+ const sourceClaude = join(family.rootPath, ".claude");
38
+ const apmYml = join(family.rootPath, "apm.yml");
39
+
40
+ const hasApm = await access(apmYml)
41
+ .then(() => true)
42
+ .catch(() => false);
43
+
44
+ if (hasApm) {
45
+ await this.#runApmInstall(family.rootPath);
46
+ try {
47
+ await access(sourceClaude);
48
+ } catch {
49
+ throw new Error(
50
+ `apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
51
+ );
52
+ }
53
+ }
27
54
 
28
- if (hasApm) {
29
- await runApmInstall(family.rootPath);
55
+ await rm(stagingDir, { recursive: true, force: true });
56
+ const hasClaudeDir = await access(sourceClaude)
57
+ .then(() => true)
58
+ .catch(() => false);
59
+ if (hasClaudeDir) {
60
+ await cp(sourceClaude, stagedClaude, { recursive: true });
61
+ } else {
62
+ await mkdir(stagedClaude, { recursive: true });
63
+ }
64
+
65
+ // Stage the family-local judge profile outside .claude/ so it is available
66
+ // to the judge but never copied into the agent-under-test's CWD.
67
+ const judgeSource = join(family.rootPath, "judge.md");
68
+ const judgeProfilesDir = join(stagingDir, "judge-profiles");
69
+ try {
70
+ await access(judgeSource);
71
+ await mkdir(judgeProfilesDir, { recursive: true });
72
+ await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
73
+ } catch {}
74
+
75
+ const lockPath = join(family.rootPath, "apm.lock.yaml");
76
+ let skillSetHash = "";
30
77
  try {
31
- await access(sourceClaude);
78
+ const lockBytes = await readFile(lockPath);
79
+ skillSetHash =
80
+ "sha256:" +
81
+ createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
32
82
  } catch {
33
- throw new Error(
34
- `apm install did not produce .claude/ at ${sourceClaude}; check the family's apm.yml`,
35
- );
83
+ // No lockfile — family doesn't use skill packs.
36
84
  }
37
- }
38
85
 
39
- await rm(stagingDir, { recursive: true, force: true });
40
- const hasClaudeDir = await access(sourceClaude)
41
- .then(() => true)
42
- .catch(() => false);
43
- if (hasClaudeDir) {
44
- await cp(sourceClaude, stagedClaude, { recursive: true });
45
- } else {
46
- await mkdir(stagedClaude, { recursive: true });
86
+ return { stagingDir, skillSetHash, judgeProfilesDir };
47
87
  }
48
88
 
49
- // Stage the family-local judge profile outside .claude/ so it is available
50
- // to the judge but never copied into the agent-under-test's CWD.
51
- const judgeSource = join(family.rootPath, "judge.md");
52
- const judgeProfilesDir = join(stagingDir, "judge-profiles");
53
- try {
54
- await access(judgeSource);
55
- await mkdir(judgeProfilesDir, { recursive: true });
56
- await cp(judgeSource, join(judgeProfilesDir, "judge.md"));
57
- } catch {}
58
-
59
- const lockPath = join(family.rootPath, "apm.lock.yaml");
60
- let skillSetHash = "";
61
- try {
62
- const lockBytes = await readFile(lockPath);
63
- skillSetHash =
64
- "sha256:" +
65
- createHash("sha256").update(normalizeLf(lockBytes)).digest("hex");
66
- } catch {
67
- // No lockfile — family doesn't use skill packs.
89
+ #runApmInstall(cwd) {
90
+ return new Promise((res, rej) => {
91
+ const child = this.spawn("apm", ["install", "--target", "claude"], {
92
+ cwd,
93
+ stdio: ["ignore", "pipe", "pipe"],
94
+ });
95
+ let stderr = "";
96
+ child.stdout.on("data", () => {});
97
+ child.stderr.on("data", (d) => {
98
+ stderr += d.toString();
99
+ });
100
+ child.on("error", (e) => {
101
+ rej(new Error(`failed to spawn apm: ${e.message}`));
102
+ });
103
+ child.on("close", (code) => {
104
+ if (code === 0) res();
105
+ else rej(new Error(`apm install exited ${code}: ${stderr}`));
106
+ });
107
+ });
68
108
  }
69
-
70
- return { stagingDir, skillSetHash, judgeProfilesDir };
71
109
  }
72
110
 
73
111
  function normalizeLf(buf) {
@@ -79,23 +117,20 @@ function normalizeLf(buf) {
79
117
  return Buffer.from(out);
80
118
  }
81
119
 
82
- function runApmInstall(cwd) {
83
- return new Promise((res, rej) => {
84
- const child = spawn("apm", ["install", "--target", "claude"], {
85
- cwd,
86
- stdio: ["ignore", "pipe", "pipe"],
87
- });
88
- let stderr = "";
89
- child.stdout.on("data", () => {});
90
- child.stderr.on("data", (d) => {
91
- stderr += d.toString();
92
- });
93
- child.on("error", (e) => {
94
- rej(new Error(`failed to spawn apm: ${e.message}`));
95
- });
96
- child.on("close", (code) => {
97
- if (code === 0) res();
98
- else rej(new Error(`apm install exited ${code}: ${stderr}`));
99
- });
100
- });
120
+ /**
121
+ * Factory function wires real dependencies.
122
+ * @param {ConstructorParameters<typeof ApmInstaller>[0]} [deps]
123
+ * @returns {ApmInstaller}
124
+ */
125
+ export function createApmInstaller(deps) {
126
+ return new ApmInstaller(deps);
127
+ }
128
+
129
+ /**
130
+ * Free-function shorthand for callers that don't need to inject a spawn seam.
131
+ * @param {import("./task-family.js").TaskFamily} family
132
+ * @param {string} outputDir
133
+ */
134
+ export function installApm(family, outputDir) {
135
+ return new ApmInstaller().install(family, outputDir);
101
136
  }
@@ -21,7 +21,7 @@ import { join, resolve as resolvePath } from "node:path";
21
21
 
22
22
  import { DEFAULT_ENV_ALLOWLIST, createRedactor } from "../redaction.js";
23
23
  import { createSupervisor } from "../supervisor.js";
24
- import { installApm } from "./apm-installer.js";
24
+ import { installApm as defaultInstallApm } from "./apm-installer.js";
25
25
  import { runJudge } from "./judge.js";
26
26
  import { validateResultRecord } from "./result.js";
27
27
  import { runScoring } from "./scorer.js";
@@ -64,6 +64,10 @@ export class BenchmarkRunner {
64
64
  * @param {Function} [opts.runJudge] - Test seam: replaces `runJudge`. Same
65
65
  * contract as `runJudge(task, workdir, scoring, deps)`. Internal testing
66
66
  * only.
67
+ * @param {Function} [opts.installApm] - Test seam: replaces `installApm`.
68
+ * Same contract as `installApm(family, outputDir)`. Lets tests inject a
69
+ * fake `apm` spawn (or skip the install entirely) so the suite never
70
+ * shells out to a real `apm` binary. Internal testing only.
67
71
  */
68
72
  constructor({
69
73
  family,
@@ -81,6 +85,7 @@ export class BenchmarkRunner {
81
85
  runAgent,
82
86
  runScoring: runScoringHook,
83
87
  runJudge: runJudgeHook,
88
+ installApm: installApmHook,
84
89
  }) {
85
90
  if (!family) throw new Error("family is required");
86
91
  if (!Number.isInteger(runs) || runs < 1)
@@ -105,6 +110,7 @@ export class BenchmarkRunner {
105
110
  this._runAgentHook = runAgent ?? null;
106
111
  this._runScoringHook = runScoringHook ?? runScoring;
107
112
  this._runJudgeHook = runJudgeHook ?? runJudge;
113
+ this._installApmHook = installApmHook ?? defaultInstallApm;
108
114
  }
109
115
 
110
116
  /**
@@ -118,10 +124,8 @@ export class BenchmarkRunner {
118
124
  : this.familyInput;
119
125
 
120
126
  await mkdir(this.output, { recursive: true });
121
- const { stagingDir, skillSetHash, judgeProfilesDir } = await installApm(
122
- family,
123
- this.output,
124
- );
127
+ const { stagingDir, skillSetHash, judgeProfilesDir } =
128
+ await this._installApmHook(family, this.output);
125
129
 
126
130
  const tasks = family.tasks();
127
131
  if (this.profiles.judge) {
package/src/supervisor.js CHANGED
@@ -104,7 +104,6 @@ export class Supervisor {
104
104
  */
105
105
  async run(task) {
106
106
  const initialTask = this.taskAmend ? `${task}\n\n${this.taskAmend}` : task;
107
- this.taskContext = initialTask;
108
107
  this.currentSource = "supervisor";
109
108
  this.currentTurn = 0;
110
109
  let supervisorResult = await this.supervisorRunner.run(initialTask);
@@ -252,22 +251,6 @@ export class Supervisor {
252
251
  return { type: "continue" };
253
252
  }
254
253
 
255
- /**
256
- * Resume the supervisor runner, falling back to a fresh session when the
257
- * SDK reports that the conversation no longer exists (e.g. session GC'd
258
- * while the agent was running). The fresh session includes the original
259
- * task context so the supervisor can still evaluate the agent's work.
260
- * @param {string} prompt
261
- * @returns {Promise<object>}
262
- */
263
- async #resumeSupervisor(prompt) {
264
- const result = await this.supervisorRunner.resume(prompt);
265
- if (result.error && isSessionNotFound(result.error)) {
266
- return this.supervisorRunner.run(`${this.taskContext}\n\n${prompt}`);
267
- }
268
- return result;
269
- }
270
-
271
254
  /**
272
255
  * If the agent has an unanswered ask, drain reminders and return a
273
256
  * formatted relay string. Returns null when no relay is needed.
@@ -295,7 +278,7 @@ export class Supervisor {
295
278
  this.currentSource = "supervisor";
296
279
  this.ctx.redirect = null;
297
280
 
298
- await this.#resumeSupervisor(
281
+ await this.supervisorRunner.resume(
299
282
  `The agent is mid-turn. Latest batch:\n\n${batchTranscript}\n\n` +
300
283
  `Review and use your tools if action is needed.`,
301
284
  );
@@ -333,7 +316,7 @@ export class Supervisor {
333
316
  `Review and decide how to proceed.`
334
317
  : `The agent reported:\n\n${agentTranscript}\n\nReview the agent's work and decide how to proceed.`;
335
318
 
336
- let supervisorResult = await this.#resumeSupervisor(reviewPrompt);
319
+ let supervisorResult = await this.supervisorRunner.resume(reviewPrompt);
337
320
 
338
321
  if (supervisorResult.error) {
339
322
  this.emitSummary({ success: false, turns: turn });
@@ -354,7 +337,7 @@ export class Supervisor {
354
337
  if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
355
338
  const reminders = this.messageBus.drain("supervisor");
356
339
  if (reminders.length > 0) {
357
- supervisorResult = await this.#resumeSupervisor(
340
+ supervisorResult = await this.supervisorRunner.resume(
358
341
  formatMessages(reminders),
359
342
  );
360
343
  if (this.ctx.concluded) {
@@ -617,8 +600,3 @@ export function createSupervisor({
617
600
  });
618
601
  return supervisor;
619
602
  }
620
-
621
- function isSessionNotFound(error) {
622
- const msg = error?.message ?? String(error);
623
- return msg.includes("No conversation found with session ID");
624
- }