@forwardimpact/libeval 0.1.50 → 0.1.51

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (38)
  1. package/README.md +11 -8
  2. package/bin/fit-benchmark.js +26 -27
  3. package/bin/fit-eval.js +49 -30
  4. package/bin/fit-trace.js +83 -57
  5. package/package.json +1 -1
  6. package/src/agent-runner.js +20 -12
  7. package/src/benchmark/env-loader.js +35 -23
  8. package/src/benchmark/{scorer.js → invariants.js} +14 -12
  9. package/src/benchmark/judge.js +5 -8
  10. package/src/benchmark/report.js +15 -15
  11. package/src/benchmark/result.js +11 -11
  12. package/src/benchmark/runner.js +11 -11
  13. package/src/benchmark/task-family.js +6 -4
  14. package/src/benchmark/workdir.js +18 -3
  15. package/src/commands/assert.js +30 -22
  16. package/src/commands/benchmark-invariants.js +74 -0
  17. package/src/commands/benchmark-report.js +23 -15
  18. package/src/commands/benchmark-run.js +15 -8
  19. package/src/commands/by-discussion.js +29 -18
  20. package/src/commands/callback.js +20 -11
  21. package/src/commands/discuss.js +28 -11
  22. package/src/commands/facilitate.js +18 -12
  23. package/src/commands/output.js +11 -12
  24. package/src/commands/run.js +22 -12
  25. package/src/commands/supervise.js +27 -18
  26. package/src/commands/task-input.js +10 -5
  27. package/src/commands/trace.js +174 -97
  28. package/src/discuss-tools.js +48 -2
  29. package/src/discusser.js +49 -2
  30. package/src/events/github.js +27 -5
  31. package/src/inbox-poller.js +84 -0
  32. package/src/judge.js +1 -1
  33. package/src/message-bus.js +6 -0
  34. package/src/orchestration-loop.js +14 -4
  35. package/src/orchestration-toolkit.js +14 -0
  36. package/src/redaction.js +31 -9
  37. package/src/reply-emitter.js +47 -0
  38. package/src/commands/benchmark-score.js +0 -68
@@ -1,4 +1,3 @@
1
- import { existsSync, readFileSync } from "node:fs";
2
1
  import { basename } from "node:path";
3
2
  import jmespath from "jmespath";
4
3
 
@@ -6,10 +5,11 @@ import jmespath from "jmespath";
6
5
  * Evaluate an assertion and return the structured result.
7
6
  * @param {object} values - { grep?: string, query?: string, exists?: boolean, not?: boolean, message?: string }
8
7
  * @param {string[]} args - [testName, file]
8
+ * @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`): `existsSync`, `readFileSync`.
9
9
  * @returns {{ test: string, pass: boolean, message?: string }}
10
10
  */
11
11
  // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: assertion dispatch by type
12
- export function evaluateAssertion(values, args) {
12
+ export function evaluateAssertion(values, args, fsSync) {
13
13
  const testName = args[0];
14
14
  if (!testName) throw new Error("assert: missing test name");
15
15
 
@@ -34,16 +34,16 @@ export function evaluateAssertion(values, args) {
34
34
  let result;
35
35
  if (values.exists) {
36
36
  if (!file) throw new Error("assert: missing file argument");
37
- result = assertExists(file);
37
+ result = assertExists(file, fsSync);
38
38
  } else if (values.grep) {
39
39
  if (!file) throw new Error("assert: missing file argument for --grep");
40
- result = assertGrep(values.grep, file);
40
+ result = assertGrep(values.grep, file, fsSync);
41
41
  } else if (values["cites-job"]) {
42
42
  if (!file) throw new Error("assert: missing file argument for --cites-job");
43
- result = assertCitesJob(values["cites-job"], file);
43
+ result = assertCitesJob(values["cites-job"], file, fsSync);
44
44
  } else {
45
45
  if (!file) throw new Error("assert: missing file argument for --query");
46
- result = assertQuery(values.query, file);
46
+ result = assertQuery(values.query, file, fsSync);
47
47
  }
48
48
 
49
49
  if (values.not) {
@@ -66,23 +66,31 @@ export function evaluateAssertion(values, args) {
66
66
  }
67
67
 
68
68
  /**
69
- * Run an assertion, write JSON to stdout, and set process.exitCode on failure.
70
- * @param {object} values
71
- * @param {string[]} args
69
+ * Run an assertion, write JSON to stdout, and return a failure envelope when
70
+ * the assertion does not pass.
71
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
72
+ * @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
72
73
  */
73
- export async function runAssertCommand(values, args) {
74
- const result = evaluateAssertion(values, args);
75
- process.stdout.write(JSON.stringify(result) + "\n");
76
- if (!result.pass) process.exitCode = 1;
74
+ export async function runAssertCommand(ctx) {
75
+ const runtime = ctx.deps.runtime;
76
+ const args = [ctx.args["test-name"], ctx.args.file];
77
+ let result;
78
+ try {
79
+ result = evaluateAssertion(ctx.options, args, runtime.fsSync);
80
+ } catch (err) {
81
+ return { ok: false, code: 1, error: err.message };
82
+ }
83
+ runtime.proc.stdout.write(JSON.stringify(result) + "\n");
84
+ return result.pass ? { ok: true } : { ok: false, code: 1, error: "" };
77
85
  }
78
86
 
79
- function assertExists(file) {
80
- if (existsSync(file)) return { pass: true };
87
+ function assertExists(file, fsSync) {
88
+ if (fsSync.existsSync(file)) return { pass: true };
81
89
  return { pass: false, message: `${file} not found` };
82
90
  }
83
91
 
84
- function assertGrep(pattern, file) {
85
- const content = readFileSync(file, "utf8");
92
+ function assertGrep(pattern, file, fsSync) {
93
+ const content = fsSync.readFileSync(file, "utf8");
86
94
  const re = new RegExp(pattern, "im");
87
95
  if (re.test(content)) return { pass: true };
88
96
  return {
@@ -91,8 +99,8 @@ function assertGrep(pattern, file) {
91
99
  };
92
100
  }
93
101
 
94
- function assertQuery(expression, file) {
95
- const content = readFileSync(file, "utf8");
102
+ function assertQuery(expression, file, fsSync) {
103
+ const content = fsSync.readFileSync(file, "utf8");
96
104
  const data = parseJsonOrNdjson(content);
97
105
  const result = jmespath.search(data, expression);
98
106
  const truthy =
@@ -109,8 +117,8 @@ function assertQuery(expression, file) {
109
117
 
110
118
  const JOB_TAG_RE = /<job\s+user="([^"]*)"\s+goal="([^"]*)">/;
111
119
 
112
- function assertCitesJob(jobFile, file) {
113
- const jobContent = readFileSync(jobFile, "utf8");
120
+ function assertCitesJob(jobFile, file, fsSync) {
121
+ const jobContent = fsSync.readFileSync(jobFile, "utf8");
114
122
  const match = JOB_TAG_RE.exec(jobContent);
115
123
  if (!match) {
116
124
  return {
@@ -119,7 +127,7 @@ function assertCitesJob(jobFile, file) {
119
127
  };
120
128
  }
121
129
  const citation = `${match[1]}: ${match[2]}`;
122
- const content = readFileSync(file, "utf8");
130
+ const content = fsSync.readFileSync(file, "utf8");
123
131
  if (content.includes(citation)) return { pass: true };
124
132
  return { pass: false, message: `missing "${citation}"` };
125
133
  }
@@ -0,0 +1,74 @@
1
+ /**
2
+ * `fit-benchmark invariants` — check a single task's invariants against a
3
+ * post-run workdir directory without invoking an agent (P6/P7). Useful for
4
+ * re-checking an agent's output against revised grading material.
5
+ */
6
+
7
+ import { join, resolve } from "node:path";
8
+ import { createServer } from "node:net";
9
+
10
+ import { validateInvariantsRecord } from "../benchmark/result.js";
11
+ import { runInvariants } from "../benchmark/invariants.js";
12
+ import { loadTaskFamily } from "../benchmark/task-family.js";
13
+
14
+ /**
15
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
16
+ * @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
17
+ */
18
+ export async function runBenchmarkInvariantsCommand(ctx) {
19
+ const values = ctx.options;
20
+ const runtime = ctx.deps.runtime;
21
+ const familyInput = values.family;
22
+ if (!familyInput)
23
+ return { ok: false, code: 1, error: "--family is required" };
24
+ const taskId = values.task;
25
+ if (!taskId) return { ok: false, code: 1, error: "--task is required" };
26
+ const workdirArg = values.workdir;
27
+ if (!workdirArg)
28
+ return { ok: false, code: 1, error: "--workdir is required" };
29
+
30
+ const family = await loadTaskFamily(familyInput);
31
+ const task = family.tasks().find((t) => t.id === taskId);
32
+ if (!task)
33
+ return { ok: false, code: 1, error: `task not found in family: ${taskId}` };
34
+
35
+ const runDir = resolve(workdirArg);
36
+ const cwd = join(runDir, "cwd");
37
+ const port = await allocatePort();
38
+
39
+ const invariants = await runInvariants(task, { cwd, port, runDir });
40
+ const record = {
41
+ taskId: task.id,
42
+ invariants,
43
+ exitCode: invariants.exitCode,
44
+ };
45
+ validateInvariantsRecord(record);
46
+
47
+ const line = JSON.stringify(record) + "\n";
48
+ if (values.output) {
49
+ runtime.fsSync.writeFileSync(resolve(values.output), line);
50
+ } else {
51
+ runtime.proc.stdout.write(line);
52
+ }
53
+ return invariants.verdict === "pass"
54
+ ? { ok: true }
55
+ : { ok: false, code: 1, error: "" };
56
+ }
57
+
58
+ function allocatePort() {
59
+ return new Promise((res, rej) => {
60
+ const server = createServer();
61
+ server.unref();
62
+ server.on("error", rej);
63
+ server.listen(0, "127.0.0.1", () => {
64
+ const addr = server.address();
65
+ if (!addr || typeof addr === "string") {
66
+ server.close();
67
+ rej(new Error("failed to allocate port"));
68
+ return;
69
+ }
70
+ const port = addr.port;
71
+ server.close(() => res(port));
72
+ });
73
+ });
74
+ }
@@ -9,24 +9,31 @@ import { resolve } from "node:path";
9
9
  import { aggregate, renderTextReport } from "../benchmark/report.js";
10
10
 
11
11
  /**
12
- * @param {object} values
13
- * @param {string[]} _args
12
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
13
+ * @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
14
14
  */
15
- export async function runBenchmarkReportCommand(values, _args) {
15
+ export async function runBenchmarkReportCommand(ctx) {
16
+ const values = ctx.options;
17
+ const runtime = ctx.deps.runtime;
16
18
  const inputDir = values.input ?? "benchmark-runs";
17
19
  const kRaw = values.k ?? "1,3,5";
18
- const kValues = kRaw.split(",").map((t) => {
19
- const n = Number.parseInt(t.trim(), 10);
20
- if (!Number.isFinite(n) || n < 1) {
21
- throw new Error(
22
- "--k must be a comma-separated list of positive integers",
23
- );
24
- }
25
- return n;
26
- });
20
+ let kValues;
21
+ try {
22
+ kValues = kRaw.split(",").map((t) => {
23
+ const n = Number.parseInt(t.trim(), 10);
24
+ if (!Number.isFinite(n) || n < 1) {
25
+ throw new Error(
26
+ "--k must be a comma-separated list of positive integers",
27
+ );
28
+ }
29
+ return n;
30
+ });
31
+ } catch (err) {
32
+ return { ok: false, code: 1, error: err.message };
33
+ }
27
34
  const format = values.format ?? "json";
28
35
  if (format !== "json" && format !== "text") {
29
- throw new Error("--format must be 'json' or 'text'");
36
+ return { ok: false, code: 1, error: "--format must be 'json' or 'text'" };
30
37
  }
31
38
 
32
39
  const report = await aggregate({
@@ -35,8 +42,9 @@ export async function runBenchmarkReportCommand(values, _args) {
35
42
  includeRuns: format === "text",
36
43
  });
37
44
  if (format === "text") {
38
- process.stdout.write(renderTextReport(report, kValues) + "\n");
45
+ runtime.proc.stdout.write(renderTextReport(report, kValues) + "\n");
39
46
  } else {
40
- process.stdout.write(JSON.stringify(report, null, 2) + "\n");
47
+ runtime.proc.stdout.write(JSON.stringify(report, null, 2) + "\n");
41
48
  }
49
+ return { ok: true };
42
50
  }
@@ -10,30 +10,37 @@ import { createConfig } from "@forwardimpact/libconfig";
10
10
  import { createBenchmarkRunner } from "../benchmark/runner.js";
11
11
 
12
12
  /**
13
- * @param {object} values
14
- * @param {string[]} _args
13
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
14
+ * @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
15
15
  */
16
- export async function runBenchmarkRunCommand(values, _args) {
17
- const opts = parseRunOptions(values);
16
+ export async function runBenchmarkRunCommand(ctx) {
17
+ const values = ctx.options;
18
+ const runtime = ctx.deps.runtime;
19
+ let opts;
20
+ try {
21
+ opts = parseRunOptions(values);
22
+ } catch (err) {
23
+ return { ok: false, code: 1, error: err.message };
24
+ }
18
25
  const config = await createConfig("script", "benchmark");
19
- process.env.ANTHROPIC_API_KEY = await config.anthropicToken();
26
+ runtime.proc.env.ANTHROPIC_API_KEY = await config.anthropicToken();
20
27
 
21
28
  // The Claude Agent SDK spawns a `claude` subprocess that inherits
22
29
  // process.env. NODE_EXTRA_CA_CERTS causes undici (the HTTP client
23
30
  // inside that subprocess) to fail with UND_ERR_INVALID_ARG on
24
31
  // Node 22+, aborting every API call after 10 retries. Strip it
25
32
  // before the SDK loads so the subprocess gets a clean environment.
26
- delete process.env.NODE_EXTRA_CA_CERTS;
33
+ delete runtime.proc.env.NODE_EXTRA_CA_CERTS;
27
34
 
28
35
  const { query } = await import("@anthropic-ai/claude-agent-sdk");
29
36
  const runner = createBenchmarkRunner({ ...opts, query });
30
37
 
31
38
  let anyFail = false;
32
39
  for await (const record of runner.run()) {
33
- process.stdout.write(JSON.stringify(record) + "\n");
40
+ runtime.proc.stdout.write(JSON.stringify(record) + "\n");
34
41
  if (record.verdict !== "pass") anyFail = true;
35
42
  }
36
- process.exit(anyFail ? 1 : 0);
43
+ return anyFail ? { ok: false, code: 1, error: "" } : { ok: true };
37
44
  }
38
45
 
39
46
  function parseRunOptions(values) {
@@ -1,9 +1,16 @@
1
- import { readdirSync, statSync, openSync, readSync, closeSync } from "node:fs";
1
+ import { closeSync, openSync, readSync } from "node:fs";
2
2
  import { join } from "node:path";
3
3
 
4
+ const FIRST_LINE_CAP = 64 * 1024;
5
+
4
6
  /**
5
- * Read the first newline-terminated line of a file. Bounded to 64 KiB
6
- * which is well above any orchestrator envelope.
7
+ * Read the first newline-terminated line of a file, bounded to the first
8
+ * {@link FIRST_LINE_CAP} bytes. Trace `.ndjson` files can be many MB; the
9
+ * Step 2.6 meta header is always small, so a bounded `readSync` avoids
10
+ * loading whole files into memory just to inspect the header. This uses
11
+ * `node:fs` directly because the `runtime.fsSync` surface exposes no
12
+ * positional `openSync`/`readSync` — the file is grandfathered for
13
+ * `import:fs` in `check-ambient-deps.deny.yml` until that seam exists.
7
14
  *
8
15
  * @param {string} path
9
16
  * @returns {string}
@@ -11,11 +18,11 @@ import { join } from "node:path";
11
18
  function readFirstLine(path) {
12
19
  const fd = openSync(path, "r");
13
20
  try {
14
- const buf = Buffer.alloc(65536);
21
+ const buf = Buffer.alloc(FIRST_LINE_CAP);
15
22
  const bytes = readSync(fd, buf, 0, buf.length, 0);
16
- const slice = buf.slice(0, bytes).toString("utf8");
17
- const nl = slice.indexOf("\n");
18
- return nl === -1 ? slice : slice.slice(0, nl);
23
+ const text = buf.toString("utf8", 0, bytes);
24
+ const nl = text.indexOf("\n");
25
+ return nl === -1 ? text : text.slice(0, nl);
19
26
  } finally {
20
27
  closeSync(fd);
21
28
  }
@@ -30,13 +37,14 @@ function readFirstLine(path) {
30
37
  *
31
38
  * @param {string} dir
32
39
  * @param {string} discussionId
40
+ * @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`).
33
41
  * @returns {Array<{path: string, mtimeMs: number}>}
34
42
  */
35
- export function findTracesByDiscussion(dir, discussionId) {
43
+ export function findTracesByDiscussion(dir, discussionId, fsSync) {
36
44
  const matches = [];
37
45
  let entries;
38
46
  try {
39
- entries = readdirSync(dir);
47
+ entries = fsSync.readdirSync(dir);
40
48
  } catch {
41
49
  return [];
42
50
  }
@@ -58,7 +66,7 @@ export function findTracesByDiscussion(dir, discussionId) {
58
66
  const event = parsed.event ?? parsed;
59
67
  if (event?.type !== "meta") continue;
60
68
  if (event.discussion_id !== discussionId) continue;
61
- matches.push({ path, mtimeMs: statSync(path).mtimeMs });
69
+ matches.push({ path, mtimeMs: fsSync.statSync(path).mtimeMs });
62
70
  }
63
71
  matches.sort((a, b) => a.mtimeMs - b.mtimeMs);
64
72
  return matches;
@@ -70,15 +78,18 @@ export function findTracesByDiscussion(dir, discussionId) {
70
78
  * line, ordered by first-event timestamp (file mtime ascending). The
71
79
  * result is usable with `xargs cat` for a chronological merge.
72
80
  *
73
- * @param {object} values
74
- * @param {string[]} args
81
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
82
+ * @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
75
83
  */
76
- export async function runByDiscussionCommand(values, args) {
77
- const [discussionId, traceDirArg] = args;
78
- if (!discussionId) throw new Error("<discussion-id> is required");
79
- const dir = traceDirArg ?? values["trace-dir"] ?? "traces";
80
- const matches = findTracesByDiscussion(dir, discussionId);
84
+ export async function runByDiscussionCommand(ctx) {
85
+ const runtime = ctx.deps.runtime;
86
+ const discussionId = ctx.args["discussion-id"];
87
+ if (!discussionId)
88
+ return { ok: false, code: 1, error: "<discussion-id> is required" };
89
+ const dir = ctx.args["trace-dir"] ?? ctx.options["trace-dir"] ?? "traces";
90
+ const matches = findTracesByDiscussion(dir, discussionId, runtime.fsSync);
81
91
  for (const { path } of matches) {
82
- process.stdout.write(`${path}\n`);
92
+ runtime.proc.stdout.write(`${path}\n`);
83
93
  }
94
+ return { ok: true };
84
95
  }
@@ -1,5 +1,3 @@
1
- import { readFileSync } from "node:fs";
2
-
3
1
  /**
4
2
  * Scan an NDJSON trace and return the last orchestrator summary event,
5
3
  * the first `meta` event's `discussion_id`, and any structured replies
@@ -11,13 +9,14 @@ import { readFileSync } from "node:fs";
11
9
  * its channel semantics.
12
10
  *
13
11
  * @param {string} traceFile
12
+ * @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`).
14
13
  * @returns {{verdict: string, summary: string, replies: object[], trigger?: object, discussionId?: string} | null}
15
14
  */
16
15
  // biome-ignore lint/complexity/noExcessiveCognitiveComplexity: NDJSON scan with malformed-line tolerance + meta/summary dual extraction
17
- function readTraceSummary(traceFile) {
16
+ function readTraceSummary(traceFile, fsSync) {
18
17
  let summary = null;
19
18
  let metaDiscussionId = null;
20
- for (const line of readFileSync(traceFile, "utf8").split("\n")) {
19
+ for (const line of fsSync.readFileSync(traceFile, "utf8").split("\n")) {
21
20
  if (!line.trim()) continue;
22
21
  let record;
23
22
  try {
@@ -40,6 +39,9 @@ function readTraceSummary(traceFile) {
40
39
  ...(record.event.discussion_id && {
41
40
  discussionId: record.event.discussion_id,
42
41
  }),
42
+ ...(typeof record.event.lastActedSeq === "number" && {
43
+ lastActedSeq: record.event.lastActedSeq,
44
+ }),
43
45
  };
44
46
  }
45
47
  }
@@ -64,20 +66,24 @@ function readTraceSummary(traceFile) {
64
66
  * }
65
67
  * ```
66
68
  *
67
- * @param {object} values - Parsed option values from cli.parse()
68
- * @param {string[]} _args - Positional arguments
69
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
70
+ * @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
69
71
  */
70
- export async function runCallbackCommand(values, _args) {
72
+ export async function runCallbackCommand(ctx) {
73
+ const values = ctx.options;
74
+ const runtime = ctx.deps.runtime;
71
75
  const traceFile = values["trace-file"];
72
76
  const callbackUrl = values["callback-url"];
73
77
  const correlationId = values["correlation-id"];
74
78
  const runUrl = values["run-url"] ?? "";
75
79
  const discussionIdOverride = values["discussion-id"] ?? null;
76
80
 
77
- if (!traceFile) throw new Error("--trace-file is required");
78
- if (!callbackUrl) throw new Error("--callback-url is required");
81
+ if (!traceFile)
82
+ return { ok: false, code: 1, error: "--trace-file is required" };
83
+ if (!callbackUrl)
84
+ return { ok: false, code: 1, error: "--callback-url is required" };
79
85
 
80
- const found = readTraceSummary(traceFile) ?? {
86
+ const found = readTraceSummary(traceFile, runtime.fsSync) ?? {
81
87
  verdict: "failed",
82
88
  summary: "Run ended without producing a summary.",
83
89
  replies: [],
@@ -86,10 +92,12 @@ export async function runCallbackCommand(values, _args) {
86
92
  const discussionId = found.discussionId ?? discussionIdOverride ?? null;
87
93
  const payload = {
88
94
  correlation_id: correlationId,
95
+ kind: "terminal",
89
96
  verdict: found.verdict,
90
97
  summary: found.summary,
91
98
  run_url: runUrl,
92
99
  replies: found.replies,
100
+ last_acted_seq: found.lastActedSeq ?? -1,
93
101
  ...(discussionId && { discussion_id: discussionId }),
94
102
  ...(found.trigger && { trigger: found.trigger }),
95
103
  };
@@ -99,6 +107,7 @@ export async function runCallbackCommand(values, _args) {
99
107
  body: JSON.stringify(payload),
100
108
  });
101
109
  if (!res.ok) {
102
- throw new Error(`Callback POST failed: ${res.status}`);
110
+ return { ok: false, code: 1, error: `Callback POST failed: ${res.status}` };
103
111
  }
112
+ return { ok: true };
104
113
  }
@@ -17,10 +17,14 @@ function parseAgentProfiles(raw, cwd, maxTurns) {
17
17
  * Parse and validate discuss command options. Exported so tests can verify
18
18
  * defaults and the legacy-flag clean break.
19
19
  * @param {object} values - Parsed option values
20
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
20
21
  * @returns {object}
21
22
  */
22
- export function parseDiscussOptions(values) {
23
- const { task: taskContent, amend: taskAmend } = resolveTaskContent(values);
23
+ export function parseDiscussOptions(values, runtime) {
24
+ const { task: taskContent, amend: taskAmend } = resolveTaskContent(
25
+ values,
26
+ runtime,
27
+ );
24
28
 
25
29
  const profilesRaw = values["agent-profiles"];
26
30
  const agentCwd = resolve(values["agent-cwd"] ?? ".");
@@ -40,6 +44,9 @@ export function parseDiscussOptions(values) {
40
44
  }
41
45
  }
42
46
 
47
+ const maxLeadTurnsRaw = values["max-lead-turns"] ?? "200";
48
+ const maxLeadTurns = parseInt(maxLeadTurnsRaw, 10);
49
+
43
50
  return {
44
51
  taskContent,
45
52
  taskAmend,
@@ -48,9 +55,13 @@ export function parseDiscussOptions(values) {
48
55
  leadModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
49
56
  agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
50
57
  maxTurns,
58
+ maxLeadTurns,
51
59
  outputPath: values.output,
52
60
  discussionId: values["discussion-id"] ?? null,
53
61
  resumeContext,
62
+ callbackUrl: runtime.proc.env.CALLBACK_URL ?? null,
63
+ inboxUrl: runtime.proc.env.INBOX_URL ?? null,
64
+ correlationId: runtime.proc.env.CORRELATION_ID ?? null,
54
65
  };
55
66
  }
56
67
 
@@ -59,13 +70,14 @@ export function parseDiscussOptions(values) {
59
70
  * semantics, threading `discussion_id` through the trace so multi-run
60
71
  * conversations are queryable as one.
61
72
  *
62
- * @param {object} values - Parsed option values
63
- * @param {string[]} _args - Positional arguments
73
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
74
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
64
75
  */
65
- export async function runDiscussCommand(values, _args) {
66
- const opts = parseDiscussOptions(values);
76
+ export async function runDiscussCommand(ctx) {
77
+ const runtime = ctx.deps.runtime;
78
+ const opts = parseDiscussOptions(ctx.options, runtime);
67
79
 
68
- const redactor = createRedactor();
80
+ const redactor = createRedactor({ runtime });
69
81
 
70
82
  const fileStream = opts.outputPath
71
83
  ? createWriteStream(opts.outputPath)
@@ -73,13 +85,13 @@ export async function runDiscussCommand(values, _args) {
73
85
  const output = fileStream
74
86
  ? createTeeWriter({
75
87
  fileStream,
76
- textStream: process.stdout,
88
+ textStream: runtime.proc.stdout,
77
89
  mode: "supervised",
78
90
  })
79
- : process.stdout;
91
+ : runtime.proc.stdout;
80
92
 
81
93
  if (opts.leadProfile) {
82
- process.env.LIBEVAL_AGENT_PROFILE = opts.leadProfile;
94
+ runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.leadProfile;
83
95
  }
84
96
 
85
97
  const { query } = await import("@anthropic-ai/claude-agent-sdk");
@@ -93,8 +105,13 @@ export async function runDiscussCommand(values, _args) {
93
105
  query,
94
106
  output,
95
107
  maxTurns: opts.maxTurns,
108
+ maxLeadTurns: opts.maxLeadTurns,
96
109
  taskAmend: opts.taskAmend,
97
110
  redactor,
111
+ callbackUrl: opts.callbackUrl,
112
+ inboxUrl: opts.inboxUrl,
113
+ correlationId: opts.correlationId,
114
+ runtime,
98
115
  });
99
116
 
100
117
  const result = await discusser.run(opts.taskContent);
@@ -104,5 +121,5 @@ export async function runDiscussCommand(values, _args) {
104
121
  await new Promise((r) => fileStream.end(r));
105
122
  }
106
123
 
107
- process.exit(result.success ? 0 : 1);
124
+ return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
108
125
  }
@@ -23,10 +23,14 @@ function parseAgentProfiles(raw, cwd, maxTurns) {
23
23
  * coverage of the `--max-turns` → per-agent threading contract; not part
24
24
  * of the package's public API.
25
25
  * @param {object} values - Parsed option values
26
+ * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
26
27
  * @returns {object} Parsed options
27
28
  */
28
- export function parseFacilitateOptions(values) {
29
- const { task: taskContent, amend: taskAmend } = resolveTaskContent(values);
29
+ export function parseFacilitateOptions(values, runtime) {
30
+ const { task: taskContent, amend: taskAmend } = resolveTaskContent(
31
+ values,
32
+ runtime,
33
+ );
30
34
 
31
35
  const profilesRaw = values["agent-profiles"];
32
36
  if (!profilesRaw) throw new Error("--agent-profiles is required");
@@ -59,16 +63,17 @@ export function parseFacilitateOptions(values) {
59
63
  *
60
64
  * Usage: fit-eval facilitate [options]
61
65
  *
62
- * @param {object} values - Parsed option values from cli.parse()
63
- * @param {string[]} _args - Positional arguments
66
+ * @param {import("@forwardimpact/libcli").InvocationContext} ctx
67
+ * @returns {Promise<{ok: boolean, code?: number, error?: string}>}
64
68
  */
65
- export async function runFacilitateCommand(values, _args) {
66
- const opts = parseFacilitateOptions(values);
69
+ export async function runFacilitateCommand(ctx) {
70
+ const runtime = ctx.deps.runtime;
71
+ const opts = parseFacilitateOptions(ctx.options, runtime);
67
72
 
68
73
  // Build the redactor as the first observable side-effect after option
69
74
  // parsing — the env snapshot must freeze BEFORE any in-process
70
- // process.env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
71
- const redactor = createRedactor();
75
+ // env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
76
+ const redactor = createRedactor({ runtime });
72
77
 
73
78
  const fileStream = opts.outputPath
74
79
  ? createWriteStream(opts.outputPath)
@@ -76,13 +81,13 @@ export async function runFacilitateCommand(values, _args) {
76
81
  const output = fileStream
77
82
  ? createTeeWriter({
78
83
  fileStream,
79
- textStream: process.stdout,
84
+ textStream: runtime.proc.stdout,
80
85
  mode: "supervised",
81
86
  })
82
- : process.stdout;
87
+ : runtime.proc.stdout;
83
88
 
84
89
  if (opts.facilitatorProfile) {
85
- process.env.LIBEVAL_AGENT_PROFILE = opts.facilitatorProfile;
90
+ runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.facilitatorProfile;
86
91
  }
87
92
 
88
93
  const { query } = await import("@anthropic-ai/claude-agent-sdk");
@@ -97,6 +102,7 @@ export async function runFacilitateCommand(values, _args) {
97
102
  facilitatorProfile: opts.facilitatorProfile,
98
103
  taskAmend: opts.taskAmend,
99
104
  redactor,
105
+ runtime,
100
106
  });
101
107
 
102
108
  const result = await facilitator.run(opts.taskContent);
@@ -106,5 +112,5 @@ export async function runFacilitateCommand(values, _args) {
106
112
  await new Promise((r) => fileStream.end(r));
107
113
  }
108
114
 
109
- process.exit(result.success ? 0 : 1);
115
+ return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
110
116
  }