@forwardimpact/libeval 0.1.50 → 0.1.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/fit-benchmark.js +26 -27
- package/bin/fit-eval.js +36 -30
- package/bin/fit-trace.js +83 -57
- package/package.json +1 -1
- package/src/agent-runner.js +20 -12
- package/src/benchmark/apm-installer.js +48 -44
- package/src/benchmark/env-loader.js +35 -23
- package/src/benchmark/invariants.js +128 -0
- package/src/benchmark/judge.js +18 -19
- package/src/benchmark/npm-installer.js +33 -33
- package/src/benchmark/report.js +40 -26
- package/src/benchmark/result.js +11 -11
- package/src/benchmark/runner.js +90 -46
- package/src/benchmark/task-family.js +78 -65
- package/src/benchmark/workdir.js +100 -93
- package/src/commands/assert.js +30 -22
- package/src/commands/benchmark-invariants.js +74 -0
- package/src/commands/benchmark-report.js +24 -15
- package/src/commands/benchmark-run.js +16 -9
- package/src/commands/by-discussion.js +33 -23
- package/src/commands/callback.js +20 -11
- package/src/commands/discuss.js +31 -13
- package/src/commands/facilitate.js +21 -14
- package/src/commands/output.js +15 -13
- package/src/commands/run.js +28 -14
- package/src/commands/supervise.js +29 -19
- package/src/commands/task-input.js +10 -5
- package/src/commands/tee.js +24 -9
- package/src/commands/trace.js +181 -99
- package/src/discuss-tools.js +48 -2
- package/src/discusser.js +53 -2
- package/src/events/github.js +27 -5
- package/src/facilitator.js +4 -0
- package/src/inbox-poller.js +84 -0
- package/src/judge.js +4 -1
- package/src/message-bus.js +6 -0
- package/src/orchestration-loop.js +14 -4
- package/src/orchestration-toolkit.js +14 -0
- package/src/profile-prompt.js +22 -9
- package/src/redaction.js +31 -9
- package/src/reply-emitter.js +47 -0
- package/src/supervisor.js +4 -0
- package/src/tee-writer.js +4 -2
- package/src/trace-collector.js +9 -2
- package/src/trace-github.js +47 -27
- package/src/benchmark/scorer.js +0 -138
- package/src/commands/benchmark-score.js +0 -68
|
@@ -0,0 +1,74 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* `fit-benchmark invariants` — check a single task's invariants against a
|
|
3
|
+
* post-run workdir directory without invoking an agent (P6/P7). Useful for
|
|
4
|
+
* re-checking an agent's output against revised grading material.
|
|
5
|
+
*/
|
|
6
|
+
|
|
7
|
+
import { join, resolve } from "node:path";
|
|
8
|
+
import { createServer } from "node:net";
|
|
9
|
+
|
|
10
|
+
import { validateInvariantsRecord } from "../benchmark/result.js";
|
|
11
|
+
import { runInvariants } from "../benchmark/invariants.js";
|
|
12
|
+
import { loadTaskFamily } from "../benchmark/task-family.js";
|
|
13
|
+
|
|
14
|
+
/**
|
|
15
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
16
|
+
* @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
|
|
17
|
+
*/
|
|
18
|
+
export async function runBenchmarkInvariantsCommand(ctx) {
|
|
19
|
+
const values = ctx.options;
|
|
20
|
+
const runtime = ctx.deps.runtime;
|
|
21
|
+
const familyInput = values.family;
|
|
22
|
+
if (!familyInput)
|
|
23
|
+
return { ok: false, code: 1, error: "--family is required" };
|
|
24
|
+
const taskId = values.task;
|
|
25
|
+
if (!taskId) return { ok: false, code: 1, error: "--task is required" };
|
|
26
|
+
const workdirArg = values.workdir;
|
|
27
|
+
if (!workdirArg)
|
|
28
|
+
return { ok: false, code: 1, error: "--workdir is required" };
|
|
29
|
+
|
|
30
|
+
const family = await loadTaskFamily(familyInput, runtime);
|
|
31
|
+
const task = family.tasks().find((t) => t.id === taskId);
|
|
32
|
+
if (!task)
|
|
33
|
+
return { ok: false, code: 1, error: `task not found in family: ${taskId}` };
|
|
34
|
+
|
|
35
|
+
const runDir = resolve(workdirArg);
|
|
36
|
+
const cwd = join(runDir, "cwd");
|
|
37
|
+
const port = await allocatePort();
|
|
38
|
+
|
|
39
|
+
const invariants = await runInvariants(task, { cwd, port, runDir }, runtime);
|
|
40
|
+
const record = {
|
|
41
|
+
taskId: task.id,
|
|
42
|
+
invariants,
|
|
43
|
+
exitCode: invariants.exitCode,
|
|
44
|
+
};
|
|
45
|
+
validateInvariantsRecord(record);
|
|
46
|
+
|
|
47
|
+
const line = JSON.stringify(record) + "\n";
|
|
48
|
+
if (values.output) {
|
|
49
|
+
runtime.fsSync.writeFileSync(resolve(values.output), line);
|
|
50
|
+
} else {
|
|
51
|
+
runtime.proc.stdout.write(line);
|
|
52
|
+
}
|
|
53
|
+
return invariants.verdict === "pass"
|
|
54
|
+
? { ok: true }
|
|
55
|
+
: { ok: false, code: 1, error: "" };
|
|
56
|
+
}
|
|
57
|
+
|
|
58
|
+
function allocatePort() {
|
|
59
|
+
return new Promise((res, rej) => {
|
|
60
|
+
const server = createServer();
|
|
61
|
+
server.unref();
|
|
62
|
+
server.on("error", rej);
|
|
63
|
+
server.listen(0, "127.0.0.1", () => {
|
|
64
|
+
const addr = server.address();
|
|
65
|
+
if (!addr || typeof addr === "string") {
|
|
66
|
+
server.close();
|
|
67
|
+
rej(new Error("failed to allocate port"));
|
|
68
|
+
return;
|
|
69
|
+
}
|
|
70
|
+
const port = addr.port;
|
|
71
|
+
server.close(() => res(port));
|
|
72
|
+
});
|
|
73
|
+
});
|
|
74
|
+
}
|
|
@@ -9,34 +9,43 @@ import { resolve } from "node:path";
|
|
|
9
9
|
import { aggregate, renderTextReport } from "../benchmark/report.js";
|
|
10
10
|
|
|
11
11
|
/**
|
|
12
|
-
* @param {
|
|
13
|
-
* @
|
|
12
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
13
|
+
* @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
|
|
14
14
|
*/
|
|
15
|
-
export async function runBenchmarkReportCommand(
|
|
15
|
+
export async function runBenchmarkReportCommand(ctx) {
|
|
16
|
+
const values = ctx.options;
|
|
17
|
+
const runtime = ctx.deps.runtime;
|
|
16
18
|
const inputDir = values.input ?? "benchmark-runs";
|
|
17
19
|
const kRaw = values.k ?? "1,3,5";
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
|
|
20
|
+
let kValues;
|
|
21
|
+
try {
|
|
22
|
+
kValues = kRaw.split(",").map((t) => {
|
|
23
|
+
const n = Number.parseInt(t.trim(), 10);
|
|
24
|
+
if (!Number.isFinite(n) || n < 1) {
|
|
25
|
+
throw new Error(
|
|
26
|
+
"--k must be a comma-separated list of positive integers",
|
|
27
|
+
);
|
|
28
|
+
}
|
|
29
|
+
return n;
|
|
30
|
+
});
|
|
31
|
+
} catch (err) {
|
|
32
|
+
return { ok: false, code: 1, error: err.message };
|
|
33
|
+
}
|
|
27
34
|
const format = values.format ?? "json";
|
|
28
35
|
if (format !== "json" && format !== "text") {
|
|
29
|
-
|
|
36
|
+
return { ok: false, code: 1, error: "--format must be 'json' or 'text'" };
|
|
30
37
|
}
|
|
31
38
|
|
|
32
39
|
const report = await aggregate({
|
|
33
40
|
inputDir: resolve(inputDir),
|
|
34
41
|
kValues,
|
|
35
42
|
includeRuns: format === "text",
|
|
43
|
+
runtime,
|
|
36
44
|
});
|
|
37
45
|
if (format === "text") {
|
|
38
|
-
|
|
46
|
+
runtime.proc.stdout.write(renderTextReport(report, kValues) + "\n");
|
|
39
47
|
} else {
|
|
40
|
-
|
|
48
|
+
runtime.proc.stdout.write(JSON.stringify(report, null, 2) + "\n");
|
|
41
49
|
}
|
|
50
|
+
return { ok: true };
|
|
42
51
|
}
|
|
@@ -10,30 +10,37 @@ import { createConfig } from "@forwardimpact/libconfig";
|
|
|
10
10
|
import { createBenchmarkRunner } from "../benchmark/runner.js";
|
|
11
11
|
|
|
12
12
|
/**
|
|
13
|
-
* @param {
|
|
14
|
-
* @
|
|
13
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
14
|
+
* @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
|
|
15
15
|
*/
|
|
16
|
-
export async function runBenchmarkRunCommand(
|
|
17
|
-
const
|
|
16
|
+
export async function runBenchmarkRunCommand(ctx) {
|
|
17
|
+
const values = ctx.options;
|
|
18
|
+
const runtime = ctx.deps.runtime;
|
|
19
|
+
let opts;
|
|
20
|
+
try {
|
|
21
|
+
opts = parseRunOptions(values);
|
|
22
|
+
} catch (err) {
|
|
23
|
+
return { ok: false, code: 1, error: err.message };
|
|
24
|
+
}
|
|
18
25
|
const config = await createConfig("script", "benchmark");
|
|
19
|
-
|
|
26
|
+
runtime.proc.env.ANTHROPIC_API_KEY = await config.anthropicToken();
|
|
20
27
|
|
|
21
28
|
// The Claude Agent SDK spawns a `claude` subprocess that inherits
|
|
22
29
|
// process.env. NODE_EXTRA_CA_CERTS causes undici (the HTTP client
|
|
23
30
|
// inside that subprocess) to fail with UND_ERR_INVALID_ARG on
|
|
24
31
|
// Node 22+, aborting every API call after 10 retries. Strip it
|
|
25
32
|
// before the SDK loads so the subprocess gets a clean environment.
|
|
26
|
-
delete
|
|
33
|
+
delete runtime.proc.env.NODE_EXTRA_CA_CERTS;
|
|
27
34
|
|
|
28
35
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
29
|
-
const runner = createBenchmarkRunner({ ...opts, query });
|
|
36
|
+
const runner = createBenchmarkRunner({ ...opts, query, runtime });
|
|
30
37
|
|
|
31
38
|
let anyFail = false;
|
|
32
39
|
for await (const record of runner.run()) {
|
|
33
|
-
|
|
40
|
+
runtime.proc.stdout.write(JSON.stringify(record) + "\n");
|
|
34
41
|
if (record.verdict !== "pass") anyFail = true;
|
|
35
42
|
}
|
|
36
|
-
|
|
43
|
+
return anyFail ? { ok: false, code: 1, error: "" } : { ok: true };
|
|
37
44
|
}
|
|
38
45
|
|
|
39
46
|
function parseRunOptions(values) {
|
|
@@ -1,23 +1,29 @@
|
|
|
1
|
-
import { readdirSync, statSync, openSync, readSync, closeSync } from "node:fs";
|
|
2
1
|
import { join } from "node:path";
|
|
3
2
|
|
|
3
|
+
const FIRST_LINE_CAP = 64 * 1024;
|
|
4
|
+
|
|
4
5
|
/**
|
|
5
|
-
* Read the first newline-terminated line of a file
|
|
6
|
-
*
|
|
6
|
+
* Read the first newline-terminated line of a file, bounded to the first
|
|
7
|
+
* {@link FIRST_LINE_CAP} bytes. Trace `.ndjson` files can be many MB; the
|
|
8
|
+
* Step 2.6 meta header is always small, so a bounded positional read avoids
|
|
9
|
+
* loading whole files into memory just to inspect the header. The positional
|
|
10
|
+
* `openSync`/`readSync`/`closeSync` trio is read off the injected
|
|
11
|
+
* `runtime.fsSync` surface.
|
|
7
12
|
*
|
|
13
|
+
* @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`).
|
|
8
14
|
* @param {string} path
|
|
9
15
|
* @returns {string}
|
|
10
16
|
*/
|
|
11
|
-
function readFirstLine(path) {
|
|
12
|
-
const fd = openSync(path, "r");
|
|
17
|
+
function readFirstLine(fsSync, path) {
|
|
18
|
+
const fd = fsSync.openSync(path, "r");
|
|
13
19
|
try {
|
|
14
|
-
const buf = Buffer.alloc(
|
|
15
|
-
const bytes = readSync(fd, buf, 0, buf.length, 0);
|
|
16
|
-
const
|
|
17
|
-
const nl =
|
|
18
|
-
return nl === -1 ?
|
|
20
|
+
const buf = Buffer.alloc(FIRST_LINE_CAP);
|
|
21
|
+
const bytes = fsSync.readSync(fd, buf, 0, buf.length, 0);
|
|
22
|
+
const text = buf.toString("utf8", 0, bytes);
|
|
23
|
+
const nl = text.indexOf("\n");
|
|
24
|
+
return nl === -1 ? text : text.slice(0, nl);
|
|
19
25
|
} finally {
|
|
20
|
-
closeSync(fd);
|
|
26
|
+
fsSync.closeSync(fd);
|
|
21
27
|
}
|
|
22
28
|
}
|
|
23
29
|
|
|
@@ -30,13 +36,14 @@ function readFirstLine(path) {
|
|
|
30
36
|
*
|
|
31
37
|
* @param {string} dir
|
|
32
38
|
* @param {string} discussionId
|
|
39
|
+
* @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`).
|
|
33
40
|
* @returns {Array<{path: string, mtimeMs: number}>}
|
|
34
41
|
*/
|
|
35
|
-
export function findTracesByDiscussion(dir, discussionId) {
|
|
42
|
+
export function findTracesByDiscussion(dir, discussionId, fsSync) {
|
|
36
43
|
const matches = [];
|
|
37
44
|
let entries;
|
|
38
45
|
try {
|
|
39
|
-
entries = readdirSync(dir);
|
|
46
|
+
entries = fsSync.readdirSync(dir);
|
|
40
47
|
} catch {
|
|
41
48
|
return [];
|
|
42
49
|
}
|
|
@@ -45,7 +52,7 @@ export function findTracesByDiscussion(dir, discussionId) {
|
|
|
45
52
|
const path = join(dir, entry);
|
|
46
53
|
let firstLine;
|
|
47
54
|
try {
|
|
48
|
-
firstLine = readFirstLine(path);
|
|
55
|
+
firstLine = readFirstLine(fsSync, path);
|
|
49
56
|
} catch {
|
|
50
57
|
continue;
|
|
51
58
|
}
|
|
@@ -58,7 +65,7 @@ export function findTracesByDiscussion(dir, discussionId) {
|
|
|
58
65
|
const event = parsed.event ?? parsed;
|
|
59
66
|
if (event?.type !== "meta") continue;
|
|
60
67
|
if (event.discussion_id !== discussionId) continue;
|
|
61
|
-
matches.push({ path, mtimeMs: statSync(path).mtimeMs });
|
|
68
|
+
matches.push({ path, mtimeMs: fsSync.statSync(path).mtimeMs });
|
|
62
69
|
}
|
|
63
70
|
matches.sort((a, b) => a.mtimeMs - b.mtimeMs);
|
|
64
71
|
return matches;
|
|
@@ -70,15 +77,18 @@ export function findTracesByDiscussion(dir, discussionId) {
|
|
|
70
77
|
* line, ordered by first-event timestamp (file mtime ascending). The
|
|
71
78
|
* result is usable with `xargs cat` for a chronological merge.
|
|
72
79
|
*
|
|
73
|
-
* @param {
|
|
74
|
-
* @
|
|
80
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
81
|
+
* @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
|
|
75
82
|
*/
|
|
76
|
-
export async function runByDiscussionCommand(
|
|
77
|
-
const
|
|
78
|
-
|
|
79
|
-
|
|
80
|
-
|
|
83
|
+
export async function runByDiscussionCommand(ctx) {
|
|
84
|
+
const runtime = ctx.deps.runtime;
|
|
85
|
+
const discussionId = ctx.args["discussion-id"];
|
|
86
|
+
if (!discussionId)
|
|
87
|
+
return { ok: false, code: 1, error: "<discussion-id> is required" };
|
|
88
|
+
const dir = ctx.args["trace-dir"] ?? ctx.options["trace-dir"] ?? "traces";
|
|
89
|
+
const matches = findTracesByDiscussion(dir, discussionId, runtime.fsSync);
|
|
81
90
|
for (const { path } of matches) {
|
|
82
|
-
|
|
91
|
+
runtime.proc.stdout.write(`${path}\n`);
|
|
83
92
|
}
|
|
93
|
+
return { ok: true };
|
|
84
94
|
}
|
package/src/commands/callback.js
CHANGED
|
@@ -1,5 +1,3 @@
|
|
|
1
|
-
import { readFileSync } from "node:fs";
|
|
2
|
-
|
|
3
1
|
/**
|
|
4
2
|
* Scan an NDJSON trace and return the last orchestrator summary event,
|
|
5
3
|
* the first `meta` event's `discussion_id`, and any structured replies
|
|
@@ -11,13 +9,14 @@ import { readFileSync } from "node:fs";
|
|
|
11
9
|
* its channel semantics.
|
|
12
10
|
*
|
|
13
11
|
* @param {string} traceFile
|
|
12
|
+
* @param {object} fsSync - Sync filesystem surface (`runtime.fsSync`).
|
|
14
13
|
* @returns {{verdict: string, summary: string, replies: object[], trigger?: object, discussionId?: string} | null}
|
|
15
14
|
*/
|
|
16
15
|
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: NDJSON scan with malformed-line tolerance + meta/summary dual extraction
|
|
17
|
-
function readTraceSummary(traceFile) {
|
|
16
|
+
function readTraceSummary(traceFile, fsSync) {
|
|
18
17
|
let summary = null;
|
|
19
18
|
let metaDiscussionId = null;
|
|
20
|
-
for (const line of readFileSync(traceFile, "utf8").split("\n")) {
|
|
19
|
+
for (const line of fsSync.readFileSync(traceFile, "utf8").split("\n")) {
|
|
21
20
|
if (!line.trim()) continue;
|
|
22
21
|
let record;
|
|
23
22
|
try {
|
|
@@ -40,6 +39,9 @@ function readTraceSummary(traceFile) {
|
|
|
40
39
|
...(record.event.discussion_id && {
|
|
41
40
|
discussionId: record.event.discussion_id,
|
|
42
41
|
}),
|
|
42
|
+
...(typeof record.event.lastActedSeq === "number" && {
|
|
43
|
+
lastActedSeq: record.event.lastActedSeq,
|
|
44
|
+
}),
|
|
43
45
|
};
|
|
44
46
|
}
|
|
45
47
|
}
|
|
@@ -64,20 +66,24 @@ function readTraceSummary(traceFile) {
|
|
|
64
66
|
* }
|
|
65
67
|
* ```
|
|
66
68
|
*
|
|
67
|
-
* @param {
|
|
68
|
-
* @
|
|
69
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
70
|
+
* @returns {Promise<{ok: true} | {ok: false, code: number, error: string}>}
|
|
69
71
|
*/
|
|
70
|
-
export async function runCallbackCommand(
|
|
72
|
+
export async function runCallbackCommand(ctx) {
|
|
73
|
+
const values = ctx.options;
|
|
74
|
+
const runtime = ctx.deps.runtime;
|
|
71
75
|
const traceFile = values["trace-file"];
|
|
72
76
|
const callbackUrl = values["callback-url"];
|
|
73
77
|
const correlationId = values["correlation-id"];
|
|
74
78
|
const runUrl = values["run-url"] ?? "";
|
|
75
79
|
const discussionIdOverride = values["discussion-id"] ?? null;
|
|
76
80
|
|
|
77
|
-
if (!traceFile)
|
|
78
|
-
|
|
81
|
+
if (!traceFile)
|
|
82
|
+
return { ok: false, code: 1, error: "--trace-file is required" };
|
|
83
|
+
if (!callbackUrl)
|
|
84
|
+
return { ok: false, code: 1, error: "--callback-url is required" };
|
|
79
85
|
|
|
80
|
-
const found = readTraceSummary(traceFile) ?? {
|
|
86
|
+
const found = readTraceSummary(traceFile, runtime.fsSync) ?? {
|
|
81
87
|
verdict: "failed",
|
|
82
88
|
summary: "Run ended without producing a summary.",
|
|
83
89
|
replies: [],
|
|
@@ -86,10 +92,12 @@ export async function runCallbackCommand(values, _args) {
|
|
|
86
92
|
const discussionId = found.discussionId ?? discussionIdOverride ?? null;
|
|
87
93
|
const payload = {
|
|
88
94
|
correlation_id: correlationId,
|
|
95
|
+
kind: "terminal",
|
|
89
96
|
verdict: found.verdict,
|
|
90
97
|
summary: found.summary,
|
|
91
98
|
run_url: runUrl,
|
|
92
99
|
replies: found.replies,
|
|
100
|
+
last_acted_seq: found.lastActedSeq ?? -1,
|
|
93
101
|
...(discussionId && { discussion_id: discussionId }),
|
|
94
102
|
...(found.trigger && { trigger: found.trigger }),
|
|
95
103
|
};
|
|
@@ -99,6 +107,7 @@ export async function runCallbackCommand(values, _args) {
|
|
|
99
107
|
body: JSON.stringify(payload),
|
|
100
108
|
});
|
|
101
109
|
if (!res.ok) {
|
|
102
|
-
|
|
110
|
+
return { ok: false, code: 1, error: `Callback POST failed: ${res.status}` };
|
|
103
111
|
}
|
|
112
|
+
return { ok: true };
|
|
104
113
|
}
|
package/src/commands/discuss.js
CHANGED
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { createWriteStream } from "node:fs";
|
|
2
1
|
import { resolve } from "node:path";
|
|
2
|
+
import { isoTimestamp } from "@forwardimpact/libutil";
|
|
3
3
|
import { createDiscusser } from "../discusser.js";
|
|
4
4
|
import { createRedactor } from "../redaction.js";
|
|
5
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
@@ -17,10 +17,14 @@ function parseAgentProfiles(raw, cwd, maxTurns) {
|
|
|
17
17
|
* Parse and validate discuss command options. Exported so tests can verify
|
|
18
18
|
* defaults and the legacy-flag clean break.
|
|
19
19
|
* @param {object} values - Parsed option values
|
|
20
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
20
21
|
* @returns {object}
|
|
21
22
|
*/
|
|
22
|
-
export function parseDiscussOptions(values) {
|
|
23
|
-
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
23
|
+
export function parseDiscussOptions(values, runtime) {
|
|
24
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
25
|
+
values,
|
|
26
|
+
runtime,
|
|
27
|
+
);
|
|
24
28
|
|
|
25
29
|
const profilesRaw = values["agent-profiles"];
|
|
26
30
|
const agentCwd = resolve(values["agent-cwd"] ?? ".");
|
|
@@ -40,6 +44,9 @@ export function parseDiscussOptions(values) {
|
|
|
40
44
|
}
|
|
41
45
|
}
|
|
42
46
|
|
|
47
|
+
const maxLeadTurnsRaw = values["max-lead-turns"] ?? "200";
|
|
48
|
+
const maxLeadTurns = parseInt(maxLeadTurnsRaw, 10);
|
|
49
|
+
|
|
43
50
|
return {
|
|
44
51
|
taskContent,
|
|
45
52
|
taskAmend,
|
|
@@ -48,9 +55,13 @@ export function parseDiscussOptions(values) {
|
|
|
48
55
|
leadModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
|
|
49
56
|
agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
|
|
50
57
|
maxTurns,
|
|
58
|
+
maxLeadTurns,
|
|
51
59
|
outputPath: values.output,
|
|
52
60
|
discussionId: values["discussion-id"] ?? null,
|
|
53
61
|
resumeContext,
|
|
62
|
+
callbackUrl: runtime.proc.env.CALLBACK_URL ?? null,
|
|
63
|
+
inboxUrl: runtime.proc.env.INBOX_URL ?? null,
|
|
64
|
+
correlationId: runtime.proc.env.CORRELATION_ID ?? null,
|
|
54
65
|
};
|
|
55
66
|
}
|
|
56
67
|
|
|
@@ -59,27 +70,29 @@ export function parseDiscussOptions(values) {
|
|
|
59
70
|
* semantics, threading `discussion_id` through the trace so multi-run
|
|
60
71
|
* conversations are queryable as one.
|
|
61
72
|
*
|
|
62
|
-
* @param {
|
|
63
|
-
* @
|
|
73
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
74
|
+
* @returns {Promise<{ok: boolean, code?: number, error?: string}>}
|
|
64
75
|
*/
|
|
65
|
-
export async function runDiscussCommand(
|
|
66
|
-
const
|
|
76
|
+
export async function runDiscussCommand(ctx) {
|
|
77
|
+
const runtime = ctx.deps.runtime;
|
|
78
|
+
const opts = parseDiscussOptions(ctx.options, runtime);
|
|
67
79
|
|
|
68
|
-
const redactor = createRedactor();
|
|
80
|
+
const redactor = createRedactor({ runtime });
|
|
69
81
|
|
|
70
82
|
const fileStream = opts.outputPath
|
|
71
|
-
? createWriteStream(opts.outputPath)
|
|
83
|
+
? runtime.fs.createWriteStream(opts.outputPath)
|
|
72
84
|
: null;
|
|
73
85
|
const output = fileStream
|
|
74
86
|
? createTeeWriter({
|
|
75
87
|
fileStream,
|
|
76
|
-
textStream:
|
|
88
|
+
textStream: runtime.proc.stdout,
|
|
77
89
|
mode: "supervised",
|
|
90
|
+
now: () => isoTimestamp(runtime.clock.now()),
|
|
78
91
|
})
|
|
79
|
-
:
|
|
92
|
+
: runtime.proc.stdout;
|
|
80
93
|
|
|
81
94
|
if (opts.leadProfile) {
|
|
82
|
-
|
|
95
|
+
runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.leadProfile;
|
|
83
96
|
}
|
|
84
97
|
|
|
85
98
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
@@ -93,8 +106,13 @@ export async function runDiscussCommand(values, _args) {
|
|
|
93
106
|
query,
|
|
94
107
|
output,
|
|
95
108
|
maxTurns: opts.maxTurns,
|
|
109
|
+
maxLeadTurns: opts.maxLeadTurns,
|
|
96
110
|
taskAmend: opts.taskAmend,
|
|
97
111
|
redactor,
|
|
112
|
+
callbackUrl: opts.callbackUrl,
|
|
113
|
+
inboxUrl: opts.inboxUrl,
|
|
114
|
+
correlationId: opts.correlationId,
|
|
115
|
+
runtime,
|
|
98
116
|
});
|
|
99
117
|
|
|
100
118
|
const result = await discusser.run(opts.taskContent);
|
|
@@ -104,5 +122,5 @@ export async function runDiscussCommand(values, _args) {
|
|
|
104
122
|
await new Promise((r) => fileStream.end(r));
|
|
105
123
|
}
|
|
106
124
|
|
|
107
|
-
|
|
125
|
+
return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
|
|
108
126
|
}
|
|
@@ -1,5 +1,5 @@
|
|
|
1
|
-
import { createWriteStream } from "node:fs";
|
|
2
1
|
import { resolve } from "node:path";
|
|
2
|
+
import { isoTimestamp } from "@forwardimpact/libutil";
|
|
3
3
|
import { createFacilitator } from "../facilitator.js";
|
|
4
4
|
import { createRedactor } from "../redaction.js";
|
|
5
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
@@ -23,10 +23,14 @@ function parseAgentProfiles(raw, cwd, maxTurns) {
|
|
|
23
23
|
* coverage of the `--max-turns` → per-agent threading contract; not part
|
|
24
24
|
* of the package's public API.
|
|
25
25
|
* @param {object} values - Parsed option values
|
|
26
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
26
27
|
* @returns {object} Parsed options
|
|
27
28
|
*/
|
|
28
|
-
export function parseFacilitateOptions(values) {
|
|
29
|
-
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
29
|
+
export function parseFacilitateOptions(values, runtime) {
|
|
30
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
31
|
+
values,
|
|
32
|
+
runtime,
|
|
33
|
+
);
|
|
30
34
|
|
|
31
35
|
const profilesRaw = values["agent-profiles"];
|
|
32
36
|
if (!profilesRaw) throw new Error("--agent-profiles is required");
|
|
@@ -59,30 +63,32 @@ export function parseFacilitateOptions(values) {
|
|
|
59
63
|
*
|
|
60
64
|
* Usage: fit-eval facilitate [options]
|
|
61
65
|
*
|
|
62
|
-
* @param {
|
|
63
|
-
* @
|
|
66
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
67
|
+
* @returns {Promise<{ok: boolean, code?: number, error?: string}>}
|
|
64
68
|
*/
|
|
65
|
-
export async function runFacilitateCommand(
|
|
66
|
-
const
|
|
69
|
+
export async function runFacilitateCommand(ctx) {
|
|
70
|
+
const runtime = ctx.deps.runtime;
|
|
71
|
+
const opts = parseFacilitateOptions(ctx.options, runtime);
|
|
67
72
|
|
|
68
73
|
// Build the redactor as the first observable side-effect after option
|
|
69
74
|
// parsing — the env snapshot must freeze BEFORE any in-process
|
|
70
|
-
//
|
|
71
|
-
const redactor = createRedactor();
|
|
75
|
+
// env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
|
|
76
|
+
const redactor = createRedactor({ runtime });
|
|
72
77
|
|
|
73
78
|
const fileStream = opts.outputPath
|
|
74
|
-
? createWriteStream(opts.outputPath)
|
|
79
|
+
? runtime.fs.createWriteStream(opts.outputPath)
|
|
75
80
|
: null;
|
|
76
81
|
const output = fileStream
|
|
77
82
|
? createTeeWriter({
|
|
78
83
|
fileStream,
|
|
79
|
-
textStream:
|
|
84
|
+
textStream: runtime.proc.stdout,
|
|
80
85
|
mode: "supervised",
|
|
86
|
+
now: () => isoTimestamp(runtime.clock.now()),
|
|
81
87
|
})
|
|
82
|
-
:
|
|
88
|
+
: runtime.proc.stdout;
|
|
83
89
|
|
|
84
90
|
if (opts.facilitatorProfile) {
|
|
85
|
-
|
|
91
|
+
runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.facilitatorProfile;
|
|
86
92
|
}
|
|
87
93
|
|
|
88
94
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
@@ -97,6 +103,7 @@ export async function runFacilitateCommand(values, _args) {
|
|
|
97
103
|
facilitatorProfile: opts.facilitatorProfile,
|
|
98
104
|
taskAmend: opts.taskAmend,
|
|
99
105
|
redactor,
|
|
106
|
+
runtime,
|
|
100
107
|
});
|
|
101
108
|
|
|
102
109
|
const result = await facilitator.run(opts.taskContent);
|
|
@@ -106,5 +113,5 @@ export async function runFacilitateCommand(values, _args) {
|
|
|
106
113
|
await new Promise((r) => fileStream.end(r));
|
|
107
114
|
}
|
|
108
115
|
|
|
109
|
-
|
|
116
|
+
return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
|
|
110
117
|
}
|
package/src/commands/output.js
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
1
|
+
import { isoTimestamp } from "@forwardimpact/libutil";
|
|
1
2
|
import { createTraceCollector } from "@forwardimpact/libeval";
|
|
2
3
|
|
|
3
4
|
/**
|
|
@@ -6,29 +7,30 @@ import { createTraceCollector } from "@forwardimpact/libeval";
|
|
|
6
7
|
*
|
|
7
8
|
* Usage: fit-eval output [--format=json|text] < trace.ndjson
|
|
8
9
|
*
|
|
9
|
-
* @param {
|
|
10
|
-
* @
|
|
10
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
11
|
+
* @returns {Promise<{ok: true}>}
|
|
11
12
|
*/
|
|
12
|
-
export async function runOutputCommand(
|
|
13
|
+
export async function runOutputCommand(ctx) {
|
|
14
|
+
const values = ctx.options;
|
|
15
|
+
const runtime = ctx.deps.runtime;
|
|
13
16
|
const format =
|
|
14
17
|
values.format === "text" || values.format === "json"
|
|
15
18
|
? values.format
|
|
16
19
|
: "json";
|
|
17
|
-
const collector = createTraceCollector(
|
|
20
|
+
const collector = createTraceCollector({
|
|
21
|
+
now: () => isoTimestamp(runtime.clock.now()),
|
|
22
|
+
});
|
|
18
23
|
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
|
|
22
|
-
}
|
|
23
|
-
const input = Buffer.concat(chunks).toString("utf8");
|
|
24
|
-
|
|
25
|
-
for (const line of input.split("\n")) {
|
|
24
|
+
// `runtime.proc.stdin` is an AsyncIterable of UTF-8 lines (newline-split by
|
|
25
|
+
// the runtime), so each yielded value is exactly one NDJSON record.
|
|
26
|
+
for await (const line of runtime.proc.stdin) {
|
|
26
27
|
collector.addLine(line);
|
|
27
28
|
}
|
|
28
29
|
|
|
29
30
|
if (format === "text") {
|
|
30
|
-
|
|
31
|
+
runtime.proc.stdout.write(collector.toText() + "\n");
|
|
31
32
|
} else {
|
|
32
|
-
|
|
33
|
+
runtime.proc.stdout.write(JSON.stringify(collector.toJSON()) + "\n");
|
|
33
34
|
}
|
|
35
|
+
return { ok: true };
|
|
34
36
|
}
|