@forwardimpact/libeval 0.1.50 → 0.1.52
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/fit-benchmark.js +26 -27
- package/bin/fit-eval.js +36 -30
- package/bin/fit-trace.js +83 -57
- package/package.json +1 -1
- package/src/agent-runner.js +20 -12
- package/src/benchmark/apm-installer.js +48 -44
- package/src/benchmark/env-loader.js +35 -23
- package/src/benchmark/invariants.js +128 -0
- package/src/benchmark/judge.js +18 -19
- package/src/benchmark/npm-installer.js +33 -33
- package/src/benchmark/report.js +40 -26
- package/src/benchmark/result.js +11 -11
- package/src/benchmark/runner.js +90 -46
- package/src/benchmark/task-family.js +78 -65
- package/src/benchmark/workdir.js +100 -93
- package/src/commands/assert.js +30 -22
- package/src/commands/benchmark-invariants.js +74 -0
- package/src/commands/benchmark-report.js +24 -15
- package/src/commands/benchmark-run.js +16 -9
- package/src/commands/by-discussion.js +33 -23
- package/src/commands/callback.js +20 -11
- package/src/commands/discuss.js +31 -13
- package/src/commands/facilitate.js +21 -14
- package/src/commands/output.js +15 -13
- package/src/commands/run.js +28 -14
- package/src/commands/supervise.js +29 -19
- package/src/commands/task-input.js +10 -5
- package/src/commands/tee.js +24 -9
- package/src/commands/trace.js +181 -99
- package/src/discuss-tools.js +48 -2
- package/src/discusser.js +53 -2
- package/src/events/github.js +27 -5
- package/src/facilitator.js +4 -0
- package/src/inbox-poller.js +84 -0
- package/src/judge.js +4 -1
- package/src/message-bus.js +6 -0
- package/src/orchestration-loop.js +14 -4
- package/src/orchestration-toolkit.js +14 -0
- package/src/profile-prompt.js +22 -9
- package/src/redaction.js +31 -9
- package/src/reply-emitter.js +47 -0
- package/src/supervisor.js +4 -0
- package/src/tee-writer.js +4 -2
- package/src/trace-collector.js +9 -2
- package/src/trace-github.js +47 -27
- package/src/benchmark/scorer.js +0 -138
- package/src/commands/benchmark-score.js +0 -68
package/src/commands/run.js
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
|
-
import { createWriteStream } from "node:fs";
|
|
2
1
|
import { Writable } from "node:stream";
|
|
3
2
|
import { resolve } from "node:path";
|
|
3
|
+
import { isoTimestamp } from "@forwardimpact/libutil";
|
|
4
4
|
import { createAgentRunner } from "../agent-runner.js";
|
|
5
5
|
import { composeProfilePrompt } from "../profile-prompt.js";
|
|
6
6
|
import { createRedactor } from "../redaction.js";
|
|
@@ -12,10 +12,14 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
|
12
12
|
/**
|
|
13
13
|
* Parse and validate run command options from parsed values.
|
|
14
14
|
* @param {object} values - Parsed option values from cli.parse()
|
|
15
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
15
16
|
* @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }}
|
|
16
17
|
*/
|
|
17
|
-
function parseRunOptions(values) {
|
|
18
|
-
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
18
|
+
function parseRunOptions(values, runtime) {
|
|
19
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
20
|
+
values,
|
|
21
|
+
runtime,
|
|
22
|
+
);
|
|
19
23
|
const maxTurnsRaw = values["max-turns"] ?? "50";
|
|
20
24
|
|
|
21
25
|
return {
|
|
@@ -39,10 +43,11 @@ function parseRunOptions(values) {
|
|
|
39
43
|
*
|
|
40
44
|
* Usage: fit-eval run [options]
|
|
41
45
|
*
|
|
42
|
-
* @param {
|
|
43
|
-
* @
|
|
46
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
47
|
+
* @returns {Promise<{ok: boolean, code?: number, error?: string}>}
|
|
44
48
|
*/
|
|
45
|
-
export async function runRunCommand(values, _args) {
|
|
49
|
+
export async function runRunCommand(ctx) {
|
|
50
|
+
const runtime = ctx.deps.runtime;
|
|
46
51
|
const {
|
|
47
52
|
taskContent,
|
|
48
53
|
taskAmend,
|
|
@@ -53,19 +58,26 @@ export async function runRunCommand(values, _args) {
|
|
|
53
58
|
agentProfile,
|
|
54
59
|
allowedTools,
|
|
55
60
|
mcpServer,
|
|
56
|
-
} = parseRunOptions(
|
|
61
|
+
} = parseRunOptions(ctx.options, runtime);
|
|
57
62
|
|
|
58
63
|
// Build the redactor as the first observable side-effect after option
|
|
59
64
|
// parsing — the env snapshot must freeze BEFORE any in-process
|
|
60
|
-
//
|
|
61
|
-
const redactor = createRedactor();
|
|
65
|
+
// env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
|
|
66
|
+
const redactor = createRedactor({ runtime });
|
|
62
67
|
|
|
63
68
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
64
69
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
65
|
-
const fileStream = outputPath
|
|
70
|
+
const fileStream = outputPath
|
|
71
|
+
? runtime.fs.createWriteStream(outputPath)
|
|
72
|
+
: null;
|
|
66
73
|
const output = fileStream
|
|
67
|
-
? createTeeWriter({
|
|
68
|
-
|
|
74
|
+
? createTeeWriter({
|
|
75
|
+
fileStream,
|
|
76
|
+
textStream: runtime.proc.stdout,
|
|
77
|
+
mode: "raw",
|
|
78
|
+
now: () => isoTimestamp(runtime.clock.now()),
|
|
79
|
+
})
|
|
80
|
+
: runtime.proc.stdout;
|
|
69
81
|
|
|
70
82
|
const counter = new SequenceCounter();
|
|
71
83
|
const devNull = new Writable({
|
|
@@ -93,12 +105,13 @@ export async function runRunCommand(values, _args) {
|
|
|
93
105
|
}
|
|
94
106
|
|
|
95
107
|
if (agentProfile) {
|
|
96
|
-
|
|
108
|
+
runtime.proc.env.LIBEVAL_AGENT_PROFILE = agentProfile;
|
|
97
109
|
}
|
|
98
110
|
|
|
99
111
|
const systemPrompt = agentProfile
|
|
100
112
|
? composeProfilePrompt(agentProfile, {
|
|
101
113
|
profilesDir: resolve(cwd, ".claude/agents"),
|
|
114
|
+
runtime,
|
|
102
115
|
})
|
|
103
116
|
: undefined;
|
|
104
117
|
|
|
@@ -116,6 +129,7 @@ export async function runRunCommand(values, _args) {
|
|
|
116
129
|
taskAmend,
|
|
117
130
|
mcpServers,
|
|
118
131
|
redactor,
|
|
132
|
+
runtime,
|
|
119
133
|
});
|
|
120
134
|
|
|
121
135
|
const result = await runner.run(taskContent);
|
|
@@ -125,5 +139,5 @@ export async function runRunCommand(values, _args) {
|
|
|
125
139
|
await new Promise((r) => fileStream.end(r));
|
|
126
140
|
}
|
|
127
141
|
|
|
128
|
-
|
|
142
|
+
return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
|
|
129
143
|
}
|
package/src/commands/supervise.js
CHANGED
|
@@ -1,6 +1,5 @@
|
|
|
1
|
-
import { createWriteStream, mkdtempSync } from "node:fs";
|
|
2
1
|
import { resolve, join } from "node:path";
|
|
3
|
-
import {
|
|
2
|
+
import { isoTimestamp } from "@forwardimpact/libutil";
|
|
4
3
|
import { createSupervisor } from "../supervisor.js";
|
|
5
4
|
import { createRedactor } from "../redaction.js";
|
|
6
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
@@ -10,19 +9,27 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
|
10
9
|
/**
|
|
11
10
|
* Parse all supervise flags from parsed values into an options object.
|
|
12
11
|
* @param {object} values - Parsed option values from cli.parse()
|
|
13
|
-
* @
|
|
12
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
13
|
+
* @returns {Promise<object>}
|
|
14
14
|
*/
|
|
15
|
-
export function parseSuperviseOptions(values) {
|
|
16
|
-
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
15
|
+
export async function parseSuperviseOptions(values, runtime) {
|
|
16
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
17
|
+
values,
|
|
18
|
+
runtime,
|
|
19
|
+
);
|
|
17
20
|
const supervisorAllowedToolsRaw = values["supervisor-allowed-tools"];
|
|
18
21
|
|
|
22
|
+
const tmpRoot = runtime.proc.env.TMPDIR ?? "/tmp";
|
|
23
|
+
const agentCwd = resolve(
|
|
24
|
+
values["agent-cwd"] ??
|
|
25
|
+
(await runtime.fs.mkdtemp(join(tmpRoot, "fit-eval-agent-"))),
|
|
26
|
+
);
|
|
27
|
+
|
|
19
28
|
return {
|
|
20
29
|
taskContent,
|
|
21
30
|
taskAmend,
|
|
22
31
|
supervisorCwd: resolve(values["supervisor-cwd"] ?? "."),
|
|
23
|
-
agentCwd
|
|
24
|
-
values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
|
|
25
|
-
),
|
|
32
|
+
agentCwd,
|
|
26
33
|
agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
|
|
27
34
|
supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
|
|
28
35
|
maxTurns: (() => {
|
|
@@ -50,29 +57,31 @@ export function parseSuperviseOptions(values) {
|
|
|
50
57
|
*
|
|
51
58
|
* Usage: fit-eval supervise [options]
|
|
52
59
|
*
|
|
53
|
-
* @param {
|
|
54
|
-
* @
|
|
60
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
61
|
+
* @returns {Promise<{ok: boolean, code?: number, error?: string}>}
|
|
55
62
|
*/
|
|
56
|
-
export async function runSuperviseCommand(values, _args) {
|
|
57
|
-
const
|
|
63
|
+
export async function runSuperviseCommand(ctx) {
|
|
64
|
+
const runtime = ctx.deps.runtime;
|
|
65
|
+
const opts = await parseSuperviseOptions(ctx.options, runtime);
|
|
58
66
|
|
|
59
67
|
// Build the redactor as the first observable side-effect after option
|
|
60
68
|
// parsing — the env snapshot must freeze BEFORE any in-process
|
|
61
|
-
//
|
|
62
|
-
const redactor = createRedactor();
|
|
69
|
+
// env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
|
|
70
|
+
const redactor = createRedactor({ runtime });
|
|
63
71
|
|
|
64
72
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
65
73
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
66
74
|
const fileStream = opts.outputPath
|
|
67
|
-
? createWriteStream(opts.outputPath)
|
|
75
|
+
? runtime.fs.createWriteStream(opts.outputPath)
|
|
68
76
|
: null;
|
|
69
77
|
const output = fileStream
|
|
70
78
|
? createTeeWriter({
|
|
71
79
|
fileStream,
|
|
72
|
-
textStream:
|
|
80
|
+
textStream: runtime.proc.stdout,
|
|
73
81
|
mode: "supervised",
|
|
82
|
+
now: () => isoTimestamp(runtime.clock.now()),
|
|
74
83
|
})
|
|
75
|
-
:
|
|
84
|
+
: runtime.proc.stdout;
|
|
76
85
|
|
|
77
86
|
let agentMcpServers = null;
|
|
78
87
|
if (opts.mcpServer) {
|
|
@@ -88,7 +97,7 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
88
97
|
}
|
|
89
98
|
|
|
90
99
|
if (opts.agentProfile) {
|
|
91
|
-
|
|
100
|
+
runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
|
|
92
101
|
}
|
|
93
102
|
|
|
94
103
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
@@ -107,6 +116,7 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
107
116
|
taskAmend: opts.taskAmend,
|
|
108
117
|
agentMcpServers,
|
|
109
118
|
redactor,
|
|
119
|
+
runtime,
|
|
110
120
|
});
|
|
111
121
|
|
|
112
122
|
const result = await supervisor.run(opts.taskContent);
|
|
@@ -116,5 +126,5 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
116
126
|
await new Promise((r) => fileStream.end(r));
|
|
117
127
|
}
|
|
118
128
|
|
|
119
|
-
|
|
129
|
+
return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
|
|
120
130
|
}
|
package/src/commands/task-input.js
CHANGED
|
@@ -1,4 +1,3 @@
|
|
|
1
|
-
import { readFileSync } from "node:fs";
|
|
2
1
|
import { composeTaskFromGitHubEvent } from "../events/github.js";
|
|
3
2
|
|
|
4
3
|
/**
|
|
@@ -11,9 +10,12 @@ import { composeTaskFromGitHubEvent } from "../events/github.js";
|
|
|
11
10
|
* works as before.
|
|
12
11
|
*
|
|
13
12
|
* @param {object} values - Parsed option values from cli.parse()
|
|
13
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime - Ambient
|
|
14
|
+
* collaborators; `fsSync.readFileSync` loads `--task-file`/`--task-event`
|
|
15
|
+
* and `proc.env` resolves `GITHUB_EVENT_NAME`.
|
|
14
16
|
* @returns {{ task: string, amend: string | undefined }}
|
|
15
17
|
*/
|
|
16
|
-
export function resolveTaskContent(values) {
|
|
18
|
+
export function resolveTaskContent(values, runtime) {
|
|
17
19
|
const taskFile = values["task-file"];
|
|
18
20
|
const taskText = values["task-text"];
|
|
19
21
|
const taskEvent = values["task-event"];
|
|
@@ -33,17 +35,20 @@ export function resolveTaskContent(values) {
|
|
|
33
35
|
const amendFlag = values["task-amend"] ?? undefined;
|
|
34
36
|
|
|
35
37
|
if (taskFile) {
|
|
36
|
-
return {
|
|
38
|
+
return {
|
|
39
|
+
task: runtime.fsSync.readFileSync(taskFile, "utf8"),
|
|
40
|
+
amend: amendFlag,
|
|
41
|
+
};
|
|
37
42
|
}
|
|
38
43
|
if (taskText) {
|
|
39
44
|
return { task: taskText, amend: amendFlag };
|
|
40
45
|
}
|
|
41
46
|
|
|
42
|
-
const eventName =
|
|
47
|
+
const eventName = runtime.proc.env.GITHUB_EVENT_NAME;
|
|
43
48
|
if (!eventName) {
|
|
44
49
|
throw new Error("--task-event requires GITHUB_EVENT_NAME to be set");
|
|
45
50
|
}
|
|
46
|
-
const payload = JSON.parse(readFileSync(taskEvent, "utf8"));
|
|
51
|
+
const payload = JSON.parse(runtime.fsSync.readFileSync(taskEvent, "utf8"));
|
|
47
52
|
const composed = composeTaskFromGitHubEvent(payload, eventName);
|
|
48
53
|
return { task: composed.task, amend: amendFlag ?? composed.amend };
|
|
49
54
|
}
|
package/src/commands/tee.js
CHANGED
|
@@ -1,32 +1,47 @@
|
|
|
1
|
-
import { createWriteStream } from "fs";
|
|
2
1
|
import { PassThrough } from "node:stream";
|
|
3
2
|
import { pipeline } from "node:stream/promises";
|
|
3
|
+
import { isoTimestamp } from "@forwardimpact/libutil";
|
|
4
4
|
import { createTeeWriter } from "../tee-writer.js";
|
|
5
5
|
|
|
6
6
|
/**
|
|
7
7
|
* Tee command — stream text output to stdout while optionally saving the raw
|
|
8
|
-
* NDJSON to a file.
|
|
8
|
+
* NDJSON to a file. Reads stdin line-by-line through the injected runtime and
|
|
9
|
+
* re-delimits each record with a newline so the TeeWriter's line splitter sees
|
|
10
|
+
* the same framing the raw byte stream produced.
|
|
9
11
|
*
|
|
10
12
|
* Usage: fit-eval tee [output.ndjson] < trace.ndjson
|
|
11
13
|
*
|
|
12
|
-
* @param {
|
|
13
|
-
* @
|
|
14
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
15
|
+
* @returns {Promise<{ok: boolean, code?: number, error?: string}>}
|
|
14
16
|
*/
|
|
15
|
-
export async function runTeeCommand(
|
|
16
|
-
const
|
|
17
|
-
const
|
|
17
|
+
export async function runTeeCommand(ctx) {
|
|
18
|
+
const runtime = ctx.deps.runtime;
|
|
19
|
+
const outputPath = ctx.args.output ?? null;
|
|
20
|
+
const fileStream = outputPath
|
|
21
|
+
? runtime.fs.createWriteStream(outputPath)
|
|
22
|
+
: null;
|
|
18
23
|
|
|
19
24
|
// TeeWriter requires a fileStream; when no output file is specified,
|
|
20
25
|
// use a PassThrough as a no-op sink (NDJSON is not saved).
|
|
21
26
|
const sink = fileStream ?? new PassThrough();
|
|
22
27
|
const tee = createTeeWriter({
|
|
23
28
|
fileStream: sink,
|
|
24
|
-
textStream:
|
|
29
|
+
textStream: runtime.proc.stdout,
|
|
25
30
|
mode: "raw",
|
|
31
|
+
now: () => isoTimestamp(runtime.clock.now()),
|
|
26
32
|
});
|
|
27
33
|
|
|
28
34
|
try {
|
|
29
|
-
|
|
35
|
+
// `runtime.proc.stdin` yields newline-stripped lines; re-append `\n` so the
|
|
36
|
+
// TeeWriter's `_write` line splitter frames records exactly as it did when
|
|
37
|
+
// piped the raw byte stream.
|
|
38
|
+
const lines = (async function* () {
|
|
39
|
+
for await (const line of runtime.proc.stdin) yield `${line}\n`;
|
|
40
|
+
})();
|
|
41
|
+
await pipeline(lines, tee);
|
|
42
|
+
return { ok: true };
|
|
43
|
+
} catch (error) {
|
|
44
|
+
return { ok: false, code: 1, error: error.message };
|
|
30
45
|
} finally {
|
|
31
46
|
if (fileStream) {
|
|
32
47
|
await new Promise((resolve, reject) => {
|