@forwardimpact/libeval 0.1.49 → 0.1.51
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +11 -8
- package/bin/fit-benchmark.js +26 -27
- package/bin/fit-eval.js +76 -78
- package/bin/fit-trace.js +83 -57
- package/package.json +2 -2
- package/src/agent-runner.js +23 -13
- package/src/benchmark/env-loader.js +35 -23
- package/src/benchmark/{scorer.js → invariants.js} +14 -12
- package/src/benchmark/judge.js +5 -8
- package/src/benchmark/npm-installer.js +87 -0
- package/src/benchmark/report.js +15 -15
- package/src/benchmark/result.js +11 -11
- package/src/benchmark/runner.js +17 -11
- package/src/benchmark/task-family.js +6 -4
- package/src/benchmark/workdir.js +23 -3
- package/src/commands/assert.js +30 -22
- package/src/commands/benchmark-invariants.js +74 -0
- package/src/commands/benchmark-report.js +23 -15
- package/src/commands/benchmark-run.js +22 -7
- package/src/commands/by-discussion.js +29 -18
- package/src/commands/callback.js +20 -11
- package/src/commands/discuss.js +30 -21
- package/src/commands/facilitate.js +20 -21
- package/src/commands/output.js +11 -12
- package/src/commands/run.js +24 -21
- package/src/commands/supervise.js +27 -27
- package/src/commands/task-input.js +54 -0
- package/src/commands/trace.js +174 -97
- package/src/discuss-tools.js +48 -2
- package/src/discusser.js +49 -2
- package/src/events/github.js +155 -0
- package/src/inbox-poller.js +84 -0
- package/src/index.js +10 -0
- package/src/judge.js +1 -1
- package/src/message-bus.js +6 -0
- package/src/orchestration-loop.js +19 -5
- package/src/orchestration-toolkit.js +14 -0
- package/src/redaction.js +31 -9
- package/src/reply-emitter.js +47 -0
- package/src/commands/benchmark-score.js +0 -68
package/src/commands/discuss.js
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { createWriteStream } from "node:fs";
|
|
2
2
|
import { resolve } from "node:path";
|
|
3
3
|
import { createDiscusser } from "../discusser.js";
|
|
4
4
|
import { createRedactor } from "../redaction.js";
|
|
5
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
6
|
+
import { resolveTaskContent } from "./task-input.js";
|
|
6
7
|
|
|
7
8
|
function parseAgentProfiles(raw, cwd, maxTurns) {
|
|
8
9
|
if (!raw) return [];
|
|
@@ -16,19 +17,14 @@ function parseAgentProfiles(raw, cwd, maxTurns) {
|
|
|
16
17
|
* Parse and validate discuss command options. Exported so tests can verify
|
|
17
18
|
* defaults and the legacy-flag clean break.
|
|
18
19
|
* @param {object} values - Parsed option values
|
|
20
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
19
21
|
* @returns {object}
|
|
20
22
|
*/
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
|
|
24
|
-
|
|
25
|
-
|
|
26
|
-
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
27
|
-
if (!taskFile && !taskText)
|
|
28
|
-
throw new Error("--task-file or --task-text is required");
|
|
29
|
-
|
|
30
|
-
const taskAmend = values["task-amend"] ?? undefined;
|
|
31
|
-
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
23
|
+
export function parseDiscussOptions(values, runtime) {
|
|
24
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
25
|
+
values,
|
|
26
|
+
runtime,
|
|
27
|
+
);
|
|
32
28
|
|
|
33
29
|
const profilesRaw = values["agent-profiles"];
|
|
34
30
|
const agentCwd = resolve(values["agent-cwd"] ?? ".");
|
|
@@ -48,6 +44,9 @@ export function parseDiscussOptions(values) {
|
|
|
48
44
|
}
|
|
49
45
|
}
|
|
50
46
|
|
|
47
|
+
const maxLeadTurnsRaw = values["max-lead-turns"] ?? "200";
|
|
48
|
+
const maxLeadTurns = parseInt(maxLeadTurnsRaw, 10);
|
|
49
|
+
|
|
51
50
|
return {
|
|
52
51
|
taskContent,
|
|
53
52
|
taskAmend,
|
|
@@ -56,9 +55,13 @@ export function parseDiscussOptions(values) {
|
|
|
56
55
|
leadModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
|
|
57
56
|
agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
|
|
58
57
|
maxTurns,
|
|
58
|
+
maxLeadTurns,
|
|
59
59
|
outputPath: values.output,
|
|
60
60
|
discussionId: values["discussion-id"] ?? null,
|
|
61
61
|
resumeContext,
|
|
62
|
+
callbackUrl: runtime.proc.env.CALLBACK_URL ?? null,
|
|
63
|
+
inboxUrl: runtime.proc.env.INBOX_URL ?? null,
|
|
64
|
+
correlationId: runtime.proc.env.CORRELATION_ID ?? null,
|
|
62
65
|
};
|
|
63
66
|
}
|
|
64
67
|
|
|
@@ -67,13 +70,14 @@ export function parseDiscussOptions(values) {
|
|
|
67
70
|
* semantics, threading `discussion_id` through the trace so multi-run
|
|
68
71
|
* conversations are queryable as one.
|
|
69
72
|
*
|
|
70
|
-
* @param {
|
|
71
|
-
* @
|
|
73
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
74
|
+
* @returns {Promise<{ok: boolean, code?: number, error?: string}>}
|
|
72
75
|
*/
|
|
73
|
-
export async function runDiscussCommand(
|
|
74
|
-
const
|
|
76
|
+
export async function runDiscussCommand(ctx) {
|
|
77
|
+
const runtime = ctx.deps.runtime;
|
|
78
|
+
const opts = parseDiscussOptions(ctx.options, runtime);
|
|
75
79
|
|
|
76
|
-
const redactor = createRedactor();
|
|
80
|
+
const redactor = createRedactor({ runtime });
|
|
77
81
|
|
|
78
82
|
const fileStream = opts.outputPath
|
|
79
83
|
? createWriteStream(opts.outputPath)
|
|
@@ -81,13 +85,13 @@ export async function runDiscussCommand(values, _args) {
|
|
|
81
85
|
const output = fileStream
|
|
82
86
|
? createTeeWriter({
|
|
83
87
|
fileStream,
|
|
84
|
-
textStream:
|
|
88
|
+
textStream: runtime.proc.stdout,
|
|
85
89
|
mode: "supervised",
|
|
86
90
|
})
|
|
87
|
-
:
|
|
91
|
+
: runtime.proc.stdout;
|
|
88
92
|
|
|
89
93
|
if (opts.leadProfile) {
|
|
90
|
-
|
|
94
|
+
runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.leadProfile;
|
|
91
95
|
}
|
|
92
96
|
|
|
93
97
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
@@ -101,8 +105,13 @@ export async function runDiscussCommand(values, _args) {
|
|
|
101
105
|
query,
|
|
102
106
|
output,
|
|
103
107
|
maxTurns: opts.maxTurns,
|
|
108
|
+
maxLeadTurns: opts.maxLeadTurns,
|
|
104
109
|
taskAmend: opts.taskAmend,
|
|
105
110
|
redactor,
|
|
111
|
+
callbackUrl: opts.callbackUrl,
|
|
112
|
+
inboxUrl: opts.inboxUrl,
|
|
113
|
+
correlationId: opts.correlationId,
|
|
114
|
+
runtime,
|
|
106
115
|
});
|
|
107
116
|
|
|
108
117
|
const result = await discusser.run(opts.taskContent);
|
|
@@ -112,5 +121,5 @@ export async function runDiscussCommand(values, _args) {
|
|
|
112
121
|
await new Promise((r) => fileStream.end(r));
|
|
113
122
|
}
|
|
114
123
|
|
|
115
|
-
|
|
124
|
+
return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
|
|
116
125
|
}
|
package/src/commands/facilitate.js
CHANGED
|
@@ -1,8 +1,9 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { createWriteStream } from "node:fs";
|
|
2
2
|
import { resolve } from "node:path";
|
|
3
3
|
import { createFacilitator } from "../facilitator.js";
|
|
4
4
|
import { createRedactor } from "../redaction.js";
|
|
5
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
6
|
+
import { resolveTaskContent } from "./task-input.js";
|
|
6
7
|
|
|
7
8
|
/**
|
|
8
9
|
* Parse comma-separated agent profile names into structured configs.
|
|
@@ -22,18 +23,14 @@ function parseAgentProfiles(raw, cwd, maxTurns) {
|
|
|
22
23
|
* coverage of the `--max-turns` → per-agent threading contract; not part
|
|
23
24
|
* of the package's public API.
|
|
24
25
|
* @param {object} values - Parsed option values
|
|
26
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
25
27
|
* @returns {object} Parsed options
|
|
26
28
|
*/
|
|
27
|
-
export function parseFacilitateOptions(values) {
|
|
28
|
-
const
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
|
|
32
|
-
if (!taskFile && !taskText)
|
|
33
|
-
throw new Error("--task-file or --task-text is required");
|
|
34
|
-
|
|
35
|
-
const taskAmend = values["task-amend"] ?? undefined;
|
|
36
|
-
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
29
|
+
export function parseFacilitateOptions(values, runtime) {
|
|
30
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
31
|
+
values,
|
|
32
|
+
runtime,
|
|
33
|
+
);
|
|
37
34
|
|
|
38
35
|
const profilesRaw = values["agent-profiles"];
|
|
39
36
|
if (!profilesRaw) throw new Error("--agent-profiles is required");
|
|
@@ -66,16 +63,17 @@ export function parseFacilitateOptions(values) {
|
|
|
66
63
|
*
|
|
67
64
|
* Usage: fit-eval facilitate [options]
|
|
68
65
|
*
|
|
69
|
-
* @param {
|
|
70
|
-
* @
|
|
66
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
67
|
+
* @returns {Promise<{ok: boolean, code?: number, error?: string}>}
|
|
71
68
|
*/
|
|
72
|
-
export async function runFacilitateCommand(
|
|
73
|
-
const
|
|
69
|
+
export async function runFacilitateCommand(ctx) {
|
|
70
|
+
const runtime = ctx.deps.runtime;
|
|
71
|
+
const opts = parseFacilitateOptions(ctx.options, runtime);
|
|
74
72
|
|
|
75
73
|
// Build the redactor as the first observable side-effect after option
|
|
76
74
|
// parsing — the env snapshot must freeze BEFORE any in-process
|
|
77
|
-
//
|
|
78
|
-
const redactor = createRedactor();
|
|
75
|
+
// env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
|
|
76
|
+
const redactor = createRedactor({ runtime });
|
|
79
77
|
|
|
80
78
|
const fileStream = opts.outputPath
|
|
81
79
|
? createWriteStream(opts.outputPath)
|
|
@@ -83,13 +81,13 @@ export async function runFacilitateCommand(values, _args) {
|
|
|
83
81
|
const output = fileStream
|
|
84
82
|
? createTeeWriter({
|
|
85
83
|
fileStream,
|
|
86
|
-
textStream:
|
|
84
|
+
textStream: runtime.proc.stdout,
|
|
87
85
|
mode: "supervised",
|
|
88
86
|
})
|
|
89
|
-
:
|
|
87
|
+
: runtime.proc.stdout;
|
|
90
88
|
|
|
91
89
|
if (opts.facilitatorProfile) {
|
|
92
|
-
|
|
90
|
+
runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.facilitatorProfile;
|
|
93
91
|
}
|
|
94
92
|
|
|
95
93
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
@@ -104,6 +102,7 @@ export async function runFacilitateCommand(values, _args) {
|
|
|
104
102
|
facilitatorProfile: opts.facilitatorProfile,
|
|
105
103
|
taskAmend: opts.taskAmend,
|
|
106
104
|
redactor,
|
|
105
|
+
runtime,
|
|
107
106
|
});
|
|
108
107
|
|
|
109
108
|
const result = await facilitator.run(opts.taskContent);
|
|
@@ -113,5 +112,5 @@ export async function runFacilitateCommand(values, _args) {
|
|
|
113
112
|
await new Promise((r) => fileStream.end(r));
|
|
114
113
|
}
|
|
115
114
|
|
|
116
|
-
|
|
115
|
+
return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
|
|
117
116
|
}
|
package/src/commands/output.js
CHANGED
|
@@ -6,29 +6,28 @@ import { createTraceCollector } from "@forwardimpact/libeval";
|
|
|
6
6
|
*
|
|
7
7
|
* Usage: fit-eval output [--format=json|text] < trace.ndjson
|
|
8
8
|
*
|
|
9
|
-
* @param {
|
|
10
|
-
* @
|
|
9
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
10
|
+
* @returns {Promise<{ok: true}>}
|
|
11
11
|
*/
|
|
12
|
-
export async function runOutputCommand(
|
|
12
|
+
export async function runOutputCommand(ctx) {
  const { options } = ctx;
  const { runtime } = ctx.deps;

  // Only "text" selects the text renderer; every other value (including an
  // absent or unrecognised --format) falls back to JSON.
  const format = options.format === "text" ? "text" : "json";
  const collector = createTraceCollector();

  // Each value pulled from `runtime.proc.stdin` is handed to the collector
  // unchanged, as one trace line.
  for await (const record of runtime.proc.stdin) {
    collector.addLine(record);
  }

  const rendered =
    format === "text"
      ? collector.toText()
      : JSON.stringify(collector.toJSON());
  runtime.proc.stdout.write(rendered + "\n");

  return { ok: true };
}
|
package/src/commands/run.js
CHANGED
|
@@ -1,4 +1,4 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { createWriteStream } from "node:fs";
|
|
2
2
|
import { Writable } from "node:stream";
|
|
3
3
|
import { resolve } from "node:path";
|
|
4
4
|
import { createAgentRunner } from "../agent-runner.js";
|
|
@@ -6,24 +6,21 @@ import { composeProfilePrompt } from "../profile-prompt.js";
|
|
|
6
6
|
import { createRedactor } from "../redaction.js";
|
|
7
7
|
import { createTeeWriter } from "../tee-writer.js";
|
|
8
8
|
import { SequenceCounter } from "../sequence-counter.js";
|
|
9
|
+
import { resolveTaskContent } from "./task-input.js";
|
|
9
10
|
import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
10
11
|
|
|
11
12
|
/**
|
|
12
13
|
* Parse and validate run command options from parsed values.
|
|
13
14
|
* @param {object} values - Parsed option values from cli.parse()
|
|
15
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
14
16
|
* @returns {{ taskContent: string, cwd: string, model: string, maxTurns: number, outputPath: string|undefined, agentProfile: string|undefined, allowedTools: string[] }}
|
|
15
17
|
*/
|
|
16
|
-
function parseRunOptions(values) {
|
|
17
|
-
const
|
|
18
|
-
|
|
19
|
-
|
|
20
|
-
|
|
21
|
-
if (!taskFile && !taskText)
|
|
22
|
-
throw new Error("--task-file or --task-text is required");
|
|
23
|
-
|
|
18
|
+
function parseRunOptions(values, runtime) {
|
|
19
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
20
|
+
values,
|
|
21
|
+
runtime,
|
|
22
|
+
);
|
|
24
23
|
const maxTurnsRaw = values["max-turns"] ?? "50";
|
|
25
|
-
const taskAmend = values["task-amend"] ?? undefined;
|
|
26
|
-
const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;
|
|
27
24
|
|
|
28
25
|
return {
|
|
29
26
|
taskContent,
|
|
@@ -46,10 +43,11 @@ function parseRunOptions(values) {
|
|
|
46
43
|
*
|
|
47
44
|
* Usage: fit-eval run [options]
|
|
48
45
|
*
|
|
49
|
-
* @param {
|
|
50
|
-
* @
|
|
46
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
47
|
+
* @returns {Promise<{ok: boolean, code?: number, error?: string}>}
|
|
51
48
|
*/
|
|
52
|
-
export async function runRunCommand(
|
|
49
|
+
export async function runRunCommand(ctx) {
|
|
50
|
+
const runtime = ctx.deps.runtime;
|
|
53
51
|
const {
|
|
54
52
|
taskContent,
|
|
55
53
|
taskAmend,
|
|
@@ -60,19 +58,23 @@ export async function runRunCommand(values, _args) {
|
|
|
60
58
|
agentProfile,
|
|
61
59
|
allowedTools,
|
|
62
60
|
mcpServer,
|
|
63
|
-
} = parseRunOptions(
|
|
61
|
+
} = parseRunOptions(ctx.options, runtime);
|
|
64
62
|
|
|
65
63
|
// Build the redactor as the first observable side-effect after option
|
|
66
64
|
// parsing — the env snapshot must freeze BEFORE any in-process
|
|
67
|
-
//
|
|
68
|
-
const redactor = createRedactor();
|
|
65
|
+
// env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
|
|
66
|
+
const redactor = createRedactor({ runtime });
|
|
69
67
|
|
|
70
68
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
71
69
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
72
70
|
const fileStream = outputPath ? createWriteStream(outputPath) : null;
|
|
73
71
|
const output = fileStream
|
|
74
|
-
? createTeeWriter({
|
|
75
|
-
|
|
72
|
+
? createTeeWriter({
|
|
73
|
+
fileStream,
|
|
74
|
+
textStream: runtime.proc.stdout,
|
|
75
|
+
mode: "raw",
|
|
76
|
+
})
|
|
77
|
+
: runtime.proc.stdout;
|
|
76
78
|
|
|
77
79
|
const counter = new SequenceCounter();
|
|
78
80
|
const devNull = new Writable({
|
|
@@ -100,7 +102,7 @@ export async function runRunCommand(values, _args) {
|
|
|
100
102
|
}
|
|
101
103
|
|
|
102
104
|
if (agentProfile) {
|
|
103
|
-
|
|
105
|
+
runtime.proc.env.LIBEVAL_AGENT_PROFILE = agentProfile;
|
|
104
106
|
}
|
|
105
107
|
|
|
106
108
|
const systemPrompt = agentProfile
|
|
@@ -123,6 +125,7 @@ export async function runRunCommand(values, _args) {
|
|
|
123
125
|
taskAmend,
|
|
124
126
|
mcpServers,
|
|
125
127
|
redactor,
|
|
128
|
+
runtime,
|
|
126
129
|
});
|
|
127
130
|
|
|
128
131
|
const result = await runner.run(taskContent);
|
|
@@ -132,5 +135,5 @@ export async function runRunCommand(values, _args) {
|
|
|
132
135
|
await new Promise((r) => fileStream.end(r));
|
|
133
136
|
}
|
|
134
137
|
|
|
135
|
-
|
|
138
|
+
return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
|
|
136
139
|
}
|
package/src/commands/supervise.js
CHANGED
|
@@ -1,37 +1,35 @@
|
|
|
1
|
-
import {
|
|
1
|
+
import { createWriteStream } from "node:fs";
|
|
2
2
|
import { resolve, join } from "node:path";
|
|
3
|
-
import { tmpdir } from "node:os";
|
|
4
3
|
import { createSupervisor } from "../supervisor.js";
|
|
5
4
|
import { createRedactor } from "../redaction.js";
|
|
6
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
6
|
+
import { resolveTaskContent } from "./task-input.js";
|
|
7
7
|
import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
8
8
|
|
|
9
9
|
/**
|
|
10
10
|
* Parse all supervise flags from parsed values into an options object.
|
|
11
11
|
* @param {object} values - Parsed option values from cli.parse()
|
|
12
|
-
* @
|
|
12
|
+
* @param {import("@forwardimpact/libutil/runtime").Runtime} runtime
|
|
13
|
+
* @returns {Promise<object>}
|
|
13
14
|
*/
|
|
14
|
-
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
|
|
18
|
-
|
|
19
|
-
throw new Error("--task-file and --task-text are mutually exclusive");
|
|
20
|
-
if (!taskFile && !taskText)
|
|
21
|
-
throw new Error("--task-file or --task-text is required");
|
|
22
|
-
|
|
15
|
+
export async function parseSuperviseOptions(values, runtime) {
|
|
16
|
+
const { task: taskContent, amend: taskAmend } = resolveTaskContent(
|
|
17
|
+
values,
|
|
18
|
+
runtime,
|
|
19
|
+
);
|
|
23
20
|
const supervisorAllowedToolsRaw = values["supervisor-allowed-tools"];
|
|
24
21
|
|
|
25
|
-
const
|
|
26
|
-
const
|
|
22
|
+
const tmpRoot = runtime.proc.env.TMPDIR ?? "/tmp";
|
|
23
|
+
const agentCwd = resolve(
|
|
24
|
+
values["agent-cwd"] ??
|
|
25
|
+
(await runtime.fs.mkdtemp(join(tmpRoot, "fit-eval-agent-"))),
|
|
26
|
+
);
|
|
27
27
|
|
|
28
28
|
return {
|
|
29
29
|
taskContent,
|
|
30
30
|
taskAmend,
|
|
31
31
|
supervisorCwd: resolve(values["supervisor-cwd"] ?? "."),
|
|
32
|
-
agentCwd
|
|
33
|
-
values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
|
|
34
|
-
),
|
|
32
|
+
agentCwd,
|
|
35
33
|
agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
|
|
36
34
|
supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
|
|
37
35
|
maxTurns: (() => {
|
|
@@ -59,16 +57,17 @@ export function parseSuperviseOptions(values) {
|
|
|
59
57
|
*
|
|
60
58
|
* Usage: fit-eval supervise [options]
|
|
61
59
|
*
|
|
62
|
-
* @param {
|
|
63
|
-
* @
|
|
60
|
+
* @param {import("@forwardimpact/libcli").InvocationContext} ctx
|
|
61
|
+
* @returns {Promise<{ok: boolean, code?: number, error?: string}>}
|
|
64
62
|
*/
|
|
65
|
-
export async function runSuperviseCommand(
|
|
66
|
-
const
|
|
63
|
+
export async function runSuperviseCommand(ctx) {
|
|
64
|
+
const runtime = ctx.deps.runtime;
|
|
65
|
+
const opts = await parseSuperviseOptions(ctx.options, runtime);
|
|
67
66
|
|
|
68
67
|
// Build the redactor as the first observable side-effect after option
|
|
69
68
|
// parsing — the env snapshot must freeze BEFORE any in-process
|
|
70
|
-
//
|
|
71
|
-
const redactor = createRedactor();
|
|
69
|
+
// env writes the command performs (e.g. LIBEVAL_AGENT_PROFILE).
|
|
70
|
+
const redactor = createRedactor({ runtime });
|
|
72
71
|
|
|
73
72
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
74
73
|
// Otherwise, write NDJSON directly to stdout (backwards-compatible).
|
|
@@ -78,10 +77,10 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
78
77
|
const output = fileStream
|
|
79
78
|
? createTeeWriter({
|
|
80
79
|
fileStream,
|
|
81
|
-
textStream:
|
|
80
|
+
textStream: runtime.proc.stdout,
|
|
82
81
|
mode: "supervised",
|
|
83
82
|
})
|
|
84
|
-
:
|
|
83
|
+
: runtime.proc.stdout;
|
|
85
84
|
|
|
86
85
|
let agentMcpServers = null;
|
|
87
86
|
if (opts.mcpServer) {
|
|
@@ -97,7 +96,7 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
97
96
|
}
|
|
98
97
|
|
|
99
98
|
if (opts.agentProfile) {
|
|
100
|
-
|
|
99
|
+
runtime.proc.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
|
|
101
100
|
}
|
|
102
101
|
|
|
103
102
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
@@ -116,6 +115,7 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
116
115
|
taskAmend: opts.taskAmend,
|
|
117
116
|
agentMcpServers,
|
|
118
117
|
redactor,
|
|
118
|
+
runtime,
|
|
119
119
|
});
|
|
120
120
|
|
|
121
121
|
const result = await supervisor.run(opts.taskContent);
|
|
@@ -125,5 +125,5 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
125
125
|
await new Promise((r) => fileStream.end(r));
|
|
126
126
|
}
|
|
127
127
|
|
|
128
|
-
|
|
128
|
+
return result.success ? { ok: true } : { ok: false, code: 1, error: "" };
|
|
129
129
|
}
|
package/src/commands/task-input.js
ADDED
|
@@ -0,0 +1,54 @@
|
|
|
1
|
+
import { composeTaskFromGitHubEvent } from "../events/github.js";
|
|
2
|
+
|
|
3
|
+
/**
 * Resolve `--task-file` / `--task-text` / `--task-event` into the task pair the
 * runner consumes. Exactly one of the three must be set; a flag whose value is
 * falsy (absent or an empty string) counts as unset. For `--task-event`,
 * libeval reads the event payload and extracts both the main task (from the
 * template that matches `$GITHUB_EVENT_NAME` + `payload.action`) and the
 * amendment (from `payload.inputs?.prompt`) — so the workflow doesn't need to
 * wire `--task-amend` separately; an explicit `--task-amend` still takes
 * precedence over the amendment derived from the event. For the other two
 * modes, `--task-amend` works as before.
 *
 * @param {object} values - Parsed option values from cli.parse()
 * @param {import("@forwardimpact/libutil/runtime").Runtime} runtime - Ambient
 *   collaborators; `fsSync.readFileSync` loads `--task-file`/`--task-event`
 *   and `proc.env` resolves `GITHUB_EVENT_NAME`.
 * @returns {{ task: string, amend: string | undefined }}
 * @throws {Error} When none or more than one of the task flags is set, or when
 *   `--task-event` is used while `GITHUB_EVENT_NAME` is unset.
 * @throws {SyntaxError} When the `--task-event` file does not contain valid
 *   JSON; the message names the flag and path, and `cause` carries the
 *   original parser error.
 */
export function resolveTaskContent(values, runtime) {
  const taskFile = values["task-file"];
  const taskText = values["task-text"];
  const taskEvent = values["task-event"];

  const set = [taskFile, taskText, taskEvent].filter(Boolean).length;
  if (set === 0) {
    throw new Error(
      "one of --task-file, --task-text, --task-event is required",
    );
  }
  if (set > 1) {
    throw new Error(
      "--task-file, --task-text, --task-event are mutually exclusive",
    );
  }

  // Normalise a null flag value to undefined so callers see one "absent" shape.
  const amendFlag = values["task-amend"] ?? undefined;

  if (taskFile) {
    return {
      task: runtime.fsSync.readFileSync(taskFile, "utf8"),
      amend: amendFlag,
    };
  }
  if (taskText) {
    return { task: taskText, amend: amendFlag };
  }

  const eventName = runtime.proc.env.GITHUB_EVENT_NAME;
  if (!eventName) {
    throw new Error("--task-event requires GITHUB_EVENT_NAME to be set");
  }

  // Read outside the try block so filesystem errors propagate untouched; only
  // the JSON parse failure is re-labelled with the flag and path that caused it.
  const rawPayload = runtime.fsSync.readFileSync(taskEvent, "utf8");
  let payload;
  try {
    payload = JSON.parse(rawPayload);
  } catch (err) {
    throw new SyntaxError(
      `--task-event payload at ${taskEvent} is not valid JSON: ${err.message}`,
      { cause: err },
    );
  }

  const composed = composeTaskFromGitHubEvent(payload, eventName);
  return { task: composed.task, amend: amendFlag ?? composed.amend };
}
|