@forwardimpact/libeval 0.1.42 → 0.1.44
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/bin/fit-benchmark.js +2 -2
- package/bin/fit-eval.js +103 -22
- package/bin/fit-trace.js +14 -0
- package/package.json +1 -1
- package/src/commands/benchmark-run.js +1 -1
- package/src/commands/by-discussion.js +84 -0
- package/src/commands/callback.js +104 -0
- package/src/commands/discuss.js +116 -0
- package/src/commands/facilitate.js +16 -8
- package/src/commands/supervise.js +4 -4
- package/src/discuss-tools.js +203 -0
- package/src/discusser.js +332 -0
- package/src/facilitator.js +40 -334
- package/src/index.js +14 -0
- package/src/orchestration-loop.js +369 -0
- package/src/redaction.js +10 -0
- package/src/render/orchestrator-filter.js +1 -0
- package/src/supervisor.js +17 -5
- package/src/trace-collector.js +4 -0
package/bin/fit-benchmark.js
CHANGED
|
@@ -46,10 +46,10 @@ export const definition = {
|
|
|
46
46
|
description:
|
|
47
47
|
"Claude model for the agent-under-test (default: claude-sonnet-4-6)",
|
|
48
48
|
},
|
|
49
|
-
"
|
|
49
|
+
"lead-model": {
|
|
50
50
|
type: "string",
|
|
51
51
|
description:
|
|
52
|
-
"Claude model for the
|
|
52
|
+
"Claude model for the lead role (default: claude-opus-4-7)",
|
|
53
53
|
},
|
|
54
54
|
"judge-model": {
|
|
55
55
|
type: "string",
|
package/bin/fit-eval.js
CHANGED
|
@@ -9,6 +9,8 @@ import { runTeeCommand } from "../src/commands/tee.js";
|
|
|
9
9
|
import { runRunCommand } from "../src/commands/run.js";
|
|
10
10
|
import { runSuperviseCommand } from "../src/commands/supervise.js";
|
|
11
11
|
import { runFacilitateCommand } from "../src/commands/facilitate.js";
|
|
12
|
+
import { runDiscussCommand } from "../src/commands/discuss.js";
|
|
13
|
+
import { runCallbackCommand } from "../src/commands/callback.js";
|
|
12
14
|
|
|
13
15
|
// `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
|
|
14
16
|
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
@@ -18,6 +20,18 @@ const VERSION =
|
|
|
18
20
|
JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
|
|
19
21
|
.version;
|
|
20
22
|
|
|
23
|
+
/**
 * Option definitions for the lead role. The object is spread into the
 * `options` map of every command that accepts `--lead-profile` and
 * `--lead-model`, so those commands expose the same two flags with the
 * same help text.
 */
const LEAD_OPTIONS = {
  "lead-profile": {
    description: "Lead role profile name (supervisor / facilitator / chair)",
    type: "string",
  },
  "lead-model": {
    description: "Claude model for the lead role (default: claude-opus-4-7[1m])",
    type: "string",
  },
};
|
|
34
|
+
|
|
21
35
|
const definition = {
|
|
22
36
|
name: "fit-eval",
|
|
23
37
|
version: VERSION,
|
|
@@ -93,14 +107,11 @@ const definition = {
|
|
|
93
107
|
description:
|
|
94
108
|
"Claude model for the agent (default: claude-opus-4-7[1m])",
|
|
95
109
|
},
|
|
96
|
-
|
|
97
|
-
type: "string",
|
|
98
|
-
description:
|
|
99
|
-
"Claude model for the supervisor (default: claude-opus-4-7[1m])",
|
|
100
|
-
},
|
|
110
|
+
...LEAD_OPTIONS,
|
|
101
111
|
"max-turns": {
|
|
102
112
|
type: "string",
|
|
103
|
-
description:
|
|
113
|
+
description:
|
|
114
|
+
"Max agentic turns per runner invocation (default: 200, 0 = unlimited)",
|
|
104
115
|
},
|
|
105
116
|
output: {
|
|
106
117
|
type: "string",
|
|
@@ -116,10 +127,6 @@ const definition = {
|
|
|
116
127
|
description: "Supervisor working directory",
|
|
117
128
|
},
|
|
118
129
|
"agent-cwd": { type: "string", description: "Agent working directory" },
|
|
119
|
-
"supervisor-profile": {
|
|
120
|
-
type: "string",
|
|
121
|
-
description: "Supervisor (judge) profile name",
|
|
122
|
-
},
|
|
123
130
|
"supervisor-allowed-tools": {
|
|
124
131
|
type: "string",
|
|
125
132
|
description: "Supervisor tool allowlist",
|
|
@@ -153,11 +160,7 @@ const definition = {
|
|
|
153
160
|
type: "string",
|
|
154
161
|
description: "Claude model for agents (default: claude-opus-4-7[1m])",
|
|
155
162
|
},
|
|
156
|
-
|
|
157
|
-
type: "string",
|
|
158
|
-
description:
|
|
159
|
-
"Claude model for the facilitator (default: claude-opus-4-7[1m])",
|
|
160
|
-
},
|
|
163
|
+
...LEAD_OPTIONS,
|
|
161
164
|
"max-turns": {
|
|
162
165
|
type: "string",
|
|
163
166
|
description: "Max agentic turns (default: 20, 0 = unlimited)",
|
|
@@ -170,10 +173,6 @@ const definition = {
|
|
|
170
173
|
type: "string",
|
|
171
174
|
description: "Facilitator working directory",
|
|
172
175
|
},
|
|
173
|
-
"facilitator-profile": {
|
|
174
|
-
type: "string",
|
|
175
|
-
description: "Facilitator profile name",
|
|
176
|
-
},
|
|
177
176
|
"agent-profiles": {
|
|
178
177
|
type: "string",
|
|
179
178
|
description:
|
|
@@ -185,6 +184,56 @@ const definition = {
|
|
|
185
184
|
},
|
|
186
185
|
},
|
|
187
186
|
},
|
|
187
|
+
{
|
|
188
|
+
name: "discuss",
|
|
189
|
+
args: "",
|
|
190
|
+
description:
|
|
191
|
+
"Run an async, suspendable discussion — Chair + N participants + bridge callback",
|
|
192
|
+
options: {
|
|
193
|
+
"task-file": {
|
|
194
|
+
type: "string",
|
|
195
|
+
description: "Path to a markdown task file",
|
|
196
|
+
},
|
|
197
|
+
"task-text": {
|
|
198
|
+
type: "string",
|
|
199
|
+
description: "Inline task text (alternative to --task-file)",
|
|
200
|
+
},
|
|
201
|
+
"task-amend": {
|
|
202
|
+
type: "string",
|
|
203
|
+
description: "Additional text appended to the task",
|
|
204
|
+
},
|
|
205
|
+
"agent-model": {
|
|
206
|
+
type: "string",
|
|
207
|
+
description: "Claude model for agents (default: claude-opus-4-7[1m])",
|
|
208
|
+
},
|
|
209
|
+
...LEAD_OPTIONS,
|
|
210
|
+
"max-turns": {
|
|
211
|
+
type: "string",
|
|
212
|
+
description: "Max agentic turns (default: 40, 0 = unlimited)",
|
|
213
|
+
},
|
|
214
|
+
output: {
|
|
215
|
+
type: "string",
|
|
216
|
+
description: "Write the NDJSON trace to a file",
|
|
217
|
+
},
|
|
218
|
+
"agent-profiles": {
|
|
219
|
+
type: "string",
|
|
220
|
+
description: "Comma-separated participant profile names (optional)",
|
|
221
|
+
},
|
|
222
|
+
"agent-cwd": {
|
|
223
|
+
type: "string",
|
|
224
|
+
description: "Working directory shared by participants (default: .)",
|
|
225
|
+
},
|
|
226
|
+
"discussion-id": {
|
|
227
|
+
type: "string",
|
|
228
|
+
description:
|
|
229
|
+
"Stable id for the threaded conversation; carried through traces for linking",
|
|
230
|
+
},
|
|
231
|
+
"resume-context": {
|
|
232
|
+
type: "string",
|
|
233
|
+
description: "JSON-serialized prior state for a resumed run",
|
|
234
|
+
},
|
|
235
|
+
},
|
|
236
|
+
},
|
|
188
237
|
{
|
|
189
238
|
name: "output",
|
|
190
239
|
args: "",
|
|
@@ -197,6 +246,35 @@ const definition = {
|
|
|
197
246
|
description:
|
|
198
247
|
"Stream readable text to stdout while saving raw NDJSON to a file",
|
|
199
248
|
},
|
|
249
|
+
{
|
|
250
|
+
name: "callback",
|
|
251
|
+
args: "",
|
|
252
|
+
description:
|
|
253
|
+
"Extract the terminal summary from an NDJSON trace and POST it to a callback URL",
|
|
254
|
+
options: {
|
|
255
|
+
"trace-file": {
|
|
256
|
+
type: "string",
|
|
257
|
+
description: "Path to the NDJSON trace file",
|
|
258
|
+
},
|
|
259
|
+
"callback-url": {
|
|
260
|
+
type: "string",
|
|
261
|
+
description: "URL to POST the summary to",
|
|
262
|
+
},
|
|
263
|
+
"correlation-id": {
|
|
264
|
+
type: "string",
|
|
265
|
+
description: "Correlation ID to include in the payload",
|
|
266
|
+
},
|
|
267
|
+
"run-url": {
|
|
268
|
+
type: "string",
|
|
269
|
+
description: "GitHub Actions run URL (optional)",
|
|
270
|
+
},
|
|
271
|
+
"discussion-id": {
|
|
272
|
+
type: "string",
|
|
273
|
+
description:
|
|
274
|
+
"Discussion id (fallback when the trace lacks a meta event)",
|
|
275
|
+
},
|
|
276
|
+
},
|
|
277
|
+
},
|
|
200
278
|
],
|
|
201
279
|
globalOptions: {
|
|
202
280
|
format: { type: "string", description: "Output format (json|text)" },
|
|
@@ -206,8 +284,9 @@ const definition = {
|
|
|
206
284
|
},
|
|
207
285
|
examples: [
|
|
208
286
|
"fit-eval run --task-file=task.md --output=trace.ndjson",
|
|
209
|
-
"fit-eval supervise --task-file=task.md --
|
|
210
|
-
'fit-eval facilitate --task-file=task.md --
|
|
287
|
+
"fit-eval supervise --task-file=task.md --lead-profile=judge --agent-profile=coder --output=trace.ndjson",
|
|
288
|
+
'fit-eval facilitate --task-file=task.md --lead-profile=lead --agent-profiles="security-engineer,technical-writer" --output=trace.ndjson',
|
|
289
|
+
'fit-eval discuss --task-file=task.md --lead-profile=release-engineer --agent-profiles="staff-engineer,security-engineer" --discussion-id=GD_kw...',
|
|
211
290
|
"fit-eval output --format=text < trace.ndjson",
|
|
212
291
|
],
|
|
213
292
|
documentation: [
|
|
@@ -233,7 +312,7 @@ const definition = {
|
|
|
233
312
|
title: "Agent Teams",
|
|
234
313
|
url: "https://www.forwardimpact.team/docs/products/agent-teams/index.md",
|
|
235
314
|
description:
|
|
236
|
-
"How to author the
|
|
315
|
+
"How to author the profiles consumed by --agent-profile, --lead-profile, and --agent-profiles.",
|
|
237
316
|
},
|
|
238
317
|
],
|
|
239
318
|
};
|
|
@@ -247,6 +326,8 @@ const COMMANDS = {
|
|
|
247
326
|
run: runRunCommand,
|
|
248
327
|
supervise: runSuperviseCommand,
|
|
249
328
|
facilitate: runFacilitateCommand,
|
|
329
|
+
discuss: runDiscussCommand,
|
|
330
|
+
callback: runCallbackCommand,
|
|
250
331
|
};
|
|
251
332
|
|
|
252
333
|
async function main() {
|
package/bin/fit-trace.js
CHANGED
|
@@ -26,6 +26,7 @@ import {
|
|
|
26
26
|
runSplitCommand,
|
|
27
27
|
} from "../src/commands/trace.js";
|
|
28
28
|
import { runAssertCommand } from "../src/commands/assert.js";
|
|
29
|
+
import { runByDiscussionCommand } from "../src/commands/by-discussion.js";
|
|
29
30
|
|
|
30
31
|
// `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
|
|
31
32
|
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
@@ -160,6 +161,18 @@ const definition = {
|
|
|
160
161
|
args: "<file> <index>",
|
|
161
162
|
description: "Single turn by index",
|
|
162
163
|
},
|
|
164
|
+
{
|
|
165
|
+
name: "by-discussion",
|
|
166
|
+
args: "<discussion-id> [trace-dir]",
|
|
167
|
+
description:
|
|
168
|
+
"List trace files whose meta header carries the given discussion_id, ordered by first-event timestamp",
|
|
169
|
+
options: {
|
|
170
|
+
"trace-dir": {
|
|
171
|
+
type: "string",
|
|
172
|
+
description: "Directory to scan (default: traces)",
|
|
173
|
+
},
|
|
174
|
+
},
|
|
175
|
+
},
|
|
163
176
|
{
|
|
164
177
|
name: "filter",
|
|
165
178
|
args: "<file>",
|
|
@@ -307,6 +320,7 @@ const COMMANDS = {
|
|
|
307
320
|
filter: runFilterCommand,
|
|
308
321
|
split: runSplitCommand,
|
|
309
322
|
assert: runAssertCommand,
|
|
323
|
+
"by-discussion": runByDiscussionCommand,
|
|
310
324
|
};
|
|
311
325
|
|
|
312
326
|
async function main() {
|
package/package.json
CHANGED
|
@@ -40,7 +40,7 @@ function parseRunOptions(values) {
|
|
|
40
40
|
runs,
|
|
41
41
|
output: resolve(output),
|
|
42
42
|
agentModel: values["agent-model"] ?? "claude-sonnet-4-6",
|
|
43
|
-
supervisorModel: values["
|
|
43
|
+
supervisorModel: values["lead-model"] ?? "claude-opus-4-7",
|
|
44
44
|
judgeModel: values["judge-model"] ?? "claude-opus-4-7",
|
|
45
45
|
profiles: {
|
|
46
46
|
agent: values["agent-profile"] ?? null,
|
|
@@ -0,0 +1,84 @@
|
|
|
1
|
+
import { readdirSync, statSync, openSync, readSync, closeSync } from "node:fs";
|
|
2
|
+
import { join } from "node:path";
|
|
3
|
+
|
|
4
|
+
/**
 * Return the first line of a file, without its terminating `\n`.
 *
 * At most the first 64 KiB of the file are read. When no newline occurs
 * inside that window, everything that was read is returned as-is.
 *
 * @param {string} path - File to read.
 * @returns {string} The bytes preceding the first `\n`, decoded as UTF-8.
 * @throws {Error} Whatever `openSync` / `readSync` raise for an unreadable path.
 */
function readFirstLine(path) {
  const fd = openSync(path, "r");
  try {
    const buf = Buffer.alloc(65536);
    const bytesRead = readSync(fd, buf, 0, buf.length, 0);
    // `subarray` replaces the deprecated `Buffer#slice`; both return a view
    // over the same memory, so nothing is copied here.
    const window = buf.subarray(0, bytesRead);
    // Locate the newline on raw bytes so only the first line gets decoded.
    // 0x0a can never appear inside a multi-byte UTF-8 sequence.
    const newlineAt = window.indexOf(0x0a);
    const line = newlineAt === -1 ? window : window.subarray(0, newlineAt);
    return line.toString("utf8");
  } finally {
    closeSync(fd);
  }
}
|
|
23
|
+
|
|
24
|
+
/**
 * Find the `.ndjson` trace files in a directory that belong to one
 * discussion.
 *
 * Only the first line of each file is inspected. It must parse as JSON and
 * be a `meta` event — either directly or wrapped under an `event` key —
 * whose `discussion_id` equals `discussionId`. Files that cannot be read,
 * parsed or stat'ed, and files that start with anything else, are skipped
 * without error. A directory that cannot be listed yields an empty array.
 *
 * @param {string} dir - Directory to scan (not recursive).
 * @param {string} discussionId - Discussion id to match exactly.
 * @returns {Array<{path: string, mtimeMs: number}>} Matches sorted by file
 *   modification time, oldest first.
 */
export function findTracesByDiscussion(dir, discussionId) {
  let entries;
  try {
    entries = readdirSync(dir);
  } catch {
    return [];
  }

  const matches = [];
  for (const entry of entries) {
    if (!entry.endsWith(".ndjson")) continue;
    const path = join(dir, entry);

    let event;
    try {
      const parsed = JSON.parse(readFirstLine(path));
      // A first line such as `null` is valid JSON; optional chaining keeps
      // it on the "skip" path instead of raising a TypeError that would
      // abort the whole scan.
      event = parsed?.event ?? parsed;
    } catch {
      continue;
    }
    if (event?.type !== "meta") continue;
    if (event.discussion_id !== discussionId) continue;

    let mtimeMs;
    try {
      mtimeMs = statSync(path).mtimeMs;
    } catch {
      // The file disappeared between the read and the stat: treat it like
      // any other unreadable file.
      continue;
    }
    matches.push({ path, mtimeMs });
  }

  matches.sort((a, b) => a.mtimeMs - b.mtimeMs);
  return matches;
}
|
|
66
|
+
|
|
67
|
+
/**
 * `fit-trace by-discussion <discussion-id> [trace-dir]`.
 *
 * Writes the path of every matching trace file to stdout, one per line, in
 * the order returned by `findTracesByDiscussion` (file mtime ascending).
 * The directory comes from the second positional argument, then from
 * `--trace-dir`, and finally defaults to `traces`.
 *
 * @param {object} values - Parsed option values; only `trace-dir` is read.
 * @param {string[]} args - Positional arguments: discussion id, then an
 *   optional trace directory.
 * @throws {Error} When the discussion id is missing.
 */
export async function runByDiscussionCommand(values, args) {
  const [id, dirArg] = args;
  if (!id) {
    throw new Error("<discussion-id> is required");
  }

  const traceDir = dirArg ?? values["trace-dir"] ?? "traces";
  const found = findTracesByDiscussion(traceDir, id);
  found.forEach(({ path }) => {
    process.stdout.write(`${path}\n`);
  });
}
|
|
@@ -0,0 +1,104 @@
|
|
|
1
|
+
import { readFileSync } from "node:fs";
|
|
2
|
+
|
|
3
|
+
/**
 * Scan an NDJSON trace and return the last orchestrator `summary` event in
 * a normalized shape.
 *
 * Only records whose `source` is `"orchestrator"` are considered. Blank
 * lines, lines that are not valid JSON, and lines that parse to a
 * non-object value (for example `null`) are skipped. When several summary
 * events are present the last one wins. If that summary carries no
 * `discussion_id`, the first non-empty `discussion_id` seen on a `meta`
 * event is used instead.
 *
 * The verdict is passed through verbatim; a summary without a verdict is
 * reported as `"failed"`.
 *
 * @param {string} traceFile - Path to the NDJSON trace.
 * @returns {{verdict: string, summary: string, replies: object[], trigger?: object, discussionId?: string} | null}
 *   The normalized summary, or `null` when the trace has no orchestrator
 *   summary event.
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: NDJSON scan with malformed-line tolerance + meta/summary dual extraction
function readTraceSummary(traceFile) {
  let summary = null;
  let metaDiscussionId = null;

  for (const line of readFileSync(traceFile, "utf8").split("\n")) {
    if (!line.trim()) continue;

    let record;
    try {
      record = JSON.parse(line);
    } catch {
      continue;
    }
    // `JSON.parse` happily returns `null` for the line "null"; optional
    // chaining keeps such lines on the skip path instead of throwing.
    if (record?.source !== "orchestrator") continue;

    const event = record.event;
    if (event?.type === "meta" && !metaDiscussionId) {
      metaDiscussionId = event.discussion_id ?? null;
    }
    if (event?.type === "summary") {
      summary = {
        verdict: event.verdict ?? "failed",
        summary: event.summary ?? "",
        replies: Array.isArray(event.replies) ? event.replies : [],
        ...(event.trigger && { trigger: event.trigger }),
        ...(event.discussion_id && { discussionId: event.discussion_id }),
      };
    }
  }

  if (summary && !summary.discussionId && metaDiscussionId) {
    summary.discussionId = metaDiscussionId;
  }
  return summary;
}
|
|
51
|
+
|
|
52
|
+
/**
 * Callback command: read an NDJSON trace, take its terminal orchestrator
 * summary, and POST it as JSON to `--callback-url`. The original note names
 * `kata-dispatch.yml` as the caller that reports a run's conclusion back to
 * the bridge which dispatched it.
 *
 * When the trace has no summary, a `"failed"` verdict with a fixed
 * explanatory message is sent instead. The discussion id found in the trace
 * takes precedence over `--discussion-id`.
 *
 * Request body (same shape for every mode):
 *
 * ```
 * {
 *   correlation_id, verdict, summary, run_url,
 *   replies: [], discussion_id?, trigger?
 * }
 * ```
 *
 * @param {object} values - Parsed option values from cli.parse()
 * @param {string[]} _args - Positional arguments (unused)
 * @throws {Error} When `--trace-file` or `--callback-url` is missing, or
 *   when the endpoint answers with a non-2xx status.
 */
export async function runCallbackCommand(values, _args) {
  const traceFile = values["trace-file"];
  const callbackUrl = values["callback-url"];

  if (!traceFile) {
    throw new Error("--trace-file is required");
  }
  if (!callbackUrl) {
    throw new Error("--callback-url is required");
  }

  let found = readTraceSummary(traceFile);
  if (found == null) {
    found = {
      verdict: "failed",
      summary: "Run ended without producing a summary.",
      replies: [],
    };
  }

  // Keys are added in a fixed order so the serialized body stays stable.
  const payload = {
    correlation_id: values["correlation-id"],
    verdict: found.verdict,
    summary: found.summary,
    run_url: values["run-url"] ?? "",
    replies: found.replies,
  };
  const discussionId = found.discussionId ?? values["discussion-id"] ?? null;
  if (discussionId) {
    payload.discussion_id = discussionId;
  }
  if (found.trigger) {
    payload.trigger = found.trigger;
  }

  const response = await fetch(callbackUrl, {
    method: "POST",
    headers: { "Content-Type": "application/json" },
    body: JSON.stringify(payload),
  });
  if (response.ok) return;
  throw new Error(`Callback POST failed: ${response.status}`);
}
|
|
@@ -0,0 +1,116 @@
|
|
|
1
|
+
import { readFileSync, createWriteStream } from "node:fs";
|
|
2
|
+
import { resolve } from "node:path";
|
|
3
|
+
import { createDiscusser } from "../discusser.js";
|
|
4
|
+
import { createRedactor } from "../redaction.js";
|
|
5
|
+
import { createTeeWriter } from "../tee-writer.js";
|
|
6
|
+
|
|
7
|
+
/**
 * Turn a comma-separated list of profile names into participant configs.
 *
 * Each name is trimmed and used as the participant's `name`, `role` and
 * `agentProfile`. Empty entries — produced by a trailing comma, a doubled
 * comma or a whitespace-only value — are dropped so no participant ends up
 * with an empty name and profile.
 *
 * @param {string | undefined} raw - Raw `--agent-profiles` value.
 * @param {string} cwd - Working directory shared by all participants.
 * @param {number} maxTurns - Turn budget copied onto every participant.
 * @returns {Array<{name: string, role: string, cwd: string, agentProfile: string, maxTurns: number}>}
 *   One config per non-empty name; empty when `raw` is missing or blank.
 */
function parseAgentProfiles(raw, cwd, maxTurns) {
  if (!raw) return [];
  return raw
    .split(",")
    .map((entry) => entry.trim())
    .filter((name) => name !== "")
    .map((name) => ({ name, role: name, cwd, agentProfile: name, maxTurns }));
}
|
|
14
|
+
|
|
15
|
+
/**
 * Parse and validate the options of the `discuss` command. Exported so
 * tests can check the defaults.
 *
 * Exactly one of `--task-file` / `--task-text` must be given; a task file is
 * read synchronously. `--max-turns` defaults to 40, where 0 means unlimited,
 * and must be a non-negative integer. `--resume-context`, when present, must
 * be valid JSON.
 *
 * @param {object} values - Parsed option values
 * @returns {object} Normalized options: `taskContent`, `taskAmend`,
 *   `agentConfigs`, `leadProfile`, `leadModel`, `agentModel`, `maxTurns`,
 *   `outputPath`, `discussionId`, `resumeContext`.
 * @throws {Error} On a missing or conflicting task source, an invalid
 *   `--max-turns`, or a `--resume-context` that is not valid JSON.
 */
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
export function parseDiscussOptions(values) {
  const taskFile = values["task-file"];
  const taskText = values["task-text"];
  if (taskFile && taskText)
    throw new Error("--task-file and --task-text are mutually exclusive");
  if (!taskFile && !taskText)
    throw new Error("--task-file or --task-text is required");

  const taskAmend = values["task-amend"] ?? undefined;
  const taskContent = taskFile ? readFileSync(taskFile, "utf8") : taskText;

  const profilesRaw = values["agent-profiles"];
  const agentCwd = resolve(values["agent-cwd"] ?? ".");

  const maxTurnsRaw = values["max-turns"] ?? "40";
  const maxTurns = Number.parseInt(maxTurnsRaw, 10);
  // Without this check a typo such as `--max-turns=abc` would flow on as
  // NaN and silently change how the turn budget is enforced downstream.
  if (Number.isNaN(maxTurns) || maxTurns < 0) {
    throw new Error(
      `--max-turns must be a non-negative integer, got "${maxTurnsRaw}"`,
    );
  }

  const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd, maxTurns);

  const resumeContextRaw = values["resume-context"];
  let resumeContext = null;
  if (resumeContextRaw) {
    try {
      resumeContext = JSON.parse(resumeContextRaw);
    } catch (err) {
      throw new Error(`--resume-context is not valid JSON: ${err.message}`);
    }
  }

  return {
    taskContent,
    taskAmend,
    agentConfigs,
    leadProfile: values["lead-profile"] ?? "release-engineer",
    leadModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
    agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
    maxTurns,
    outputPath: values.output,
    discussionId: values["discussion-id"] ?? null,
    resumeContext,
  };
}
|
|
64
|
+
|
|
65
|
+
/**
 * Discuss command: parse the options, build a discusser and run it on the
 * task, then exit the process with 0 when the result reports success and 1
 * otherwise.
 *
 * With `--output`, the trace goes through a tee writer that targets the
 * file and stdout; without it, output goes straight to stdout. The lead
 * profile is also exported as `LIBEVAL_AGENT_PROFILE` before the agent SDK
 * is loaded (presumably read by the SDK or its hooks — confirm against the
 * consumer of that variable).
 *
 * @param {object} values - Parsed option values
 * @param {string[]} _args - Positional arguments (unused)
 */
export async function runDiscussCommand(values, _args) {
  const opts = parseDiscussOptions(values);
  const redactor = createRedactor();

  let fileStream = null;
  let output = process.stdout;
  if (opts.outputPath) {
    fileStream = createWriteStream(opts.outputPath);
    output = createTeeWriter({
      fileStream,
      textStream: process.stdout,
      mode: "supervised",
    });
  }

  if (opts.leadProfile) {
    process.env.LIBEVAL_AGENT_PROFILE = opts.leadProfile;
  }

  const sdk = await import("@anthropic-ai/claude-agent-sdk");
  const discusser = createDiscusser({
    leadProfile: opts.leadProfile,
    leadModel: opts.leadModel,
    agentModel: opts.agentModel,
    agentConfigs: opts.agentConfigs,
    discussionId: opts.discussionId,
    resumeContext: opts.resumeContext,
    query: sdk.query,
    output,
    maxTurns: opts.maxTurns,
    taskAmend: opts.taskAmend,
    redactor,
  });

  const outcome = await discusser.run(opts.taskContent);

  if (fileStream) {
    // End the tee writer first, then the underlying file, so buffered trace
    // lines are written before the process exits.
    await new Promise((done) => output.end(done));
    await new Promise((done) => fileStream.end(done));
  }

  const exitCode = outcome.success ? 0 : 1;
  process.exit(exitCode);
}
|
|
@@ -10,19 +10,21 @@ import { createTeeWriter } from "../tee-writer.js";
|
|
|
10
10
|
* @param {string} cwd - Shared working directory for all agents
|
|
11
11
|
* @returns {Array<{name: string, role: string, cwd: string, agentProfile: string}>}
|
|
12
12
|
*/
|
|
13
|
-
function parseAgentProfiles(raw, cwd) {
|
|
13
|
+
function parseAgentProfiles(raw, cwd, maxTurns) {
|
|
14
14
|
return raw.split(",").map((entry) => {
|
|
15
15
|
const name = entry.trim();
|
|
16
|
-
return { name, role: name, cwd, agentProfile: name };
|
|
16
|
+
return { name, role: name, cwd, agentProfile: name, maxTurns };
|
|
17
17
|
});
|
|
18
18
|
}
|
|
19
19
|
|
|
20
20
|
/**
|
|
21
|
-
* Parse and validate facilitate command options.
|
|
21
|
+
* Parse and validate facilitate command options. Exported for test
|
|
22
|
+
* coverage of the `--max-turns` → per-agent threading contract; not part
|
|
23
|
+
* of the package's public API.
|
|
22
24
|
* @param {object} values - Parsed option values
|
|
23
25
|
* @returns {object} Parsed options
|
|
24
26
|
*/
|
|
25
|
-
function parseFacilitateOptions(values) {
|
|
27
|
+
export function parseFacilitateOptions(values) {
|
|
26
28
|
const taskFile = values["task-file"];
|
|
27
29
|
const taskText = values["task-text"];
|
|
28
30
|
if (taskFile && taskText)
|
|
@@ -36,9 +38,15 @@ function parseFacilitateOptions(values) {
|
|
|
36
38
|
const profilesRaw = values["agent-profiles"];
|
|
37
39
|
if (!profilesRaw) throw new Error("--agent-profiles is required");
|
|
38
40
|
const agentCwd = resolve(values["agent-cwd"] ?? ".");
|
|
39
|
-
const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd);
|
|
40
41
|
|
|
41
42
|
const maxTurnsRaw = values["max-turns"] ?? "20";
|
|
43
|
+
const maxTurns = maxTurnsRaw === "0" ? 0 : parseInt(maxTurnsRaw, 10);
|
|
44
|
+
|
|
45
|
+
// Thread --max-turns into each participant: without this, every facilitated
|
|
46
|
+
// agent silently falls back to the 50-turn default in facilitator.js even
|
|
47
|
+
// when the caller raises the budget. Observed in run 26078312414 where
|
|
48
|
+
// staff-engineer terminated at 51 turns despite --max-turns=200.
|
|
49
|
+
const agentConfigs = parseAgentProfiles(profilesRaw, agentCwd, maxTurns);
|
|
42
50
|
|
|
43
51
|
return {
|
|
44
52
|
taskContent,
|
|
@@ -46,10 +54,10 @@ function parseFacilitateOptions(values) {
|
|
|
46
54
|
agentConfigs,
|
|
47
55
|
facilitatorCwd: resolve(values["facilitator-cwd"] ?? "."),
|
|
48
56
|
agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
|
|
49
|
-
facilitatorModel: values["
|
|
50
|
-
maxTurns
|
|
57
|
+
facilitatorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
|
|
58
|
+
maxTurns,
|
|
51
59
|
outputPath: values.output,
|
|
52
|
-
facilitatorProfile: values["
|
|
60
|
+
facilitatorProfile: values["lead-profile"] ?? undefined,
|
|
53
61
|
};
|
|
54
62
|
}
|
|
55
63
|
|
|
@@ -12,7 +12,7 @@ import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
|
12
12
|
* @returns {object}
|
|
13
13
|
*/
|
|
14
14
|
// biome-ignore lint/complexity/noExcessiveCognitiveComplexity: CLI option validation
|
|
15
|
-
function parseSuperviseOptions(values) {
|
|
15
|
+
export function parseSuperviseOptions(values) {
|
|
16
16
|
const taskFile = values["task-file"];
|
|
17
17
|
const taskText = values["task-text"];
|
|
18
18
|
if (taskFile && taskText)
|
|
@@ -33,13 +33,13 @@ function parseSuperviseOptions(values) {
|
|
|
33
33
|
values["agent-cwd"] ?? mkdtempSync(join(tmpdir(), "fit-eval-agent-")),
|
|
34
34
|
),
|
|
35
35
|
agentModel: values["agent-model"] ?? "claude-opus-4-7[1m]",
|
|
36
|
-
supervisorModel: values["
|
|
36
|
+
supervisorModel: values["lead-model"] ?? "claude-opus-4-7[1m]",
|
|
37
37
|
maxTurns: (() => {
|
|
38
|
-
const raw = values["max-turns"] ?? "
|
|
38
|
+
const raw = values["max-turns"] ?? "200";
|
|
39
39
|
return raw === "0" ? 0 : parseInt(raw, 10);
|
|
40
40
|
})(),
|
|
41
41
|
outputPath: values.output,
|
|
42
|
-
supervisorProfile: values["
|
|
42
|
+
supervisorProfile: values["lead-profile"] ?? undefined,
|
|
43
43
|
agentProfile: values["agent-profile"] ?? undefined,
|
|
44
44
|
allowedTools: (
|
|
45
45
|
values["allowed-tools"] ??
|