@forwardimpact/libeval 0.1.27 → 0.1.30
This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.
- package/README.md +6 -1
- package/bin/fit-eval.js +24 -10
- package/bin/fit-trace.js +14 -10
- package/package.json +13 -11
- package/src/agent-runner.js +1 -0
- package/src/commands/run.js +17 -0
- package/src/commands/supervise.js +16 -0
- package/src/facilitator.js +14 -6
- package/src/message-bus.js +1 -0
- package/src/orchestration-toolkit.js +10 -5
- package/src/orchestrator-helpers.js +1 -0
- package/src/render/tool-hints.js +21 -53
- package/src/render/turn-renderer.js +12 -7
- package/src/sequence-counter.js +4 -0
- package/src/supervisor.js +31 -13
- package/src/tee-writer.js +7 -1
- package/src/trace-collector.js +19 -2
package/README.md
CHANGED
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
# libeval
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
<!-- BEGIN:description — Do not edit. Generated from package.json. -->
|
|
4
|
+
|
|
5
|
+
Agent evaluation framework — prove whether agent changes improved outcomes with
|
|
6
|
+
reproducible evidence.
|
|
7
|
+
|
|
8
|
+
<!-- END:description -->
|
|
4
9
|
|
|
5
10
|
## Getting Started
|
|
6
11
|
|
package/bin/fit-eval.js
CHANGED
|
@@ -10,9 +10,13 @@ import { runRunCommand } from "../src/commands/run.js";
|
|
|
10
10
|
import { runSuperviseCommand } from "../src/commands/supervise.js";
|
|
11
11
|
import { runFacilitateCommand } from "../src/commands/facilitate.js";
|
|
12
12
|
|
|
13
|
-
|
|
14
|
-
|
|
15
|
-
)
|
|
13
|
+
// `bun build --compile` injects FIT_EVAL_VERSION via --define, eliminating
|
|
14
|
+
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
15
|
+
// the bunfs virtual mount). Source execution falls through to package.json.
|
|
16
|
+
const VERSION =
|
|
17
|
+
process.env.FIT_EVAL_VERSION ||
|
|
18
|
+
JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
|
|
19
|
+
.version;
|
|
16
20
|
|
|
17
21
|
const definition = {
|
|
18
22
|
name: "fit-eval",
|
|
@@ -55,6 +59,11 @@ const definition = {
|
|
|
55
59
|
type: "string",
|
|
56
60
|
description: "Comma-separated tool allowlist",
|
|
57
61
|
},
|
|
62
|
+
"mcp-server": {
|
|
63
|
+
type: "string",
|
|
64
|
+
description:
|
|
65
|
+
"Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
|
|
66
|
+
},
|
|
58
67
|
},
|
|
59
68
|
},
|
|
60
69
|
{
|
|
@@ -102,6 +111,11 @@ const definition = {
|
|
|
102
111
|
type: "string",
|
|
103
112
|
description: "Supervisor tool allowlist",
|
|
104
113
|
},
|
|
114
|
+
"mcp-server": {
|
|
115
|
+
type: "string",
|
|
116
|
+
description:
|
|
117
|
+
"Connect to the MCP service (e.g. --mcp-server=guide); adds mcp__<name>__* to allowed tools",
|
|
118
|
+
},
|
|
105
119
|
},
|
|
106
120
|
},
|
|
107
121
|
{
|
|
@@ -177,20 +191,20 @@ const definition = {
|
|
|
177
191
|
],
|
|
178
192
|
documentation: [
|
|
179
193
|
{
|
|
180
|
-
title: "
|
|
181
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
194
|
+
title: "Run an Eval",
|
|
195
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-eval/index.md",
|
|
182
196
|
description:
|
|
183
197
|
"Author a judge profile, run an eval locally, wire it into CI, and inspect the resulting trace.",
|
|
184
198
|
},
|
|
185
199
|
{
|
|
186
|
-
title: "Agent
|
|
187
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
200
|
+
title: "Prove Agent Changes",
|
|
201
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/index.md",
|
|
188
202
|
description:
|
|
189
|
-
"
|
|
203
|
+
"End-to-end workflow from dataset generation through evaluation to trace analysis, including multi-agent collaboration sessions.",
|
|
190
204
|
},
|
|
191
205
|
{
|
|
192
|
-
title: "
|
|
193
|
-
url: "https://www.forwardimpact.team/docs/libraries/trace-analysis/index.md",
|
|
206
|
+
title: "Analyze Traces",
|
|
207
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/trace-analysis/index.md",
|
|
194
208
|
description:
|
|
195
209
|
"Read the NDJSON traces produced by `fit-eval` with `fit-trace` — grounded-theory method and worked examples.",
|
|
196
210
|
},
|
package/bin/fit-trace.js
CHANGED
|
@@ -26,9 +26,13 @@ import {
|
|
|
26
26
|
runSplitCommand,
|
|
27
27
|
} from "../src/commands/trace.js";
|
|
28
28
|
|
|
29
|
-
|
|
30
|
-
|
|
31
|
-
)
|
|
29
|
+
// `bun build --compile` injects FIT_TRACE_VERSION via --define, eliminating
|
|
30
|
+
// the readFileSync branch in the compiled binary (which would ENOENT against
|
|
31
|
+
// the bunfs virtual mount). Source execution falls through to package.json.
|
|
32
|
+
const VERSION =
|
|
33
|
+
process.env.FIT_TRACE_VERSION ||
|
|
34
|
+
JSON.parse(readFileSync(new URL("../package.json", import.meta.url), "utf8"))
|
|
35
|
+
.version;
|
|
32
36
|
|
|
33
37
|
const definition = {
|
|
34
38
|
name: "fit-trace",
|
|
@@ -214,22 +218,22 @@ const definition = {
|
|
|
214
218
|
],
|
|
215
219
|
documentation: [
|
|
216
220
|
{
|
|
217
|
-
title: "
|
|
218
|
-
url: "https://www.forwardimpact.team/docs/libraries/trace-analysis/index.md",
|
|
221
|
+
title: "Analyze Traces",
|
|
222
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/trace-analysis/index.md",
|
|
219
223
|
description:
|
|
220
224
|
"The full method walkthrough with worked examples (an eval that failed, a multi-agent session that stalled).",
|
|
221
225
|
},
|
|
222
226
|
{
|
|
223
|
-
title: "
|
|
224
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
227
|
+
title: "Run an Eval",
|
|
228
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-eval/index.md",
|
|
225
229
|
description:
|
|
226
230
|
"How `fit-eval supervise` produces the traces this skill analyzes.",
|
|
227
231
|
},
|
|
228
232
|
{
|
|
229
|
-
title: "Agent
|
|
230
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
233
|
+
title: "Prove Agent Changes",
|
|
234
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/index.md",
|
|
231
235
|
description:
|
|
232
|
-
"
|
|
236
|
+
"End-to-end workflow including multi-agent collaboration; `split` is the bridge into per-source trace files.",
|
|
233
237
|
},
|
|
234
238
|
],
|
|
235
239
|
};
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.
|
|
4
|
-
"description": "Agent evaluation
|
|
3
|
+
"version": "0.1.30",
|
|
4
|
+
"description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"eval",
|
|
7
7
|
"agent",
|
|
@@ -17,14 +17,16 @@
|
|
|
17
17
|
},
|
|
18
18
|
"license": "Apache-2.0",
|
|
19
19
|
"author": "D. Olsson <hi@senzilla.io>",
|
|
20
|
-
"
|
|
21
|
-
|
|
22
|
-
|
|
23
|
-
"
|
|
24
|
-
"
|
|
25
|
-
"
|
|
26
|
-
|
|
27
|
-
|
|
20
|
+
"jobs": [
|
|
21
|
+
{
|
|
22
|
+
"user": "Platform Builders",
|
|
23
|
+
"goal": "Prove Agent Changes",
|
|
24
|
+
"trigger": "An eval passes locally but fails in CI and the only output is 'assertion failed.'",
|
|
25
|
+
"bigHire": "prove whether agent changes improved outcomes with reproducible evidence.",
|
|
26
|
+
"littleHire": "run an eval and get a trace that shows exactly what the agent did.",
|
|
27
|
+
"competesWith": "manual before/after comparison; trusting gut feeling over evidence; skipping evaluation entirely"
|
|
28
|
+
}
|
|
29
|
+
],
|
|
28
30
|
"type": "module",
|
|
29
31
|
"main": "./src/index.js",
|
|
30
32
|
"exports": {
|
|
@@ -49,7 +51,7 @@
|
|
|
49
51
|
"@forwardimpact/libcli": "^0.1.0",
|
|
50
52
|
"@forwardimpact/libconfig": "^0.1.0",
|
|
51
53
|
"@forwardimpact/libtelemetry": "^0.1.22",
|
|
52
|
-
"zod": "^4.4.
|
|
54
|
+
"zod": "^4.4.3"
|
|
53
55
|
},
|
|
54
56
|
"devDependencies": {
|
|
55
57
|
"@forwardimpact/libharness": "^0.1.14"
|
package/src/agent-runner.js
CHANGED
package/src/commands/run.js
CHANGED
|
@@ -5,6 +5,7 @@ import { createAgentRunner } from "../agent-runner.js";
|
|
|
5
5
|
import { composeProfilePrompt } from "../profile-prompt.js";
|
|
6
6
|
import { createTeeWriter } from "../tee-writer.js";
|
|
7
7
|
import { SequenceCounter } from "../sequence-counter.js";
|
|
8
|
+
import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
8
9
|
|
|
9
10
|
/**
|
|
10
11
|
* Parse and validate run command options from parsed values.
|
|
@@ -35,6 +36,7 @@ function parseRunOptions(values) {
|
|
|
35
36
|
values["allowed-tools"] ??
|
|
36
37
|
"Bash,Read,Glob,Grep,Write,Edit,Agent,TodoWrite"
|
|
37
38
|
).split(","),
|
|
39
|
+
mcpServer: values["mcp-server"] ?? undefined,
|
|
38
40
|
};
|
|
39
41
|
}
|
|
40
42
|
|
|
@@ -56,6 +58,7 @@ export async function runRunCommand(values, _args) {
|
|
|
56
58
|
outputPath,
|
|
57
59
|
agentProfile,
|
|
58
60
|
allowedTools,
|
|
61
|
+
mcpServer,
|
|
59
62
|
} = parseRunOptions(values);
|
|
60
63
|
|
|
61
64
|
// When --output is specified, stream text to stdout while writing NDJSON to file.
|
|
@@ -78,6 +81,19 @@ export async function runRunCommand(values, _args) {
|
|
|
78
81
|
);
|
|
79
82
|
};
|
|
80
83
|
|
|
84
|
+
let mcpServers = null;
|
|
85
|
+
if (mcpServer) {
|
|
86
|
+
const mcpConfig = await createServiceConfig("mcp");
|
|
87
|
+
mcpServers = {
|
|
88
|
+
[mcpServer]: {
|
|
89
|
+
type: "http",
|
|
90
|
+
url: mcpConfig.url,
|
|
91
|
+
headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
|
|
92
|
+
},
|
|
93
|
+
};
|
|
94
|
+
allowedTools.push(`mcp__${mcpServer}__*`);
|
|
95
|
+
}
|
|
96
|
+
|
|
81
97
|
if (agentProfile) {
|
|
82
98
|
process.env.LIBEVAL_AGENT_PROFILE = agentProfile;
|
|
83
99
|
}
|
|
@@ -100,6 +116,7 @@ export async function runRunCommand(values, _args) {
|
|
|
100
116
|
settingSources: ["project"],
|
|
101
117
|
systemPrompt,
|
|
102
118
|
taskAmend,
|
|
119
|
+
mcpServers,
|
|
103
120
|
});
|
|
104
121
|
|
|
105
122
|
const result = await runner.run(taskContent);
|
|
@@ -3,6 +3,7 @@ import { resolve, join } from "node:path";
|
|
|
3
3
|
import { tmpdir } from "node:os";
|
|
4
4
|
import { createSupervisor } from "../supervisor.js";
|
|
5
5
|
import { createTeeWriter } from "../tee-writer.js";
|
|
6
|
+
import { createServiceConfig } from "@forwardimpact/libconfig";
|
|
6
7
|
|
|
7
8
|
/**
|
|
8
9
|
* Parse all supervise flags from parsed values into an options object.
|
|
@@ -44,6 +45,7 @@ function parseSuperviseOptions(values) {
|
|
|
44
45
|
supervisorAllowedTools: supervisorAllowedToolsRaw
|
|
45
46
|
? supervisorAllowedToolsRaw.split(",")
|
|
46
47
|
: undefined,
|
|
48
|
+
mcpServer: values["mcp-server"] ?? undefined,
|
|
47
49
|
};
|
|
48
50
|
}
|
|
49
51
|
|
|
@@ -71,6 +73,19 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
71
73
|
})
|
|
72
74
|
: process.stdout;
|
|
73
75
|
|
|
76
|
+
let agentMcpServers = null;
|
|
77
|
+
if (opts.mcpServer) {
|
|
78
|
+
const mcpConfig = await createServiceConfig("mcp");
|
|
79
|
+
agentMcpServers = {
|
|
80
|
+
[opts.mcpServer]: {
|
|
81
|
+
type: "http",
|
|
82
|
+
url: mcpConfig.url,
|
|
83
|
+
headers: { Authorization: `Bearer ${mcpConfig.mcpToken()}` },
|
|
84
|
+
},
|
|
85
|
+
};
|
|
86
|
+
opts.allowedTools.push(`mcp__${opts.mcpServer}__*`);
|
|
87
|
+
}
|
|
88
|
+
|
|
74
89
|
if (opts.agentProfile) {
|
|
75
90
|
process.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
|
|
76
91
|
}
|
|
@@ -88,6 +103,7 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
88
103
|
supervisorProfile: opts.supervisorProfile,
|
|
89
104
|
agentProfile: opts.agentProfile,
|
|
90
105
|
taskAmend: opts.taskAmend,
|
|
106
|
+
agentMcpServers,
|
|
91
107
|
});
|
|
92
108
|
|
|
93
109
|
const result = await supervisor.run(opts.taskContent);
|
package/src/facilitator.js
CHANGED
|
@@ -26,7 +26,8 @@ export const FACILITATOR_SYSTEM_PROMPT =
|
|
|
26
26
|
"Announce sends a message with no reply obligation. " +
|
|
27
27
|
"Redirect interrupts a participant with replacement instructions. " +
|
|
28
28
|
"RollCall lists participants. " +
|
|
29
|
-
"Conclude ends the session with a summary
|
|
29
|
+
"Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
|
|
30
|
+
"the verdict reflects whether the session met the criteria stated in the task.";
|
|
30
31
|
|
|
31
32
|
/** System prompt appended for facilitated agent runners. */
|
|
32
33
|
export const FACILITATED_AGENT_SYSTEM_PROMPT =
|
|
@@ -36,6 +37,7 @@ export const FACILITATED_AGENT_SYSTEM_PROMPT =
|
|
|
36
37
|
"Announce broadcasts a message. " +
|
|
37
38
|
"RollCall lists participants.";
|
|
38
39
|
|
|
40
|
+
/** Orchestrate N agent sessions coordinated by a single facilitator LLM session. */
|
|
39
41
|
export class Facilitator {
|
|
40
42
|
/**
|
|
41
43
|
* @param {object} deps
|
|
@@ -105,12 +107,14 @@ export class Facilitator {
|
|
|
105
107
|
// messages and started processing concurrently.
|
|
106
108
|
this.concludeResolve();
|
|
107
109
|
await Promise.allSettled(agentPromises);
|
|
110
|
+
const success = this.ctx.verdict === "success";
|
|
108
111
|
this.emitSummary({
|
|
109
|
-
success
|
|
112
|
+
success,
|
|
113
|
+
verdict: this.ctx.verdict,
|
|
110
114
|
turns: this.facilitatorTurns,
|
|
111
115
|
summary: this.ctx.summary,
|
|
112
116
|
});
|
|
113
|
-
return { success
|
|
117
|
+
return { success, turns: this.facilitatorTurns };
|
|
114
118
|
}
|
|
115
119
|
|
|
116
120
|
// Abort agents promptly when Conclude is called during the event loop
|
|
@@ -133,12 +137,14 @@ export class Facilitator {
|
|
|
133
137
|
throw err;
|
|
134
138
|
}
|
|
135
139
|
|
|
140
|
+
const success = this.ctx.concluded && this.ctx.verdict === "success";
|
|
136
141
|
const result = {
|
|
137
|
-
success
|
|
142
|
+
success,
|
|
138
143
|
turns: this.facilitatorTurns,
|
|
139
144
|
};
|
|
140
145
|
this.emitSummary({
|
|
141
|
-
success
|
|
146
|
+
success,
|
|
147
|
+
verdict: this.ctx.verdict,
|
|
142
148
|
turns: result.turns,
|
|
143
149
|
summary: this.ctx.summary,
|
|
144
150
|
});
|
|
@@ -296,6 +302,7 @@ export class Facilitator {
|
|
|
296
302
|
}
|
|
297
303
|
}
|
|
298
304
|
|
|
305
|
+
/** Return the last assistant text block from a runner's buffer, or the fallback if none exists. */
|
|
299
306
|
extractLastText(runner, fallback) {
|
|
300
307
|
const lines = runner.buffer;
|
|
301
308
|
for (let i = lines.length - 1; i >= 0; i--) {
|
|
@@ -342,7 +349,7 @@ export class Facilitator {
|
|
|
342
349
|
}
|
|
343
350
|
|
|
344
351
|
/**
|
|
345
|
-
* @param {{success: boolean, turns: number, summary?: string}} result
|
|
352
|
+
* @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
|
|
346
353
|
*/
|
|
347
354
|
emitSummary(result) {
|
|
348
355
|
this.output.write(
|
|
@@ -352,6 +359,7 @@ export class Facilitator {
|
|
|
352
359
|
event: {
|
|
353
360
|
type: "summary",
|
|
354
361
|
success: result.success,
|
|
362
|
+
...(result.verdict && { verdict: result.verdict }),
|
|
355
363
|
turns: result.turns,
|
|
356
364
|
...(result.summary && { summary: result.summary }),
|
|
357
365
|
},
|
package/src/message-bus.js
CHANGED
|
@@ -22,6 +22,7 @@ import { z } from "zod";
|
|
|
22
22
|
export function createOrchestrationContext() {
|
|
23
23
|
return {
|
|
24
24
|
concluded: false,
|
|
25
|
+
verdict: null,
|
|
25
26
|
summary: null,
|
|
26
27
|
redirect: null,
|
|
27
28
|
participants: [],
|
|
@@ -37,14 +38,17 @@ export function createOrchestrationContext() {
|
|
|
37
38
|
|
|
38
39
|
// --- Handler factories ---
|
|
39
40
|
|
|
41
|
+
/** Create a handler that marks the session as concluded and records the verdict and summary. */
|
|
40
42
|
export function createConcludeHandler(ctx) {
|
|
41
|
-
return async ({ summary }) => {
|
|
43
|
+
return async ({ verdict, summary }) => {
|
|
42
44
|
ctx.concluded = true;
|
|
45
|
+
ctx.verdict = verdict;
|
|
43
46
|
ctx.summary = summary;
|
|
44
47
|
return { content: [{ type: "text", text: "Session concluded." }] };
|
|
45
48
|
};
|
|
46
49
|
}
|
|
47
50
|
|
|
51
|
+
/** Create a handler that queues a redirect to interrupt a participant with replacement instructions. */
|
|
48
52
|
export function createRedirectHandler(ctx) {
|
|
49
53
|
return async ({ message, to }) => {
|
|
50
54
|
ctx.redirect = { message, to: to ?? null };
|
|
@@ -52,6 +56,7 @@ export function createRedirectHandler(ctx) {
|
|
|
52
56
|
};
|
|
53
57
|
}
|
|
54
58
|
|
|
59
|
+
/** Create a handler that returns the list of all session participants and their roles. */
|
|
55
60
|
export function createRollCallHandler(ctx) {
|
|
56
61
|
return async () => {
|
|
57
62
|
return {
|
|
@@ -217,8 +222,8 @@ export function createSupervisorToolServer(ctx) {
|
|
|
217
222
|
),
|
|
218
223
|
tool(
|
|
219
224
|
"Conclude",
|
|
220
|
-
"End the session with a summary.",
|
|
221
|
-
{ summary: z.string() },
|
|
225
|
+
"End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
|
|
226
|
+
{ verdict: z.enum(["success", "failure"]), summary: z.string() },
|
|
222
227
|
createConcludeHandler(ctx),
|
|
223
228
|
),
|
|
224
229
|
tool(
|
|
@@ -304,8 +309,8 @@ export function createFacilitatorToolServer(ctx) {
|
|
|
304
309
|
),
|
|
305
310
|
tool(
|
|
306
311
|
"Conclude",
|
|
307
|
-
"End the session with a summary.",
|
|
308
|
-
{ summary: z.string() },
|
|
312
|
+
"End the session with a verdict and a summary. verdict='success' if the agent's work meets the criteria stated in the task; 'failure' otherwise.",
|
|
313
|
+
{ verdict: z.enum(["success", "failure"]), summary: z.string() },
|
|
309
314
|
createConcludeHandler(ctx),
|
|
310
315
|
),
|
|
311
316
|
tool(
|
package/src/render/tool-hints.js
CHANGED
|
@@ -6,6 +6,11 @@
|
|
|
6
6
|
* tool (file path, command, pattern, …) sanitized to strip JSON punctuation
|
|
7
7
|
* (`{`, `}`, `"`) and collapsed to a single line ≤ 80 chars.
|
|
8
8
|
*
|
|
9
|
+
* MCP-prefixed tools (`mcp__*`) are an intentional carve-out: their hint is
|
|
10
|
+
* the full input rendered as compact single-line JSON, so `{` and `"` do
|
|
11
|
+
* appear on those lines. Readers of GitHub workflow logs need the full MCP
|
|
12
|
+
* payload to know what was actually sent across the protocol.
|
|
13
|
+
*
|
|
9
14
|
* `previewForResult(content, isError)` collapses a tool result to a single
|
|
10
15
|
* line ≤ 80 chars and flags errors so the renderer can apply the reserved
|
|
11
16
|
* error color and the `Error:` label.
|
|
@@ -91,35 +96,16 @@ export function simplifyToolName(name) {
|
|
|
91
96
|
return parts.slice(2).join("__");
|
|
92
97
|
}
|
|
93
98
|
|
|
94
|
-
/**
|
|
95
|
-
* MCP-prefixed tool names (e.g. `mcp__orchestration__Ask`) take a different
|
|
96
|
-
* handler path. The method name itself is surfaced via `simplifyToolName`,
|
|
97
|
-
* so this only adds the `to/from` decorators for orchestration calls.
|
|
98
|
-
* Returns null if the name does not match any MCP prefix.
|
|
99
|
-
* @param {string} name
|
|
100
|
-
* @param {object} input
|
|
101
|
-
* @returns {string|null}
|
|
102
|
-
*/
|
|
103
|
-
function hintForMcp(name, input) {
|
|
104
|
-
if (name.startsWith("mcp__orchestration__")) {
|
|
105
|
-
const parts = [];
|
|
106
|
-
if (input.to) parts.push(`to ${sanitize(input.to)}`);
|
|
107
|
-
if (input.from) parts.push(`from ${sanitize(input.from)}`);
|
|
108
|
-
return truncate(parts.join(" "));
|
|
109
|
-
}
|
|
110
|
-
if (name.startsWith("mcp__")) {
|
|
111
|
-
return "";
|
|
112
|
-
}
|
|
113
|
-
return null;
|
|
114
|
-
}
|
|
115
|
-
|
|
116
99
|
/**
|
|
117
100
|
* Map a tool name and input to a one-line human hint.
|
|
118
101
|
*
|
|
119
|
-
*
|
|
120
|
-
*
|
|
121
|
-
*
|
|
122
|
-
*
|
|
102
|
+
* Three branches, in priority order:
|
|
103
|
+
* - A built-in tool with an entry in `HINT_HANDLERS` → sanitized hint, no
|
|
104
|
+
* `{` / `"` from the input (spec 540 criterion #2 for non-MCP tools).
|
|
105
|
+
* - An MCP-prefixed tool (`mcp__*`) → full input rendered as compact
|
|
106
|
+
* single-line JSON; `{` and `"` intentionally appear so readers see
|
|
107
|
+
* the actual MCP payload.
|
|
108
|
+
* - Anything else → "" (the caller still shows the bare tool name).
|
|
123
109
|
*
|
|
124
110
|
* @param {string} name - Tool name (e.g. "Bash", "Read", "mcp__orchestration__Ask")
|
|
125
111
|
* @param {object|null|undefined} input - Raw tool input object from the trace
|
|
@@ -132,8 +118,7 @@ export function hintForCall(name, input) {
|
|
|
132
118
|
const handler = HINT_HANDLERS[name];
|
|
133
119
|
if (handler) return handler(safeInput);
|
|
134
120
|
|
|
135
|
-
|
|
136
|
-
if (mcp !== null) return mcp;
|
|
121
|
+
if (name.startsWith("mcp__")) return JSON.stringify(safeInput);
|
|
137
122
|
|
|
138
123
|
return "";
|
|
139
124
|
}
|
|
@@ -154,32 +139,15 @@ export function previewForResult(content, isError) {
|
|
|
154
139
|
: typeof content === "string"
|
|
155
140
|
? content
|
|
156
141
|
: JSON.stringify(content);
|
|
157
|
-
const
|
|
158
|
-
|
|
159
|
-
|
|
160
|
-
|
|
161
|
-
|
|
162
|
-
break;
|
|
163
|
-
}
|
|
164
|
-
}
|
|
165
|
-
|
|
166
|
-
if (isError) {
|
|
167
|
-
const body = firstNonBlank || "(no output)";
|
|
168
|
-
return {
|
|
169
|
-
text:
|
|
170
|
-
body.length <= MAX_HINT_CHARS
|
|
171
|
-
? body
|
|
172
|
-
: body.slice(0, MAX_HINT_CHARS - 3) + "...",
|
|
173
|
-
isError: true,
|
|
174
|
-
};
|
|
175
|
-
}
|
|
142
|
+
const firstNonBlank =
|
|
143
|
+
normalized
|
|
144
|
+
.split(/\r?\n/)
|
|
145
|
+
.map((l) => l.trim())
|
|
146
|
+
.find((l) => l.length > 0) ?? "";
|
|
176
147
|
|
|
177
|
-
|
|
148
|
+
const fallback = isError ? "(no output)" : "(ok)";
|
|
178
149
|
return {
|
|
179
|
-
text:
|
|
180
|
-
|
|
181
|
-
? firstNonBlank
|
|
182
|
-
: firstNonBlank.slice(0, MAX_HINT_CHARS - 3) + "...",
|
|
183
|
-
isError: false,
|
|
150
|
+
text: truncate(firstNonBlank || fallback),
|
|
151
|
+
isError,
|
|
184
152
|
};
|
|
185
153
|
}
|
|
@@ -25,12 +25,7 @@ import {
|
|
|
25
25
|
* @returns {string[]} Array of rendered line strings
|
|
26
26
|
*/
|
|
27
27
|
export function renderTurnLines(turn, withPrefix) {
|
|
28
|
-
|
|
29
|
-
if (turn.role === "tool_result")
|
|
30
|
-
return renderToolResultTurn(turn, withPrefix);
|
|
31
|
-
if (turn.role === "system") return renderSystemTurn(turn, withPrefix);
|
|
32
|
-
if (turn.role === "user") return renderUserTurn(turn, withPrefix);
|
|
33
|
-
return [];
|
|
28
|
+
return TURN_RENDERERS[turn.role]?.(turn, withPrefix) ?? [];
|
|
34
29
|
}
|
|
35
30
|
|
|
36
31
|
/** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
|
|
@@ -57,10 +52,13 @@ function renderAssistantTurn(turn, withPrefix) {
|
|
|
57
52
|
|
|
58
53
|
/** @param {object} turn @param {boolean} withPrefix @returns {string[]} */
|
|
59
54
|
function renderToolResultTurn(turn, withPrefix) {
|
|
55
|
+
// Successful tool results emit no preview line — the trace document keeps
|
|
56
|
+
// the structured turn, but readers of the streamed log only see errors.
|
|
57
|
+
if (!turn.isError) return [];
|
|
60
58
|
return [
|
|
61
59
|
renderToolResultLine({
|
|
62
60
|
source: turn.source,
|
|
63
|
-
preview: previewForResult(turn.content,
|
|
61
|
+
preview: previewForResult(turn.content, true),
|
|
64
62
|
withPrefix,
|
|
65
63
|
}),
|
|
66
64
|
];
|
|
@@ -90,3 +88,10 @@ function renderUserTurn(turn, withPrefix) {
|
|
|
90
88
|
}
|
|
91
89
|
return lines;
|
|
92
90
|
}
|
|
91
|
+
|
|
92
|
+
const TURN_RENDERERS = {
|
|
93
|
+
assistant: renderAssistantTurn,
|
|
94
|
+
tool_result: renderToolResultTurn,
|
|
95
|
+
system: renderSystemTurn,
|
|
96
|
+
user: renderUserTurn,
|
|
97
|
+
};
|
package/src/sequence-counter.js
CHANGED
|
@@ -2,16 +2,20 @@
|
|
|
2
2
|
* SequenceCounter — global monotonic counter shared across all participants
|
|
3
3
|
* in a session. Single-threaded JS means no synchronization needed.
|
|
4
4
|
*/
|
|
5
|
+
/** Monotonic counter that assigns globally ordered sequence numbers within a session. */
|
|
5
6
|
export class SequenceCounter {
|
|
7
|
+
/** Initialize the counter at zero. */
|
|
6
8
|
constructor() {
|
|
7
9
|
this.value = 0;
|
|
8
10
|
}
|
|
9
11
|
|
|
12
|
+
/** Return the current value and advance the counter by one. */
|
|
10
13
|
next() {
|
|
11
14
|
return this.value++;
|
|
12
15
|
}
|
|
13
16
|
}
|
|
14
17
|
|
|
18
|
+
/** Create a new SequenceCounter starting at zero. */
|
|
15
19
|
export function createSequenceCounter() {
|
|
16
20
|
return new SequenceCounter();
|
|
17
21
|
}
|
package/src/supervisor.js
CHANGED
|
@@ -4,8 +4,9 @@
|
|
|
4
4
|
* introduces itself, and delegates work to the agent. The loop then alternates:
|
|
5
5
|
* agent → supervisor → agent.
|
|
6
6
|
*
|
|
7
|
-
* Signaling uses orchestration tools (Ask /
|
|
8
|
-
*
|
|
7
|
+
* Signaling uses orchestration tools (Ask / Announce / Redirect / Conclude)
|
|
8
|
+
* via in-process MCP servers; the supervisor has no Answer tool — agent replies
|
|
9
|
+
* are routed back through the relay loop. The Ask/Answer contract is enforced
|
|
9
10
|
* at turn boundaries: an unanswered Ask triggers one synthetic reminder and
|
|
10
11
|
* then a `protocol_violation` trace event plus a null-answer injection so the
|
|
11
12
|
* session advances without silent deadlock.
|
|
@@ -35,7 +36,8 @@ export const SUPERVISOR_SYSTEM_PROMPT =
|
|
|
35
36
|
"Answer replies to an ask the agent addressed to you. " +
|
|
36
37
|
"Announce sends a message with no reply obligation. " +
|
|
37
38
|
"Redirect interrupts the agent with replacement instructions. " +
|
|
38
|
-
"Conclude ends the session with a summary
|
|
39
|
+
"Conclude ends the session with a verdict ('success' or 'failure') and a summary; " +
|
|
40
|
+
"the verdict reflects whether the agent's work meets the criteria stated in the task.";
|
|
39
41
|
|
|
40
42
|
/** System prompt appended for the agent runner in supervise mode. */
|
|
41
43
|
export const AGENT_SYSTEM_PROMPT =
|
|
@@ -52,6 +54,7 @@ export const AGENT_SYSTEM_PROMPT =
|
|
|
52
54
|
*/
|
|
53
55
|
const MAX_INTERVENTIONS_PER_TURN = 5;
|
|
54
56
|
|
|
57
|
+
/** Orchestrate a relay loop between a supervisor LLM and an agent LLM with mid-turn review. */
|
|
55
58
|
export class Supervisor {
|
|
56
59
|
/**
|
|
57
60
|
* @param {object} deps
|
|
@@ -108,8 +111,14 @@ export class Supervisor {
|
|
|
108
111
|
}
|
|
109
112
|
|
|
110
113
|
if (this.ctx.concluded) {
|
|
111
|
-
|
|
112
|
-
|
|
114
|
+
const success = this.ctx.verdict === "success";
|
|
115
|
+
this.emitSummary({
|
|
116
|
+
success,
|
|
117
|
+
verdict: this.ctx.verdict,
|
|
118
|
+
turns: 0,
|
|
119
|
+
summary: this.ctx.summary,
|
|
120
|
+
});
|
|
121
|
+
return { success, turns: 0 };
|
|
113
122
|
}
|
|
114
123
|
|
|
115
124
|
let pendingRelay = null;
|
|
@@ -212,12 +221,14 @@ export class Supervisor {
|
|
|
212
221
|
}
|
|
213
222
|
|
|
214
223
|
if (this.ctx.concluded) {
|
|
224
|
+
const success = this.ctx.verdict === "success";
|
|
215
225
|
this.emitSummary({
|
|
216
|
-
success
|
|
226
|
+
success,
|
|
227
|
+
verdict: this.ctx.verdict,
|
|
217
228
|
turns: turn,
|
|
218
229
|
summary: this.ctx.summary,
|
|
219
230
|
});
|
|
220
|
-
return { type: "exit", exit: { success
|
|
231
|
+
return { type: "exit", exit: { success, turns: turn } };
|
|
221
232
|
}
|
|
222
233
|
|
|
223
234
|
if (agentResult.aborted && this.ctx.redirect) {
|
|
@@ -306,12 +317,14 @@ export class Supervisor {
|
|
|
306
317
|
}
|
|
307
318
|
|
|
308
319
|
if (this.ctx.concluded) {
|
|
320
|
+
const success = this.ctx.verdict === "success";
|
|
309
321
|
this.emitSummary({
|
|
310
|
-
success
|
|
322
|
+
success,
|
|
323
|
+
verdict: this.ctx.verdict,
|
|
311
324
|
turns: turn,
|
|
312
325
|
summary: this.ctx.summary,
|
|
313
326
|
});
|
|
314
|
-
return { exit: { success
|
|
327
|
+
return { exit: { success, turns: turn } };
|
|
315
328
|
}
|
|
316
329
|
|
|
317
330
|
if (this.#checkAsk("supervisor") === "recheck" && !this.ctx.concluded) {
|
|
@@ -321,12 +334,14 @@ export class Supervisor {
|
|
|
321
334
|
formatMessages(reminders),
|
|
322
335
|
);
|
|
323
336
|
if (this.ctx.concluded) {
|
|
337
|
+
const success = this.ctx.verdict === "success";
|
|
324
338
|
this.emitSummary({
|
|
325
|
-
success
|
|
339
|
+
success,
|
|
340
|
+
verdict: this.ctx.verdict,
|
|
326
341
|
turns: turn,
|
|
327
342
|
summary: this.ctx.summary,
|
|
328
343
|
});
|
|
329
|
-
return { exit: { success
|
|
344
|
+
return { exit: { success, turns: turn } };
|
|
330
345
|
}
|
|
331
346
|
this.#checkAsk("supervisor");
|
|
332
347
|
}
|
|
@@ -424,7 +439,7 @@ export class Supervisor {
|
|
|
424
439
|
|
|
425
440
|
/**
|
|
426
441
|
* Emit a final orchestrator summary line, wrapped in the universal envelope.
|
|
427
|
-
* @param {{success: boolean, turns: number, summary?: string}} result
|
|
442
|
+
* @param {{success: boolean, verdict?: string|null, turns: number, summary?: string}} result
|
|
428
443
|
*/
|
|
429
444
|
emitSummary(result) {
|
|
430
445
|
this.output.write(
|
|
@@ -434,6 +449,7 @@ export class Supervisor {
|
|
|
434
449
|
event: {
|
|
435
450
|
type: "summary",
|
|
436
451
|
success: result.success,
|
|
452
|
+
...(result.verdict && { verdict: result.verdict }),
|
|
437
453
|
turns: result.turns,
|
|
438
454
|
...(result.summary && { summary: result.summary }),
|
|
439
455
|
},
|
|
@@ -464,6 +480,7 @@ const devNull = new Writable({
|
|
|
464
480
|
* @param {string} [deps.agentProfile] - Agent profile name; resolved into the main-thread system prompt via `composeProfilePrompt`.
|
|
465
481
|
* @param {string} [deps.profilesDir] - Directory containing `<name>.md` profile files. Defaults to `<supervisorCwd>/.claude/agents`. Resolved once from the orchestrator's cwd so profiles travel with the project, not with a per-agent sandbox.
|
|
466
482
|
* @param {string} [deps.taskAmend] - Opaque addendum appended to the task before delivery.
|
|
483
|
+
* @param {Record<string, object>} [deps.agentMcpServers] - Additional MCP servers exposed to the agent (merged alongside the orchestration server).
|
|
467
484
|
* @returns {Supervisor}
|
|
468
485
|
*/
|
|
469
486
|
export function createSupervisor({
|
|
@@ -480,6 +497,7 @@ export function createSupervisor({
|
|
|
480
497
|
agentProfile,
|
|
481
498
|
profilesDir,
|
|
482
499
|
taskAmend,
|
|
500
|
+
agentMcpServers,
|
|
483
501
|
}) {
|
|
484
502
|
const resolvedProfilesDir =
|
|
485
503
|
profilesDir ?? resolve(supervisorCwd, ".claude/agents");
|
|
@@ -519,7 +537,7 @@ export function createSupervisor({
|
|
|
519
537
|
onLine,
|
|
520
538
|
settingSources: ["project"],
|
|
521
539
|
systemPrompt: systemPromptFor(agentProfile, AGENT_SYSTEM_PROMPT),
|
|
522
|
-
mcpServers: { orchestration: agentServer },
|
|
540
|
+
mcpServers: { orchestration: agentServer, ...agentMcpServers },
|
|
523
541
|
});
|
|
524
542
|
|
|
525
543
|
const defaultDisallowed = ["Agent", "Task", "TaskOutput", "TaskStop"];
|
package/src/tee-writer.js
CHANGED
|
@@ -20,6 +20,7 @@ import { TraceCollector } from "./trace-collector.js";
|
|
|
20
20
|
import { renderTurnLines } from "./render/turn-renderer.js";
|
|
21
21
|
import { isSuppressedOrchestratorEvent } from "./render/orchestrator-filter.js";
|
|
22
22
|
|
|
23
|
+
/** Writable stream that saves raw NDJSON to a file while streaming human-readable text to a display stream. */
|
|
23
24
|
export class TeeWriter extends Writable {
|
|
24
25
|
/**
|
|
25
26
|
* @param {object} deps
|
|
@@ -99,6 +100,12 @@ export class TeeWriter extends Writable {
|
|
|
99
100
|
|
|
100
101
|
// Universal envelope: { source, seq, event }
|
|
101
102
|
if (parsed.event) {
|
|
103
|
+
// Always forward to the collector so it can capture orchestrator
|
|
104
|
+
// metadata (e.g. the summary verdict for the result footer); the
|
|
105
|
+
// collector adds no turn for suppressed events, so flushTurns stays
|
|
106
|
+
// a no-op when we skip it below.
|
|
107
|
+
this.collector.addLine(line);
|
|
108
|
+
|
|
102
109
|
// Orchestrator lifecycle events are suppressed from the text stream
|
|
103
110
|
// entirely (spec 540). They still reached fileStream above.
|
|
104
111
|
if (
|
|
@@ -107,7 +114,6 @@ export class TeeWriter extends Writable {
|
|
|
107
114
|
) {
|
|
108
115
|
return;
|
|
109
116
|
}
|
|
110
|
-
this.collector.addLine(line);
|
|
111
117
|
this.flushTurns();
|
|
112
118
|
return;
|
|
113
119
|
}
|
package/src/trace-collector.js
CHANGED
|
@@ -12,6 +12,7 @@
|
|
|
12
12
|
import { renderTurnLines } from "./render/turn-renderer.js";
|
|
13
13
|
import { isSuppressedOrchestratorEvent } from "./render/orchestrator-filter.js";
|
|
14
14
|
|
|
15
|
+
/** Accumulate Claude Code NDJSON stream events into structured traces for analysis or text replay. */
|
|
15
16
|
export class TraceCollector {
|
|
16
17
|
/**
|
|
17
18
|
* @param {object} [deps]
|
|
@@ -26,6 +27,8 @@ export class TraceCollector {
|
|
|
26
27
|
this.turns = [];
|
|
27
28
|
/** @type {object|null} */
|
|
28
29
|
this.result = null;
|
|
30
|
+
/** @type {{verdict?: string, summary?: string, turns?: number}|null} */
|
|
31
|
+
this.orchestratorSummary = null;
|
|
29
32
|
/** @type {number} */
|
|
30
33
|
this.turnIndex = 0;
|
|
31
34
|
/** @type {object|null} */
|
|
@@ -61,6 +64,16 @@ export class TraceCollector {
|
|
|
61
64
|
// Orchestrator lifecycle events carry no content and are suppressed
|
|
62
65
|
// from turns entirely — the NDJSON artifact keeps them separately.
|
|
63
66
|
if (source === "orchestrator" && isSuppressedOrchestratorEvent(event)) {
|
|
67
|
+
// The summary event carries the supervisor/facilitator verdict —
|
|
68
|
+
// capture it before dropping the event, so the result footer can
|
|
69
|
+
// surface verdict="failure" instead of the SDK's per-runner status.
|
|
70
|
+
if (event.type === "summary") {
|
|
71
|
+
this.orchestratorSummary = {
|
|
72
|
+
...(event.verdict && { verdict: event.verdict }),
|
|
73
|
+
...(typeof event.summary === "string" && { summary: event.summary }),
|
|
74
|
+
...(typeof event.turns === "number" && { turns: event.turns }),
|
|
75
|
+
};
|
|
76
|
+
}
|
|
64
77
|
return;
|
|
65
78
|
}
|
|
66
79
|
|
|
@@ -276,16 +289,20 @@ export class TraceCollector {
|
|
|
276
289
|
}
|
|
277
290
|
|
|
278
291
|
/**
|
|
279
|
-
* Format the trailing result summary line (spec 540).
|
|
292
|
+
* Format the trailing result summary line (spec 540). When an orchestrator
|
|
293
|
+
* summary is present (supervised / facilitated mode), the headline word is
|
|
294
|
+
* the supervisor's verdict ("success" / "failure") rather than the SDK's
|
|
295
|
+
* per-runner subtype, so the footer aligns with the CI exit code.
|
|
280
296
|
* @returns {string}
|
|
281
297
|
*/
|
|
282
298
|
#formatResultTail() {
|
|
283
299
|
if (!this.result) return "";
|
|
284
300
|
const duration = formatDuration(this.result.durationMs);
|
|
285
301
|
const cost = Number(this.result.totalCostUsd).toFixed(4);
|
|
302
|
+
const headline = this.orchestratorSummary?.verdict ?? this.result.result;
|
|
286
303
|
return (
|
|
287
304
|
"\n" +
|
|
288
|
-
`--- Result: ${
|
|
305
|
+
`--- Result: ${headline} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`
|
|
289
306
|
);
|
|
290
307
|
}
|
|
291
308
|
}
|