@forwardimpact/libeval 0.1.26 → 0.1.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -1
- package/bin/fit-eval.js +7 -7
- package/bin/fit-trace.js +7 -7
- package/package.json +21 -13
- package/src/agent-runner.js +18 -2
- package/src/commands/facilitate.js +4 -0
- package/src/commands/run.js +4 -0
- package/src/commands/supervise.js +4 -0
- package/src/facilitator.js +36 -28
- package/src/message-bus.js +1 -0
- package/src/orchestration-toolkit.js +3 -0
- package/src/orchestrator-helpers.js +1 -0
- package/src/render/turn-renderer.js +92 -0
- package/src/sequence-counter.js +4 -0
- package/src/supervisor.js +61 -28
- package/src/tee-writer.js +4 -60
- package/src/trace-collector.js +18 -70
- package/src/trace-github.js +0 -1
- package/src/trace-query.js +69 -43
package/README.md
CHANGED
|
@@ -1,6 +1,11 @@
|
|
|
1
1
|
# libeval
|
|
2
2
|
|
|
3
|
-
|
|
3
|
+
<!-- BEGIN:description — Do not edit. Generated from package.json. -->
|
|
4
|
+
|
|
5
|
+
Agent evaluation framework — prove whether agent changes improved outcomes with
|
|
6
|
+
reproducible evidence.
|
|
7
|
+
|
|
8
|
+
<!-- END:description -->
|
|
4
9
|
|
|
5
10
|
## Getting Started
|
|
6
11
|
|
package/bin/fit-eval.js
CHANGED
|
@@ -177,20 +177,20 @@ const definition = {
|
|
|
177
177
|
],
|
|
178
178
|
documentation: [
|
|
179
179
|
{
|
|
180
|
-
title: "
|
|
181
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
180
|
+
title: "Run an Eval",
|
|
181
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-eval/index.md",
|
|
182
182
|
description:
|
|
183
183
|
"Author a judge profile, run an eval locally, wire it into CI, and inspect the resulting trace.",
|
|
184
184
|
},
|
|
185
185
|
{
|
|
186
|
-
title: "Agent
|
|
187
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
186
|
+
title: "Prove Agent Changes",
|
|
187
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/index.md",
|
|
188
188
|
description:
|
|
189
|
-
"
|
|
189
|
+
"End-to-end workflow from dataset generation through evaluation to trace analysis, including multi-agent collaboration sessions.",
|
|
190
190
|
},
|
|
191
191
|
{
|
|
192
|
-
title: "
|
|
193
|
-
url: "https://www.forwardimpact.team/docs/libraries/trace-analysis/index.md",
|
|
192
|
+
title: "Analyze Traces",
|
|
193
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/trace-analysis/index.md",
|
|
194
194
|
description:
|
|
195
195
|
"Read the NDJSON traces produced by `fit-eval` with `fit-trace` — grounded-theory method and worked examples.",
|
|
196
196
|
},
|
package/bin/fit-trace.js
CHANGED
|
@@ -214,22 +214,22 @@ const definition = {
|
|
|
214
214
|
],
|
|
215
215
|
documentation: [
|
|
216
216
|
{
|
|
217
|
-
title: "
|
|
218
|
-
url: "https://www.forwardimpact.team/docs/libraries/trace-analysis/index.md",
|
|
217
|
+
title: "Analyze Traces",
|
|
218
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/trace-analysis/index.md",
|
|
219
219
|
description:
|
|
220
220
|
"The full method walkthrough with worked examples (an eval that failed, a multi-agent session that stalled).",
|
|
221
221
|
},
|
|
222
222
|
{
|
|
223
|
-
title: "
|
|
224
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
223
|
+
title: "Run an Eval",
|
|
224
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/run-eval/index.md",
|
|
225
225
|
description:
|
|
226
226
|
"How `fit-eval supervise` produces the traces this skill analyzes.",
|
|
227
227
|
},
|
|
228
228
|
{
|
|
229
|
-
title: "Agent
|
|
230
|
-
url: "https://www.forwardimpact.team/docs/libraries/
|
|
229
|
+
title: "Prove Agent Changes",
|
|
230
|
+
url: "https://www.forwardimpact.team/docs/libraries/prove-changes/index.md",
|
|
231
231
|
description:
|
|
232
|
-
"
|
|
232
|
+
"End-to-end workflow including multi-agent collaboration; `split` is the bridge into per-source trace files.",
|
|
233
233
|
},
|
|
234
234
|
],
|
|
235
235
|
};
|
package/package.json
CHANGED
|
@@ -1,7 +1,7 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "@forwardimpact/libeval",
|
|
3
|
-
"version": "0.1.
|
|
4
|
-
"description": "Agent evaluation
|
|
3
|
+
"version": "0.1.28",
|
|
4
|
+
"description": "Agent evaluation framework — prove whether agent changes improved outcomes with reproducible evidence.",
|
|
5
5
|
"keywords": [
|
|
6
6
|
"eval",
|
|
7
7
|
"agent",
|
|
@@ -9,16 +9,24 @@
|
|
|
9
9
|
"claude-code",
|
|
10
10
|
"supervisor"
|
|
11
11
|
],
|
|
12
|
-
"
|
|
13
|
-
|
|
14
|
-
"
|
|
15
|
-
|
|
16
|
-
|
|
17
|
-
"Supervise a multi-step or multi-agent workflow"
|
|
18
|
-
]
|
|
12
|
+
"homepage": "https://www.forwardimpact.team",
|
|
13
|
+
"repository": {
|
|
14
|
+
"type": "git",
|
|
15
|
+
"url": "git+https://github.com/forwardimpact/monorepo.git",
|
|
16
|
+
"directory": "libraries/libeval"
|
|
19
17
|
},
|
|
20
18
|
"license": "Apache-2.0",
|
|
21
19
|
"author": "D. Olsson <hi@senzilla.io>",
|
|
20
|
+
"jobs": [
|
|
21
|
+
{
|
|
22
|
+
"user": "Platform Builders",
|
|
23
|
+
"goal": "Prove Agent Changes",
|
|
24
|
+
"trigger": "An eval passes locally but fails in CI and the only output is 'assertion failed.'",
|
|
25
|
+
"bigHire": "prove whether agent changes improved outcomes with reproducible evidence.",
|
|
26
|
+
"littleHire": "run an eval and get a trace that shows exactly what the agent did.",
|
|
27
|
+
"competesWith": "manual before/after comparison; trusting gut feeling over evidence; skipping evaluation entirely"
|
|
28
|
+
}
|
|
29
|
+
],
|
|
22
30
|
"type": "module",
|
|
23
31
|
"main": "./src/index.js",
|
|
24
32
|
"exports": {
|
|
@@ -35,10 +43,6 @@
|
|
|
35
43
|
"bin/**/*.js",
|
|
36
44
|
"README.md"
|
|
37
45
|
],
|
|
38
|
-
"engines": {
|
|
39
|
-
"bun": ">=1.2.0",
|
|
40
|
-
"node": ">=18.0.0"
|
|
41
|
-
},
|
|
42
46
|
"scripts": {
|
|
43
47
|
"test": "bun test test/*.test.js"
|
|
44
48
|
},
|
|
@@ -52,6 +56,10 @@
|
|
|
52
56
|
"devDependencies": {
|
|
53
57
|
"@forwardimpact/libharness": "^0.1.14"
|
|
54
58
|
},
|
|
59
|
+
"engines": {
|
|
60
|
+
"bun": ">=1.2.0",
|
|
61
|
+
"node": ">=18.0.0"
|
|
62
|
+
},
|
|
55
63
|
"publishConfig": {
|
|
56
64
|
"access": "public"
|
|
57
65
|
}
|
package/src/agent-runner.js
CHANGED
|
@@ -32,6 +32,7 @@ function applyDefaults(deps) {
|
|
|
32
32
|
};
|
|
33
33
|
}
|
|
34
34
|
|
|
35
|
+
/** Run a single Claude Agent SDK session and emit raw NDJSON events to an output stream. */
|
|
35
36
|
export class AgentRunner {
|
|
36
37
|
/**
|
|
37
38
|
* @param {object} deps
|
|
@@ -211,8 +212,9 @@ export class AgentRunner {
|
|
|
211
212
|
if (message.type === "system" && message.subtype === "init") {
|
|
212
213
|
this.sessionId = message.session_id;
|
|
213
214
|
}
|
|
214
|
-
if (message.type === "assistant"
|
|
215
|
-
state.assistantTextCount++;
|
|
215
|
+
if (message.type === "assistant") {
|
|
216
|
+
if (hasTextBlock(message)) state.assistantTextCount++;
|
|
217
|
+
trackSkillInvocation(message);
|
|
216
218
|
}
|
|
217
219
|
}
|
|
218
220
|
|
|
@@ -293,6 +295,20 @@ export function hasTextBlock(message) {
|
|
|
293
295
|
return false;
|
|
294
296
|
}
|
|
295
297
|
|
|
298
|
+
/**
 * Publish the skill named by an assistant message's `Skill` tool call into
 * `process.env.LIBEVAL_SKILL`.
 *
 * Content is read from `message.message.content`, falling back to
 * `message.content`; anything that is not an array is ignored. Every
 * `tool_use` block named "Skill" with a truthy `input.skill` overwrites the
 * variable, so the last such block in the message wins.
 *
 * @param {object} message - Assistant message event
 * @returns {void}
 */
function trackSkillInvocation(message) {
  const blocks = message.message?.content ?? message.content;
  if (!Array.isArray(blocks)) return;

  for (const block of blocks) {
    if (block.type !== "tool_use") continue;
    if (block.name !== "Skill") continue;
    const skill = block.input?.skill;
    if (skill) {
      process.env.LIBEVAL_SKILL = skill;
    }
  }
}
|
|
311
|
+
|
|
296
312
|
/**
|
|
297
313
|
* Factory function — wires real dependencies.
|
|
298
314
|
* @param {object} deps - Same as AgentRunner constructor
|
|
@@ -73,6 +73,10 @@ export async function runFacilitateCommand(values, _args) {
|
|
|
73
73
|
})
|
|
74
74
|
: process.stdout;
|
|
75
75
|
|
|
76
|
+
if (opts.facilitatorProfile) {
|
|
77
|
+
process.env.LIBEVAL_AGENT_PROFILE = opts.facilitatorProfile;
|
|
78
|
+
}
|
|
79
|
+
|
|
76
80
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
77
81
|
const facilitator = createFacilitator({
|
|
78
82
|
facilitatorCwd: opts.facilitatorCwd,
|
package/src/commands/run.js
CHANGED
|
@@ -78,6 +78,10 @@ export async function runRunCommand(values, _args) {
|
|
|
78
78
|
);
|
|
79
79
|
};
|
|
80
80
|
|
|
81
|
+
if (agentProfile) {
|
|
82
|
+
process.env.LIBEVAL_AGENT_PROFILE = agentProfile;
|
|
83
|
+
}
|
|
84
|
+
|
|
81
85
|
const systemPrompt = agentProfile
|
|
82
86
|
? composeProfilePrompt(agentProfile, {
|
|
83
87
|
profilesDir: resolve(cwd, ".claude/agents"),
|
|
@@ -71,6 +71,10 @@ export async function runSuperviseCommand(values, _args) {
|
|
|
71
71
|
})
|
|
72
72
|
: process.stdout;
|
|
73
73
|
|
|
74
|
+
if (opts.agentProfile) {
|
|
75
|
+
process.env.LIBEVAL_AGENT_PROFILE = opts.agentProfile;
|
|
76
|
+
}
|
|
77
|
+
|
|
74
78
|
const { query } = await import("@anthropic-ai/claude-agent-sdk");
|
|
75
79
|
const supervisor = createSupervisor({
|
|
76
80
|
supervisorCwd: opts.supervisorCwd,
|
package/src/facilitator.js
CHANGED
|
@@ -36,6 +36,7 @@ export const FACILITATED_AGENT_SYSTEM_PROMPT =
|
|
|
36
36
|
"Announce broadcasts a message. " +
|
|
37
37
|
"RollCall lists participants.";
|
|
38
38
|
|
|
39
|
+
/** Orchestrate N agent sessions coordinated by a single facilitator LLM session. */
|
|
39
40
|
export class Facilitator {
|
|
40
41
|
/**
|
|
41
42
|
* @param {object} deps
|
|
@@ -180,42 +181,48 @@ export class Facilitator {
|
|
|
180
181
|
let messages = this.messageBus.drain(agent.name);
|
|
181
182
|
if (messages.length === 0) return;
|
|
182
183
|
|
|
183
|
-
this.emitOrchestratorEvent({
|
|
184
|
-
type: "agent_start",
|
|
185
|
-
agent: agent.name,
|
|
186
|
-
});
|
|
184
|
+
this.emitOrchestratorEvent({ type: "agent_start", agent: agent.name });
|
|
187
185
|
await agent.runner.run(formatMessages(messages));
|
|
188
|
-
if (this
|
|
186
|
+
if (await this.#settleAgentTurn(agent)) return;
|
|
187
|
+
|
|
188
|
+
// Loop: check for new messages, resume if any
|
|
189
|
+
while (!this.ctx.concluded) {
|
|
190
|
+
messages = await this.#awaitAgentMessages(agent.name);
|
|
191
|
+
if (messages.length === 0) break;
|
|
192
|
+
await agent.runner.resume(formatMessages(messages));
|
|
193
|
+
if (await this.#settleAgentTurn(agent)) break;
|
|
194
|
+
}
|
|
195
|
+
}
|
|
196
|
+
|
|
197
|
+
/**
|
|
198
|
+
* Enforce pending-ask and emit turn_complete. Returns true when the
|
|
199
|
+
* session has concluded and the caller should stop.
|
|
200
|
+
*/
|
|
201
|
+
async #settleAgentTurn(agent) {
|
|
202
|
+
if (this.ctx.concluded) return true;
|
|
189
203
|
await this.#enforcePendingAsk(agent);
|
|
190
|
-
if (this.ctx.concluded) return;
|
|
204
|
+
if (this.ctx.concluded) return true;
|
|
191
205
|
this.eventQueue.enqueue({
|
|
192
206
|
type: "lifecycle",
|
|
193
207
|
agent: agent.name,
|
|
194
208
|
status: "turn_complete",
|
|
195
209
|
});
|
|
210
|
+
return false;
|
|
211
|
+
}
|
|
196
212
|
|
|
197
|
-
|
|
198
|
-
|
|
199
|
-
|
|
200
|
-
|
|
201
|
-
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
if (this.ctx.concluded) break;
|
|
211
|
-
await this.#enforcePendingAsk(agent);
|
|
212
|
-
if (this.ctx.concluded) break;
|
|
213
|
-
this.eventQueue.enqueue({
|
|
214
|
-
type: "lifecycle",
|
|
215
|
-
agent: agent.name,
|
|
216
|
-
status: "turn_complete",
|
|
217
|
-
});
|
|
218
|
-
}
|
|
213
|
+
/**
|
|
214
|
+
* Wait for messages addressed to `name`, returning an empty array when
|
|
215
|
+
* the session concludes first.
|
|
216
|
+
*/
|
|
217
|
+
async #awaitAgentMessages(name) {
|
|
218
|
+
const messages = this.messageBus.drain(name);
|
|
219
|
+
if (messages.length > 0) return messages;
|
|
220
|
+
await Promise.race([
|
|
221
|
+
this.messageBus.waitForMessages(name),
|
|
222
|
+
this.concludePromise,
|
|
223
|
+
]);
|
|
224
|
+
if (this.ctx.concluded) return [];
|
|
225
|
+
return this.messageBus.drain(name);
|
|
219
226
|
}
|
|
220
227
|
|
|
221
228
|
/**
|
|
@@ -290,6 +297,7 @@ export class Facilitator {
|
|
|
290
297
|
}
|
|
291
298
|
}
|
|
292
299
|
|
|
300
|
+
/** Return the last assistant text block from a runner's buffer, or the fallback if none exists. */
|
|
293
301
|
extractLastText(runner, fallback) {
|
|
294
302
|
const lines = runner.buffer;
|
|
295
303
|
for (let i = lines.length - 1; i >= 0; i--) {
|
package/src/message-bus.js
CHANGED
|
@@ -37,6 +37,7 @@ export function createOrchestrationContext() {
|
|
|
37
37
|
|
|
38
38
|
// --- Handler factories ---
|
|
39
39
|
|
|
40
|
+
/** Create a handler that marks the session as concluded and records the summary. */
|
|
40
41
|
export function createConcludeHandler(ctx) {
|
|
41
42
|
return async ({ summary }) => {
|
|
42
43
|
ctx.concluded = true;
|
|
@@ -45,6 +46,7 @@ export function createConcludeHandler(ctx) {
|
|
|
45
46
|
};
|
|
46
47
|
}
|
|
47
48
|
|
|
49
|
+
/** Create a handler that queues a redirect to interrupt a participant with replacement instructions. */
|
|
48
50
|
export function createRedirectHandler(ctx) {
|
|
49
51
|
return async ({ message, to }) => {
|
|
50
52
|
ctx.redirect = { message, to: to ?? null };
|
|
@@ -52,6 +54,7 @@ export function createRedirectHandler(ctx) {
|
|
|
52
54
|
};
|
|
53
55
|
}
|
|
54
56
|
|
|
57
|
+
/** Create a handler that returns the list of all session participants and their roles. */
|
|
55
58
|
export function createRollCallHandler(ctx) {
|
|
56
59
|
return async () => {
|
|
57
60
|
return {
|
|
@@ -0,0 +1,92 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Turn renderer — maps a structured turn into formatted text lines.
|
|
3
|
+
*
|
|
4
|
+
* Shared by `TeeWriter.flushTurns()` (live stream) and
|
|
5
|
+
* `TraceCollector.toText()` (offline replay) so both emit identical output
|
|
6
|
+
* (spec 540).
|
|
7
|
+
*/
|
|
8
|
+
|
|
9
|
+
import {
|
|
10
|
+
renderTextLine,
|
|
11
|
+
renderToolCallLine,
|
|
12
|
+
renderToolResultLine,
|
|
13
|
+
} from "./line-renderer.js";
|
|
14
|
+
import {
|
|
15
|
+
hintForCall,
|
|
16
|
+
previewForResult,
|
|
17
|
+
simplifyToolName,
|
|
18
|
+
} from "./tool-hints.js";
|
|
19
|
+
|
|
20
|
+
/**
 * Render one structured turn as an array of formatted text lines.
 *
 * Dispatches on `turn.role` to the matching role-specific renderer. A role
 * other than "assistant", "tool_result", "system" or "user" yields no lines.
 *
 * @param {object} turn - Structured turn object
 * @param {boolean} withPrefix - Whether to include source labels
 * @returns {string[]} Rendered line strings (empty for unrecognized roles)
 */
export function renderTurnLines(turn, withPrefix) {
  switch (turn.role) {
    case "assistant":
      return renderAssistantTurn(turn, withPrefix);
    case "tool_result":
      return renderToolResultTurn(turn, withPrefix);
    case "system":
      return renderSystemTurn(turn, withPrefix);
    case "user":
      return renderUserTurn(turn, withPrefix);
    default:
      return [];
  }
}
|
|
35
|
+
|
|
36
|
+
/**
 * Render an assistant turn: one line per `text` block and one line per
 * `tool_use` block, in content order. Other block types are skipped.
 *
 * @param {object} turn - Turn whose `content` is iterated and whose `source` labels each line
 * @param {boolean} withPrefix - Forwarded to the line renderers
 * @returns {string[]} Rendered lines
 */
function renderAssistantTurn(turn, withPrefix) {
  const rendered = [];
  for (const block of turn.content) {
    switch (block.type) {
      case "text": {
        const line = renderTextLine({
          source: turn.source,
          text: block.text,
          withPrefix,
        });
        rendered.push(line);
        break;
      }
      case "tool_use": {
        const line = renderToolCallLine({
          source: turn.source,
          toolName: simplifyToolName(block.name),
          hint: hintForCall(block.name, block.input),
          withPrefix,
        });
        rendered.push(line);
        break;
      }
      default:
        // Neither text nor a tool call: contributes nothing to the output.
        break;
    }
  }
  return rendered;
}
|
|
57
|
+
|
|
58
|
+
/**
 * Render a tool-result turn as a single line built from a preview of
 * `turn.content` and the `turn.isError` flag.
 *
 * @param {object} turn - Tool-result turn
 * @param {boolean} withPrefix - Forwarded to the line renderer
 * @returns {string[]} Array holding exactly one rendered line
 */
function renderToolResultTurn(turn, withPrefix) {
  const preview = previewForResult(turn.content, turn.isError);
  const line = renderToolResultLine({
    source: turn.source,
    preview,
    withPrefix,
  });
  return [line];
}
|
|
68
|
+
|
|
69
|
+
/**
 * Render a system turn as a single bracketed label line. The label is
 * `turn.subtype`, or the literal "system" when the subtype is null/undefined.
 *
 * @param {object} turn - System turn
 * @param {boolean} withPrefix - Forwarded to the line renderer
 * @returns {string[]} Array holding exactly one rendered line
 */
function renderSystemTurn(turn, withPrefix) {
  const text = `[${turn.subtype ?? "system"}]`;
  const line = renderTextLine({ source: turn.source, text, withPrefix });
  return [line];
}
|
|
76
|
+
|
|
77
|
+
/**
 * Render a user turn: each `text` block becomes one line whose text is
 * prefixed with "[user] ". Non-text blocks are skipped.
 *
 * @param {object} turn - User turn whose `content` is iterated
 * @param {boolean} withPrefix - Forwarded to the line renderer
 * @returns {string[]} Rendered lines
 */
function renderUserTurn(turn, withPrefix) {
  const rendered = [];
  for (const block of turn.content) {
    if (block.type !== "text") continue;
    const line = renderTextLine({
      source: turn.source,
      text: `[user] ${block.text}`,
      withPrefix,
    });
    rendered.push(line);
  }
  return rendered;
}
|
package/src/sequence-counter.js
CHANGED
|
@@ -2,16 +2,20 @@
|
|
|
2
2
|
* SequenceCounter — global monotonic counter shared across all participants
|
|
3
3
|
* in a session. Single-threaded JS means no synchronization needed.
|
|
4
4
|
*/
|
|
5
|
+
/** Monotonic counter that assigns globally ordered sequence numbers within a session. */
export class SequenceCounter {
  /** Next number to hand out; every new counter starts at zero. */
  value = 0;

  /**
   * Hand out the current sequence number, then advance the counter by one.
   * @returns {number} The value held before the increment
   */
  next() {
    const issued = this.value++;
    return issued;
  }
}

/**
 * Build a fresh counter.
 * @returns {SequenceCounter} A new counter whose first `next()` returns 0
 */
export function createSequenceCounter() {
  const counter = new SequenceCounter();
  return counter;
}
|
package/src/supervisor.js
CHANGED
|
@@ -4,8 +4,9 @@
|
|
|
4
4
|
* introduces itself, and delegates work to the agent. The loop then alternates:
|
|
5
5
|
* agent → supervisor → agent.
|
|
6
6
|
*
|
|
7
|
-
* Signaling uses orchestration tools (Ask /
|
|
8
|
-
*
|
|
7
|
+
* Signaling uses orchestration tools (Ask / Announce / Redirect / Conclude)
|
|
8
|
+
* via in-process MCP servers; the supervisor has no Answer tool — agent replies
|
|
9
|
+
* are routed back through the relay loop. The Ask/Answer contract is enforced
|
|
9
10
|
* at turn boundaries: an unanswered Ask triggers one synthetic reminder and
|
|
10
11
|
* then a `protocol_violation` trace event plus a null-answer injection so the
|
|
11
12
|
* session advances without silent deadlock.
|
|
@@ -52,6 +53,7 @@ export const AGENT_SYSTEM_PROMPT =
|
|
|
52
53
|
*/
|
|
53
54
|
const MAX_INTERVENTIONS_PER_TURN = 5;
|
|
54
55
|
|
|
56
|
+
/** Orchestrate a relay loop between a supervisor LLM and an agent LLM with mid-turn review. */
|
|
55
57
|
export class Supervisor {
|
|
56
58
|
/**
|
|
57
59
|
* @param {object} deps
|
|
@@ -172,39 +174,26 @@ export class Supervisor {
|
|
|
172
174
|
: await this.agentRunner.run(relay);
|
|
173
175
|
agentCalled = true;
|
|
174
176
|
|
|
175
|
-
|
|
176
|
-
|
|
177
|
-
|
|
178
|
-
|
|
177
|
+
const outcome = this.#classifyAgentOutcome(
|
|
178
|
+
agentResult,
|
|
179
|
+
turn,
|
|
180
|
+
interventions,
|
|
181
|
+
);
|
|
179
182
|
|
|
180
|
-
if (
|
|
181
|
-
|
|
182
|
-
success: true,
|
|
183
|
-
turns: turn,
|
|
184
|
-
summary: this.ctx.summary,
|
|
185
|
-
});
|
|
186
|
-
return { exit: { success: true, turns: turn } };
|
|
187
|
-
}
|
|
183
|
+
if (outcome.type === "exit") return { exit: outcome.exit };
|
|
184
|
+
if (outcome.type === "intervention_limit") return { exit: null };
|
|
188
185
|
|
|
189
|
-
if (
|
|
186
|
+
if (outcome.type === "redirect") {
|
|
190
187
|
interventions++;
|
|
191
|
-
|
|
192
|
-
this.ctx.redirect = null;
|
|
193
|
-
if (interventions >= MAX_INTERVENTIONS_PER_TURN) {
|
|
194
|
-
this.emitOrchestratorEvent({ type: "intervention_limit", turn });
|
|
195
|
-
return { exit: null };
|
|
196
|
-
}
|
|
197
|
-
relay = redirect.message;
|
|
188
|
+
relay = outcome.relay;
|
|
198
189
|
this.emitOrchestratorEvent({ type: "intervention_relayed", turn });
|
|
199
190
|
continue;
|
|
200
191
|
}
|
|
201
192
|
|
|
202
|
-
|
|
203
|
-
|
|
204
|
-
|
|
205
|
-
|
|
206
|
-
continue;
|
|
207
|
-
}
|
|
193
|
+
const askRelay = this.#drainAgentAskRelay();
|
|
194
|
+
if (askRelay) {
|
|
195
|
+
relay = askRelay;
|
|
196
|
+
continue;
|
|
208
197
|
}
|
|
209
198
|
|
|
210
199
|
return { exit: null };
|
|
@@ -214,6 +203,50 @@ export class Supervisor {
|
|
|
214
203
|
}
|
|
215
204
|
}
|
|
216
205
|
|
|
206
|
+
/**
|
|
207
|
+
* Classify the outcome of a single agent execution within #runAgentTurn.
|
|
208
|
+
* @returns {{type: string, exit?: object|null, relay?: string}}
|
|
209
|
+
*/
|
|
210
|
+
#classifyAgentOutcome(agentResult, turn, interventions) {
|
|
211
|
+
if (agentResult.error && !agentResult.aborted) {
|
|
212
|
+
this.emitSummary({ success: false, turns: turn });
|
|
213
|
+
return { type: "exit", exit: { success: false, turns: turn } };
|
|
214
|
+
}
|
|
215
|
+
|
|
216
|
+
if (this.ctx.concluded) {
|
|
217
|
+
this.emitSummary({
|
|
218
|
+
success: true,
|
|
219
|
+
turns: turn,
|
|
220
|
+
summary: this.ctx.summary,
|
|
221
|
+
});
|
|
222
|
+
return { type: "exit", exit: { success: true, turns: turn } };
|
|
223
|
+
}
|
|
224
|
+
|
|
225
|
+
if (agentResult.aborted && this.ctx.redirect) {
|
|
226
|
+
const redirect = this.ctx.redirect;
|
|
227
|
+
this.ctx.redirect = null;
|
|
228
|
+
if (interventions + 1 >= MAX_INTERVENTIONS_PER_TURN) {
|
|
229
|
+
this.emitOrchestratorEvent({ type: "intervention_limit", turn });
|
|
230
|
+
return { type: "intervention_limit" };
|
|
231
|
+
}
|
|
232
|
+
return { type: "redirect", relay: redirect.message };
|
|
233
|
+
}
|
|
234
|
+
|
|
235
|
+
return { type: "continue" };
|
|
236
|
+
}
|
|
237
|
+
|
|
238
|
+
/**
|
|
239
|
+
* If the agent has an unanswered ask, drain reminders and return a
|
|
240
|
+
* formatted relay string. Returns null when no relay is needed.
|
|
241
|
+
* @returns {string|null}
|
|
242
|
+
*/
|
|
243
|
+
#drainAgentAskRelay() {
|
|
244
|
+
if (this.#checkAsk("agent") !== "recheck" || this.ctx.concluded)
|
|
245
|
+
return null;
|
|
246
|
+
const reminders = this.messageBus.drain("agent");
|
|
247
|
+
return reminders.length > 0 ? formatMessages(reminders) : null;
|
|
248
|
+
}
|
|
249
|
+
|
|
217
250
|
/**
|
|
218
251
|
* Mid-turn supervisor review fired from inside the agent's onBatch hook.
|
|
219
252
|
* Runs the supervisor's LLM against the batch and aborts the agent if
|
package/src/tee-writer.js
CHANGED
|
@@ -17,18 +17,10 @@
|
|
|
17
17
|
|
|
18
18
|
import { Writable } from "node:stream";
|
|
19
19
|
import { TraceCollector } from "./trace-collector.js";
|
|
20
|
-
import {
|
|
21
|
-
renderTextLine,
|
|
22
|
-
renderToolCallLine,
|
|
23
|
-
renderToolResultLine,
|
|
24
|
-
} from "./render/line-renderer.js";
|
|
25
|
-
import {
|
|
26
|
-
hintForCall,
|
|
27
|
-
previewForResult,
|
|
28
|
-
simplifyToolName,
|
|
29
|
-
} from "./render/tool-hints.js";
|
|
20
|
+
import { renderTurnLines } from "./render/turn-renderer.js";
|
|
30
21
|
import { isSuppressedOrchestratorEvent } from "./render/orchestrator-filter.js";
|
|
31
22
|
|
|
23
|
+
/** Writable stream that saves raw NDJSON to a file while streaming human-readable text to a display stream. */
|
|
32
24
|
export class TeeWriter extends Writable {
|
|
33
25
|
/**
|
|
34
26
|
* @param {object} deps
|
|
@@ -134,56 +126,8 @@ export class TeeWriter extends Writable {
|
|
|
134
126
|
const withPrefix = this.mode !== "raw";
|
|
135
127
|
while (this.turnsEmitted < turns.length) {
|
|
136
128
|
const turn = turns[this.turnsEmitted++];
|
|
137
|
-
|
|
138
|
-
|
|
139
|
-
if (block.type === "text") {
|
|
140
|
-
this.textStream.write(
|
|
141
|
-
renderTextLine({
|
|
142
|
-
source: turn.source,
|
|
143
|
-
text: block.text,
|
|
144
|
-
withPrefix,
|
|
145
|
-
}),
|
|
146
|
-
);
|
|
147
|
-
} else if (block.type === "tool_use") {
|
|
148
|
-
this.textStream.write(
|
|
149
|
-
renderToolCallLine({
|
|
150
|
-
source: turn.source,
|
|
151
|
-
toolName: simplifyToolName(block.name),
|
|
152
|
-
hint: hintForCall(block.name, block.input),
|
|
153
|
-
withPrefix,
|
|
154
|
-
}),
|
|
155
|
-
);
|
|
156
|
-
}
|
|
157
|
-
}
|
|
158
|
-
} else if (turn.role === "tool_result") {
|
|
159
|
-
this.textStream.write(
|
|
160
|
-
renderToolResultLine({
|
|
161
|
-
source: turn.source,
|
|
162
|
-
preview: previewForResult(turn.content, turn.isError),
|
|
163
|
-
withPrefix,
|
|
164
|
-
}),
|
|
165
|
-
);
|
|
166
|
-
} else if (turn.role === "system") {
|
|
167
|
-
const label = turn.subtype ?? "system";
|
|
168
|
-
this.textStream.write(
|
|
169
|
-
renderTextLine({
|
|
170
|
-
source: turn.source,
|
|
171
|
-
text: `[${label}]`,
|
|
172
|
-
withPrefix,
|
|
173
|
-
}),
|
|
174
|
-
);
|
|
175
|
-
} else if (turn.role === "user") {
|
|
176
|
-
for (const block of turn.content) {
|
|
177
|
-
if (block.type === "text") {
|
|
178
|
-
this.textStream.write(
|
|
179
|
-
renderTextLine({
|
|
180
|
-
source: turn.source,
|
|
181
|
-
text: `[user] ${block.text}`,
|
|
182
|
-
withPrefix,
|
|
183
|
-
}),
|
|
184
|
-
);
|
|
185
|
-
}
|
|
186
|
-
}
|
|
129
|
+
for (const line of renderTurnLines(turn, withPrefix)) {
|
|
130
|
+
this.textStream.write(line);
|
|
187
131
|
}
|
|
188
132
|
}
|
|
189
133
|
}
|
package/src/trace-collector.js
CHANGED
|
@@ -9,18 +9,10 @@
|
|
|
9
9
|
* one formatting path (spec 540).
|
|
10
10
|
*/
|
|
11
11
|
|
|
12
|
-
import {
|
|
13
|
-
renderTextLine,
|
|
14
|
-
renderToolCallLine,
|
|
15
|
-
renderToolResultLine,
|
|
16
|
-
} from "./render/line-renderer.js";
|
|
17
|
-
import {
|
|
18
|
-
hintForCall,
|
|
19
|
-
previewForResult,
|
|
20
|
-
simplifyToolName,
|
|
21
|
-
} from "./render/tool-hints.js";
|
|
12
|
+
import { renderTurnLines } from "./render/turn-renderer.js";
|
|
22
13
|
import { isSuppressedOrchestratorEvent } from "./render/orchestrator-filter.js";
|
|
23
14
|
|
|
15
|
+
/** Accumulate Claude Code NDJSON stream events into structured traces for analysis or text replay. */
|
|
24
16
|
export class TraceCollector {
|
|
25
17
|
/**
|
|
26
18
|
* @param {object} [deps]
|
|
@@ -270,68 +262,10 @@ export class TraceCollector {
|
|
|
270
262
|
const out = [];
|
|
271
263
|
|
|
272
264
|
for (const turn of this.turns) {
|
|
273
|
-
|
|
274
|
-
for (const block of turn.content) {
|
|
275
|
-
if (block.type === "text") {
|
|
276
|
-
out.push(
|
|
277
|
-
renderTextLine({
|
|
278
|
-
source: turn.source,
|
|
279
|
-
text: block.text,
|
|
280
|
-
withPrefix,
|
|
281
|
-
}),
|
|
282
|
-
);
|
|
283
|
-
} else if (block.type === "tool_use") {
|
|
284
|
-
out.push(
|
|
285
|
-
renderToolCallLine({
|
|
286
|
-
source: turn.source,
|
|
287
|
-
toolName: simplifyToolName(block.name),
|
|
288
|
-
hint: hintForCall(block.name, block.input),
|
|
289
|
-
withPrefix,
|
|
290
|
-
}),
|
|
291
|
-
);
|
|
292
|
-
}
|
|
293
|
-
}
|
|
294
|
-
} else if (turn.role === "tool_result") {
|
|
295
|
-
out.push(
|
|
296
|
-
renderToolResultLine({
|
|
297
|
-
source: turn.source,
|
|
298
|
-
preview: previewForResult(turn.content, turn.isError),
|
|
299
|
-
withPrefix,
|
|
300
|
-
}),
|
|
301
|
-
);
|
|
302
|
-
} else if (turn.role === "system") {
|
|
303
|
-
const label = turn.subtype ?? "system";
|
|
304
|
-
out.push(
|
|
305
|
-
renderTextLine({
|
|
306
|
-
source: turn.source,
|
|
307
|
-
text: `[${label}]`,
|
|
308
|
-
withPrefix,
|
|
309
|
-
}),
|
|
310
|
-
);
|
|
311
|
-
} else if (turn.role === "user") {
|
|
312
|
-
for (const block of turn.content) {
|
|
313
|
-
if (block.type === "text") {
|
|
314
|
-
out.push(
|
|
315
|
-
renderTextLine({
|
|
316
|
-
source: turn.source,
|
|
317
|
-
text: `[user] ${block.text}`,
|
|
318
|
-
withPrefix,
|
|
319
|
-
}),
|
|
320
|
-
);
|
|
321
|
-
}
|
|
322
|
-
}
|
|
323
|
-
}
|
|
265
|
+
out.push(...renderTurnLines(turn, withPrefix));
|
|
324
266
|
}
|
|
325
267
|
|
|
326
|
-
|
|
327
|
-
let tail = "";
|
|
328
|
-
if (this.result) {
|
|
329
|
-
const duration = formatDuration(this.result.durationMs);
|
|
330
|
-
const cost = Number(this.result.totalCostUsd).toFixed(4);
|
|
331
|
-
tail =
|
|
332
|
-
"\n" +
|
|
333
|
-
`--- Result: ${this.result.result} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`;
|
|
334
|
-
}
|
|
268
|
+
const tail = this.#formatResultTail();
|
|
335
269
|
|
|
336
270
|
// Each rendered line already ends with `\n`; concatenate, drop the
|
|
337
271
|
// trailing newline, then append the tail so the output shape stays
|
|
@@ -341,6 +275,20 @@ export class TraceCollector {
|
|
|
341
275
|
const body = out.join("").replace(/\n$/, "");
|
|
342
276
|
return body + tail;
|
|
343
277
|
}
|
|
278
|
+
|
|
279
|
+
/**
|
|
280
|
+
* Format the trailing result summary line (spec 540).
|
|
281
|
+
* @returns {string}
|
|
282
|
+
*/
|
|
283
|
+
#formatResultTail() {
|
|
284
|
+
if (!this.result) return "";
|
|
285
|
+
const duration = formatDuration(this.result.durationMs);
|
|
286
|
+
const cost = Number(this.result.totalCostUsd).toFixed(4);
|
|
287
|
+
return (
|
|
288
|
+
"\n" +
|
|
289
|
+
`--- Result: ${this.result.result} | Turns: ${this.result.numTurns} | Cost: $${cost} | Duration: ${duration} ---`
|
|
290
|
+
);
|
|
291
|
+
}
|
|
344
292
|
}
|
|
345
293
|
|
|
346
294
|
/**
|
package/src/trace-github.js
CHANGED
|
@@ -48,7 +48,6 @@ export class TraceGitHub {
|
|
|
48
48
|
const data = await this.#get(url);
|
|
49
49
|
const runs = data.workflow_runs ?? [];
|
|
50
50
|
|
|
51
|
-
// eslint-disable-next-line security/detect-non-literal-regexp -- pattern is caller-controlled, not untrusted input
|
|
52
51
|
const re = new RegExp(pattern, "i");
|
|
53
52
|
return runs
|
|
54
53
|
.filter((r) => re.test(r.name))
|
package/src/trace-query.js
CHANGED
|
@@ -81,24 +81,12 @@ export class TraceQuery {
|
|
|
81
81
|
*/
|
|
82
82
|
filter(opts = {}) {
|
|
83
83
|
const { role, toolName, isError } = opts;
|
|
84
|
-
return this.turns.filter(
|
|
85
|
-
|
|
86
|
-
|
|
87
|
-
|
|
88
|
-
|
|
89
|
-
|
|
90
|
-
if (toolName !== undefined) {
|
|
91
|
-
if (turn.role === "assistant") {
|
|
92
|
-
const has = turn.content.some(
|
|
93
|
-
(b) => b.type === "tool_use" && b.name === toolName,
|
|
94
|
-
);
|
|
95
|
-
if (!has) return false;
|
|
96
|
-
} else {
|
|
97
|
-
return false;
|
|
98
|
-
}
|
|
99
|
-
}
|
|
100
|
-
return true;
|
|
101
|
-
});
|
|
84
|
+
return this.turns.filter(
|
|
85
|
+
(turn) =>
|
|
86
|
+
matchesRole(turn, role) &&
|
|
87
|
+
matchesError(turn, isError) &&
|
|
88
|
+
matchesToolName(turn, toolName),
|
|
89
|
+
);
|
|
102
90
|
}
|
|
103
91
|
|
|
104
92
|
/** @returns {number} */
|
|
@@ -151,7 +139,6 @@ export class TraceQuery {
|
|
|
151
139
|
*/
|
|
152
140
|
search(pattern, opts = {}) {
|
|
153
141
|
const { context = 0, limit = 50, full = false } = opts;
|
|
154
|
-
// eslint-disable-next-line security/detect-non-literal-regexp -- pattern is caller-controlled, not untrusted input
|
|
155
142
|
const re = new RegExp(pattern, "gi");
|
|
156
143
|
const hits = [];
|
|
157
144
|
|
|
@@ -200,30 +187,18 @@ export class TraceQuery {
|
|
|
200
187
|
* @returns {object[]}
|
|
201
188
|
*/
|
|
202
189
|
tool(name) {
|
|
203
|
-
const toolUseIds =
|
|
204
|
-
const
|
|
205
|
-
|
|
206
|
-
|
|
207
|
-
|
|
208
|
-
|
|
209
|
-
|
|
210
|
-
|
|
211
|
-
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
toolUseIds.add(b.toolUseId);
|
|
216
|
-
}
|
|
217
|
-
}
|
|
218
|
-
}
|
|
219
|
-
} else if (
|
|
220
|
-
turn.role === "tool_result" &&
|
|
221
|
-
toolUseIds.has(turn.toolUseId)
|
|
222
|
-
) {
|
|
223
|
-
results.push(turn);
|
|
224
|
-
}
|
|
225
|
-
}
|
|
226
|
-
return results;
|
|
190
|
+
const toolUseIds = collectToolUseIds(this.turns, name);
|
|
191
|
+
const assistantTurns = this.turns.filter(
|
|
192
|
+
(t) =>
|
|
193
|
+
t.role === "assistant" &&
|
|
194
|
+
t.content.some((b) => b.type === "tool_use" && b.name === name),
|
|
195
|
+
);
|
|
196
|
+
const resultTurns = this.turns.filter(
|
|
197
|
+
(t) => t.role === "tool_result" && toolUseIds.has(t.toolUseId),
|
|
198
|
+
);
|
|
199
|
+
return [...assistantTurns, ...resultTurns].sort(
|
|
200
|
+
(a, b) => a.index - b.index,
|
|
201
|
+
);
|
|
227
202
|
}
|
|
228
203
|
|
|
229
204
|
/**
|
|
@@ -343,6 +318,57 @@ export class TraceQuery {
|
|
|
343
318
|
}
|
|
344
319
|
}
|
|
345
320
|
|
|
321
|
+
/**
 * Role predicate for turn filtering. An undefined `role` matches every turn.
 *
 * @param {object} turn
 * @param {string|undefined} role
 * @returns {boolean} True when no role was requested or `turn.role` equals it
 */
function matchesRole(turn, role) {
  if (role === undefined) return true;
  return turn.role === role;
}
|
|
329
|
+
|
|
330
|
+
/**
 * Error-flag predicate for turn filtering. An undefined `isError` matches
 * every turn; otherwise only `tool_result` turns whose `isError` is strictly
 * equal to the requested value match.
 *
 * @param {object} turn
 * @param {boolean|undefined} isError
 * @returns {boolean}
 */
function matchesError(turn, isError) {
  return (
    isError === undefined ||
    (turn.role === "tool_result" && turn.isError === isError)
  );
}
|
|
339
|
+
|
|
340
|
+
/**
 * Tool-name predicate for turn filtering. An undefined `toolName` matches
 * every turn; otherwise only assistant turns containing at least one
 * `tool_use` block with that exact name match.
 *
 * @param {object} turn
 * @param {string|undefined} toolName
 * @returns {boolean}
 */
function matchesToolName(turn, toolName) {
  if (toolName === undefined) return true;
  if (turn.role !== "assistant") return false;

  const callsTool = (block) =>
    block.type === "tool_use" && block.name === toolName;
  return turn.content.some(callsTool);
}
|
|
352
|
+
|
|
353
|
+
/**
 * Gather the `toolUseId` of every `tool_use` block named `name` found in
 * assistant turns. Blocks with a falsy `toolUseId` are left out.
 *
 * @param {object[]} turns
 * @param {string} name
 * @returns {Set<string>} Distinct ids, in first-seen order
 */
function collectToolUseIds(turns, name) {
  const matchingIds = turns
    .filter((turn) => turn.role === "assistant")
    .flatMap((turn) => turn.content)
    .filter(
      (block) =>
        block.type === "tool_use" && block.name === name && block.toolUseId,
    )
    .map((block) => block.toolUseId);
  return new Set(matchingIds);
}
|
|
371
|
+
|
|
346
372
|
/**
|
|
347
373
|
* Search a single turn for regex matches. Returns array of match descriptions.
|
|
348
374
|
* @param {object} turn
|