@forwardimpact/libeval 0.1.5 → 0.1.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/package.json +1 -1
- package/src/supervisor.js +38 -5
- package/src/trace-collector.js +7 -0
- package/test/supervisor.test.js +61 -0
- package/test/trace-collector.test.js +96 -0
package/package.json
CHANGED
package/src/supervisor.js
CHANGED
|
@@ -53,6 +53,15 @@ export class Supervisor {
|
|
|
53
53
|
this.currentSource = "agent";
|
|
54
54
|
/** @type {number} */
|
|
55
55
|
this.currentTurn = 0;
|
|
56
|
+
/**
|
|
57
|
+
* Set to true when any supervisor message contains the success signal.
|
|
58
|
+
* The SDK result text only reflects the last assistant message, so when
|
|
59
|
+
* the supervisor writes EVALUATION_SUCCESSFUL in an early message and
|
|
60
|
+
* then continues with follow-up work, the result text won't contain it.
|
|
61
|
+
* This flag captures the signal from the full message stream.
|
|
62
|
+
* @type {boolean}
|
|
63
|
+
*/
|
|
64
|
+
this.successSignalSeen = false;
|
|
56
65
|
}
|
|
57
66
|
|
|
58
67
|
/**
|
|
@@ -66,6 +75,7 @@ export class Supervisor {
|
|
|
66
75
|
// Turn 0: Supervisor receives the task and introduces it to the agent
|
|
67
76
|
this.currentSource = "supervisor";
|
|
68
77
|
this.currentTurn = 0;
|
|
78
|
+
this.successSignalSeen = false;
|
|
69
79
|
let supervisorResult = await this.supervisorRunner.run(task);
|
|
70
80
|
|
|
71
81
|
if (supervisorResult.error) {
|
|
@@ -73,9 +83,12 @@ export class Supervisor {
|
|
|
73
83
|
return { success: false, turns: 0 };
|
|
74
84
|
}
|
|
75
85
|
|
|
76
|
-
//
|
|
77
|
-
//
|
|
78
|
-
|
|
86
|
+
// Check for the success signal in either the SDK result text or the
|
|
87
|
+
// streamed message content. The SDK result text only reflects the last
|
|
88
|
+
// assistant message, so when the supervisor writes EVALUATION_SUCCESSFUL
|
|
89
|
+
// early and then continues (e.g. filing issues), we must also check the
|
|
90
|
+
// flag set by emitLine during streaming.
|
|
91
|
+
if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
|
|
79
92
|
this.emitSummary({ success: true, turns: 0 });
|
|
80
93
|
return { success: true, turns: 0 };
|
|
81
94
|
}
|
|
@@ -106,6 +119,7 @@ export class Supervisor {
|
|
|
106
119
|
|
|
107
120
|
this.currentSource = "supervisor";
|
|
108
121
|
this.currentTurn = turn;
|
|
122
|
+
this.successSignalSeen = false;
|
|
109
123
|
supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
|
|
110
124
|
|
|
111
125
|
if (supervisorResult.error) {
|
|
@@ -113,8 +127,9 @@ export class Supervisor {
|
|
|
113
127
|
return { success: false, turns: turn };
|
|
114
128
|
}
|
|
115
129
|
|
|
116
|
-
// The supervisor's turn is fully complete — check for success signal
|
|
117
|
-
|
|
130
|
+
// The supervisor's turn is fully complete — check for success signal
|
|
131
|
+
// in either the SDK result text or streamed messages.
|
|
132
|
+
if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
|
|
118
133
|
this.emitSummary({ success: true, turns: turn });
|
|
119
134
|
return { success: true, turns: turn };
|
|
120
135
|
}
|
|
@@ -142,6 +157,9 @@ export class Supervisor {
|
|
|
142
157
|
/**
|
|
143
158
|
* Emit a single NDJSON line tagged with the current source and turn.
|
|
144
159
|
* Called in real-time via the AgentRunner onLine callback.
|
|
160
|
+
*
|
|
161
|
+
* When the current source is the supervisor, also scans assistant text
|
|
162
|
+
* content for the EVALUATION_SUCCESSFUL signal and sets successSignalSeen.
|
|
145
163
|
* @param {string} line - Raw NDJSON line from the runner
|
|
146
164
|
*/
|
|
147
165
|
emitLine(line) {
|
|
@@ -152,6 +170,21 @@ export class Supervisor {
|
|
|
152
170
|
event,
|
|
153
171
|
};
|
|
154
172
|
this.output.write(JSON.stringify(tagged) + "\n");
|
|
173
|
+
|
|
174
|
+
// Scan supervisor assistant messages for the success signal in real time.
|
|
175
|
+
// The SDK result text only reflects the final assistant message, but the
|
|
176
|
+
// supervisor may write EVALUATION_SUCCESSFUL in an earlier message and
|
|
177
|
+
// then continue with follow-up tool calls.
|
|
178
|
+
if (this.currentSource === "supervisor" && event.type === "assistant") {
|
|
179
|
+
const content = event.message?.content ?? event.content ?? [];
|
|
180
|
+
if (Array.isArray(content)) {
|
|
181
|
+
for (const block of content) {
|
|
182
|
+
if (block.type === "text" && isSuccessful(block.text)) {
|
|
183
|
+
this.successSignalSeen = true;
|
|
184
|
+
}
|
|
185
|
+
}
|
|
186
|
+
}
|
|
187
|
+
}
|
|
155
188
|
}
|
|
156
189
|
|
|
157
190
|
/**
|
package/src/trace-collector.js
CHANGED
|
@@ -38,6 +38,13 @@ export class TraceCollector {
|
|
|
38
38
|
return;
|
|
39
39
|
}
|
|
40
40
|
|
|
41
|
+
// Unwrap combined supervised trace format {source, turn, event}.
|
|
42
|
+
// The Supervisor emits this wrapper; when replayed through addLine the
|
|
43
|
+
// inner event is the one we need.
|
|
44
|
+
if (event.event && !event.type && typeof event.source === "string") {
|
|
45
|
+
event = event.event;
|
|
46
|
+
}
|
|
47
|
+
|
|
41
48
|
switch (event.type) {
|
|
42
49
|
case "system":
|
|
43
50
|
this.handleSystem(event);
|
package/test/supervisor.test.js
CHANGED
|
@@ -189,6 +189,67 @@ describe("Supervisor", () => {
|
|
|
189
189
|
assert.strictEqual(result.turns, 1);
|
|
190
190
|
});
|
|
191
191
|
|
|
192
|
+
test("detects EVALUATION_SUCCESSFUL in streamed messages when result text differs", async () => {
|
|
193
|
+
// Simulates the real failure: supervisor writes EVALUATION_SUCCESSFUL in
|
|
194
|
+
// an early message, then continues with follow-up work (e.g. filing issues).
|
|
195
|
+
// The SDK result text reflects only the final message, which does NOT
|
|
196
|
+
// contain the signal.
|
|
197
|
+
const agentRunner = createMockRunner([
|
|
198
|
+
{ text: "I installed the packages." },
|
|
199
|
+
]);
|
|
200
|
+
|
|
201
|
+
// The supervisor's result text is the Summary (no signal), but messages
|
|
202
|
+
// include one with EVALUATION_SUCCESSFUL.
|
|
203
|
+
const supervisorMessages = [
|
|
204
|
+
undefined, // turn 0: use default
|
|
205
|
+
[
|
|
206
|
+
{
|
|
207
|
+
type: "assistant",
|
|
208
|
+
message: {
|
|
209
|
+
content: [
|
|
210
|
+
{
|
|
211
|
+
type: "text",
|
|
212
|
+
text: "Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.",
|
|
213
|
+
},
|
|
214
|
+
],
|
|
215
|
+
},
|
|
216
|
+
},
|
|
217
|
+
{
|
|
218
|
+
type: "assistant",
|
|
219
|
+
message: {
|
|
220
|
+
content: [
|
|
221
|
+
{ type: "text", text: "## Summary\n\nAll issues filed." },
|
|
222
|
+
],
|
|
223
|
+
},
|
|
224
|
+
},
|
|
225
|
+
],
|
|
226
|
+
];
|
|
227
|
+
|
|
228
|
+
const supervisorRunner = createMockRunner(
|
|
229
|
+
[
|
|
230
|
+
{ text: "Welcome! Please install the packages." },
|
|
231
|
+
// Result text is the final message — does NOT contain the signal
|
|
232
|
+
{ text: "## Summary\n\nAll issues filed." },
|
|
233
|
+
],
|
|
234
|
+
supervisorMessages,
|
|
235
|
+
);
|
|
236
|
+
|
|
237
|
+
const output = new PassThrough();
|
|
238
|
+
const supervisor = new Supervisor({
|
|
239
|
+
agentRunner,
|
|
240
|
+
supervisorRunner,
|
|
241
|
+
output,
|
|
242
|
+
maxTurns: 10,
|
|
243
|
+
});
|
|
244
|
+
agentRunner.onLine = (line) => supervisor.emitLine(line);
|
|
245
|
+
supervisorRunner.onLine = (line) => supervisor.emitLine(line);
|
|
246
|
+
|
|
247
|
+
const result = await supervisor.run("Install stuff");
|
|
248
|
+
|
|
249
|
+
assert.strictEqual(result.success, true);
|
|
250
|
+
assert.strictEqual(result.turns, 1);
|
|
251
|
+
});
|
|
252
|
+
|
|
192
253
|
test("runs multiple turns before completion", async () => {
|
|
193
254
|
const agentRunner = createMockRunner([
|
|
194
255
|
{ text: "Started working." },
|
|
package/test/trace-collector.test.js
CHANGED
|
@@ -149,6 +149,102 @@ describe("TraceCollector", () => {
|
|
|
149
149
|
assert.strictEqual(trace.summary.tokenUsage.inputTokens, 5000);
|
|
150
150
|
});
|
|
151
151
|
|
|
152
|
+
test("unwraps combined supervised trace format {source, turn, event}", () => {
|
|
153
|
+
const collector = new TraceCollector();
|
|
154
|
+
|
|
155
|
+
// System init wrapped in supervisor envelope
|
|
156
|
+
collector.addLine(
|
|
157
|
+
JSON.stringify({
|
|
158
|
+
source: "agent",
|
|
159
|
+
turn: 0,
|
|
160
|
+
event: {
|
|
161
|
+
type: "system",
|
|
162
|
+
subtype: "init",
|
|
163
|
+
session_id: "sess-supervised",
|
|
164
|
+
model: "claude-opus-4-6",
|
|
165
|
+
tools: ["Bash"],
|
|
166
|
+
},
|
|
167
|
+
}),
|
|
168
|
+
);
|
|
169
|
+
|
|
170
|
+
// Assistant message wrapped in supervisor envelope
|
|
171
|
+
collector.addLine(
|
|
172
|
+
JSON.stringify({
|
|
173
|
+
source: "agent",
|
|
174
|
+
turn: 1,
|
|
175
|
+
event: {
|
|
176
|
+
type: "assistant",
|
|
177
|
+
message: {
|
|
178
|
+
content: [{ type: "text", text: "I ran the tests." }],
|
|
179
|
+
usage: { input_tokens: 100, output_tokens: 50 },
|
|
180
|
+
},
|
|
181
|
+
},
|
|
182
|
+
}),
|
|
183
|
+
);
|
|
184
|
+
|
|
185
|
+
// Tool result wrapped in supervisor envelope
|
|
186
|
+
collector.addLine(
|
|
187
|
+
JSON.stringify({
|
|
188
|
+
source: "agent",
|
|
189
|
+
turn: 1,
|
|
190
|
+
event: {
|
|
191
|
+
type: "user",
|
|
192
|
+
message: {
|
|
193
|
+
role: "user",
|
|
194
|
+
content: [
|
|
195
|
+
{
|
|
196
|
+
type: "tool_result",
|
|
197
|
+
tool_use_id: "toolu_sup",
|
|
198
|
+
content: "All tests passed",
|
|
199
|
+
},
|
|
200
|
+
],
|
|
201
|
+
},
|
|
202
|
+
},
|
|
203
|
+
}),
|
|
204
|
+
);
|
|
205
|
+
|
|
206
|
+
// Result event wrapped in supervisor envelope
|
|
207
|
+
collector.addLine(
|
|
208
|
+
JSON.stringify({
|
|
209
|
+
source: "supervisor",
|
|
210
|
+
turn: 1,
|
|
211
|
+
event: {
|
|
212
|
+
type: "result",
|
|
213
|
+
subtype: "success",
|
|
214
|
+
total_cost_usd: 0.44,
|
|
215
|
+
duration_ms: 30000,
|
|
216
|
+
num_turns: 2,
|
|
217
|
+
},
|
|
218
|
+
}),
|
|
219
|
+
);
|
|
220
|
+
|
|
221
|
+
const trace = collector.toJSON();
|
|
222
|
+
assert.strictEqual(trace.metadata.sessionId, "sess-supervised");
|
|
223
|
+
assert.strictEqual(trace.turns.length, 2);
|
|
224
|
+
assert.strictEqual(trace.turns[0].role, "assistant");
|
|
225
|
+
assert.strictEqual(trace.turns[0].content[0].text, "I ran the tests.");
|
|
226
|
+
assert.strictEqual(trace.turns[1].role, "tool_result");
|
|
227
|
+
assert.strictEqual(trace.turns[1].content, "All tests passed");
|
|
228
|
+
assert.strictEqual(trace.summary.result, "success");
|
|
229
|
+
assert.strictEqual(trace.summary.totalCostUsd, 0.44);
|
|
230
|
+
});
|
|
231
|
+
|
|
232
|
+
test("skips orchestrator summary lines from supervised traces", () => {
|
|
233
|
+
const collector = new TraceCollector();
|
|
234
|
+
collector.addLine(
|
|
235
|
+
JSON.stringify({
|
|
236
|
+
source: "orchestrator",
|
|
237
|
+
type: "summary",
|
|
238
|
+
success: true,
|
|
239
|
+
turns: 3,
|
|
240
|
+
}),
|
|
241
|
+
);
|
|
242
|
+
|
|
243
|
+
// Orchestrator summaries have no inner event and no recognized type
|
|
244
|
+
// after unwrap — they should be silently skipped.
|
|
245
|
+
assert.strictEqual(collector.toJSON().turns.length, 0);
|
|
246
|
+
});
|
|
247
|
+
|
|
152
248
|
test("skips rate_limit_event and unknown types", () => {
|
|
153
249
|
const collector = new TraceCollector();
|
|
154
250
|
collector.addLine(
|