@forwardimpact/libeval 0.1.5 → 0.1.6

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@forwardimpact/libeval",
3
- "version": "0.1.5",
3
+ "version": "0.1.6",
4
4
  "description": "Process Claude Code stream-json output into structured traces",
5
5
  "license": "Apache-2.0",
6
6
  "author": "D. Olsson <hi@senzilla.io>",
package/src/supervisor.js CHANGED
@@ -53,6 +53,15 @@ export class Supervisor {
53
53
  this.currentSource = "agent";
54
54
  /** @type {number} */
55
55
  this.currentTurn = 0;
56
+ /**
57
+ * Set to true when any supervisor message contains the success signal.
58
+ * The SDK result text only reflects the last assistant message, so when
59
+ * the supervisor writes EVALUATION_SUCCESSFUL in an early message and
60
+ * then continues with follow-up work, the result text won't contain it.
61
+ * This flag captures the signal from the full message stream.
62
+ * @type {boolean}
63
+ */
64
+ this.successSignalSeen = false;
56
65
  }
57
66
 
58
67
  /**
@@ -66,6 +75,7 @@ export class Supervisor {
66
75
  // Turn 0: Supervisor receives the task and introduces it to the agent
67
76
  this.currentSource = "supervisor";
68
77
  this.currentTurn = 0;
78
+ this.successSignalSeen = false;
69
79
  let supervisorResult = await this.supervisorRunner.run(task);
70
80
 
71
81
  if (supervisorResult.error) {
@@ -73,9 +83,12 @@ export class Supervisor {
73
83
  return { success: false, turns: 0 };
74
84
  }
75
85
 
76
- // The supervisor's turn is fully complete (all tool calls executed) by the
77
- // time we check the signal no work is interrupted.
78
- if (isSuccessful(supervisorResult.text)) {
86
+ // Check for the success signal in either the SDK result text or the
87
+ // streamed message content. The SDK result text only reflects the last
88
+ // assistant message, so when the supervisor writes EVALUATION_SUCCESSFUL
89
+ // early and then continues (e.g. filing issues), we must also check the
90
+ // flag set by emitLine during streaming.
91
+ if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
79
92
  this.emitSummary({ success: true, turns: 0 });
80
93
  return { success: true, turns: 0 };
81
94
  }
@@ -106,6 +119,7 @@ export class Supervisor {
106
119
 
107
120
  this.currentSource = "supervisor";
108
121
  this.currentTurn = turn;
122
+ this.successSignalSeen = false;
109
123
  supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
110
124
 
111
125
  if (supervisorResult.error) {
@@ -113,8 +127,9 @@ export class Supervisor {
113
127
  return { success: false, turns: turn };
114
128
  }
115
129
 
116
- // The supervisor's turn is fully complete — check for success signal.
117
- if (isSuccessful(supervisorResult.text)) {
130
+ // The supervisor's turn is fully complete — check for success signal
131
+ // in either the SDK result text or streamed messages.
132
+ if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
118
133
  this.emitSummary({ success: true, turns: turn });
119
134
  return { success: true, turns: turn };
120
135
  }
@@ -142,6 +157,9 @@ export class Supervisor {
142
157
  /**
143
158
  * Emit a single NDJSON line tagged with the current source and turn.
144
159
  * Called in real-time via the AgentRunner onLine callback.
160
+ *
161
+ * When the current source is the supervisor, also scans assistant text
162
+ * content for the EVALUATION_SUCCESSFUL signal and sets successSignalSeen.
145
163
  * @param {string} line - Raw NDJSON line from the runner
146
164
  */
147
165
  emitLine(line) {
@@ -152,6 +170,21 @@ export class Supervisor {
152
170
  event,
153
171
  };
154
172
  this.output.write(JSON.stringify(tagged) + "\n");
173
+
174
+ // Scan supervisor assistant messages for the success signal in real time.
175
+ // The SDK result text only reflects the final assistant message, but the
176
+ // supervisor may write EVALUATION_SUCCESSFUL in an earlier message and
177
+ // then continue with follow-up tool calls.
178
+ if (this.currentSource === "supervisor" && event.type === "assistant") {
179
+ const content = event.message?.content ?? event.content ?? [];
180
+ if (Array.isArray(content)) {
181
+ for (const block of content) {
182
+ if (block.type === "text" && isSuccessful(block.text)) {
183
+ this.successSignalSeen = true;
184
+ }
185
+ }
186
+ }
187
+ }
155
188
  }
156
189
 
157
190
  /**
@@ -38,6 +38,13 @@ export class TraceCollector {
38
38
  return;
39
39
  }
40
40
 
41
+ // Unwrap combined supervised trace format {source, turn, event}.
42
+ // The Supervisor emits this wrapper; when replayed through addLine the
43
+ // inner event is the one we need.
44
+ if (event.event && !event.type && typeof event.source === "string") {
45
+ event = event.event;
46
+ }
47
+
41
48
  switch (event.type) {
42
49
  case "system":
43
50
  this.handleSystem(event);
@@ -189,6 +189,67 @@ describe("Supervisor", () => {
189
189
  assert.strictEqual(result.turns, 1);
190
190
  });
191
191
 
192
+ test("detects EVALUATION_SUCCESSFUL in streamed messages when result text differs", async () => {
193
+ // Simulates the real failure: supervisor writes EVALUATION_SUCCESSFUL in
194
+ // an early message, then continues with follow-up work (e.g. filing issues).
195
+ // The SDK result text reflects only the final message, which does NOT
196
+ // contain the signal.
197
+ const agentRunner = createMockRunner([
198
+ { text: "I installed the packages." },
199
+ ]);
200
+
201
+ // The supervisor's result text is the Summary (no signal), but messages
202
+ // include one with EVALUATION_SUCCESSFUL.
203
+ const supervisorMessages = [
204
+ undefined, // turn 0: use default
205
+ [
206
+ {
207
+ type: "assistant",
208
+ message: {
209
+ content: [
210
+ {
211
+ type: "text",
212
+ text: "Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.",
213
+ },
214
+ ],
215
+ },
216
+ },
217
+ {
218
+ type: "assistant",
219
+ message: {
220
+ content: [
221
+ { type: "text", text: "## Summary\n\nAll issues filed." },
222
+ ],
223
+ },
224
+ },
225
+ ],
226
+ ];
227
+
228
+ const supervisorRunner = createMockRunner(
229
+ [
230
+ { text: "Welcome! Please install the packages." },
231
+ // Result text is the final message — does NOT contain the signal
232
+ { text: "## Summary\n\nAll issues filed." },
233
+ ],
234
+ supervisorMessages,
235
+ );
236
+
237
+ const output = new PassThrough();
238
+ const supervisor = new Supervisor({
239
+ agentRunner,
240
+ supervisorRunner,
241
+ output,
242
+ maxTurns: 10,
243
+ });
244
+ agentRunner.onLine = (line) => supervisor.emitLine(line);
245
+ supervisorRunner.onLine = (line) => supervisor.emitLine(line);
246
+
247
+ const result = await supervisor.run("Install stuff");
248
+
249
+ assert.strictEqual(result.success, true);
250
+ assert.strictEqual(result.turns, 1);
251
+ });
252
+
192
253
  test("runs multiple turns before completion", async () => {
193
254
  const agentRunner = createMockRunner([
194
255
  { text: "Started working." },
@@ -149,6 +149,102 @@ describe("TraceCollector", () => {
149
149
  assert.strictEqual(trace.summary.tokenUsage.inputTokens, 5000);
150
150
  });
151
151
 
152
+ test("unwraps combined supervised trace format {source, turn, event}", () => {
153
+ const collector = new TraceCollector();
154
+
155
+ // System init wrapped in supervisor envelope
156
+ collector.addLine(
157
+ JSON.stringify({
158
+ source: "agent",
159
+ turn: 0,
160
+ event: {
161
+ type: "system",
162
+ subtype: "init",
163
+ session_id: "sess-supervised",
164
+ model: "claude-opus-4-6",
165
+ tools: ["Bash"],
166
+ },
167
+ }),
168
+ );
169
+
170
+ // Assistant message wrapped in supervisor envelope
171
+ collector.addLine(
172
+ JSON.stringify({
173
+ source: "agent",
174
+ turn: 1,
175
+ event: {
176
+ type: "assistant",
177
+ message: {
178
+ content: [{ type: "text", text: "I ran the tests." }],
179
+ usage: { input_tokens: 100, output_tokens: 50 },
180
+ },
181
+ },
182
+ }),
183
+ );
184
+
185
+ // Tool result wrapped in supervisor envelope
186
+ collector.addLine(
187
+ JSON.stringify({
188
+ source: "agent",
189
+ turn: 1,
190
+ event: {
191
+ type: "user",
192
+ message: {
193
+ role: "user",
194
+ content: [
195
+ {
196
+ type: "tool_result",
197
+ tool_use_id: "toolu_sup",
198
+ content: "All tests passed",
199
+ },
200
+ ],
201
+ },
202
+ },
203
+ }),
204
+ );
205
+
206
+ // Result event wrapped in supervisor envelope
207
+ collector.addLine(
208
+ JSON.stringify({
209
+ source: "supervisor",
210
+ turn: 1,
211
+ event: {
212
+ type: "result",
213
+ subtype: "success",
214
+ total_cost_usd: 0.44,
215
+ duration_ms: 30000,
216
+ num_turns: 2,
217
+ },
218
+ }),
219
+ );
220
+
221
+ const trace = collector.toJSON();
222
+ assert.strictEqual(trace.metadata.sessionId, "sess-supervised");
223
+ assert.strictEqual(trace.turns.length, 2);
224
+ assert.strictEqual(trace.turns[0].role, "assistant");
225
+ assert.strictEqual(trace.turns[0].content[0].text, "I ran the tests.");
226
+ assert.strictEqual(trace.turns[1].role, "tool_result");
227
+ assert.strictEqual(trace.turns[1].content, "All tests passed");
228
+ assert.strictEqual(trace.summary.result, "success");
229
+ assert.strictEqual(trace.summary.totalCostUsd, 0.44);
230
+ });
231
+
232
+ test("skips orchestrator summary lines from supervised traces", () => {
233
+ const collector = new TraceCollector();
234
+ collector.addLine(
235
+ JSON.stringify({
236
+ source: "orchestrator",
237
+ type: "summary",
238
+ success: true,
239
+ turns: 3,
240
+ }),
241
+ );
242
+
243
+ // Orchestrator summaries have no inner event and no recognized type
244
+ // after unwrap — they should be silently skipped.
245
+ assert.strictEqual(collector.toJSON().turns.length, 0);
246
+ });
247
+
152
248
  test("skips rate_limit_event and unknown types", () => {
153
249
  const collector = new TraceCollector();
154
250
  collector.addLine(