npm - @forwardimpact/libeval - Versions diffs - 0.1.5 → 0.1.6 - Mend

@forwardimpact/libeval 0.1.5 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (5) hide show

package/package.json +1 -1
package/src/supervisor.js +38 -5
package/src/trace-collector.js +7 -0
package/test/supervisor.test.js +61 -0
package/test/trace-collector.test.js +96 -0

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "@forwardimpact/libeval",
-  "version": "0.1.5",
+  "version": "0.1.6",
   "description": "Process Claude Code stream-json output into structured traces",
   "license": "Apache-2.0",
   "author": "D. Olsson <hi@senzilla.io>",

package/src/supervisor.js CHANGED Viewed

@@ -53,6 +53,15 @@ export class Supervisor {
     this.currentSource = "agent";
     /** @type {number} */
     this.currentTurn = 0;
+    /**
+     * Set to true when any supervisor message contains the success signal.
+     * The SDK result text only reflects the last assistant message, so when
+     * the supervisor writes EVALUATION_SUCCESSFUL in an early message and
+     * then continues with follow-up work, the result text won't contain it.
+     * This flag captures the signal from the full message stream.
+     * @type {boolean}
+     */
+    this.successSignalSeen = false;
   }
   /**
@@ -66,6 +75,7 @@ export class Supervisor {
     // Turn 0: Supervisor receives the task and introduces it to the agent
     this.currentSource = "supervisor";
     this.currentTurn = 0;
+    this.successSignalSeen = false;
     let supervisorResult = await this.supervisorRunner.run(task);
     if (supervisorResult.error) {
@@ -73,9 +83,12 @@ export class Supervisor {
       return { success: false, turns: 0 };
     }
-    // The supervisor's turn is fully complete (all tool calls executed) by the
-    // time we check the signal — no work is interrupted.
-    if (isSuccessful(supervisorResult.text)) {
+    // Check for the success signal in either the SDK result text or the
+    // streamed message content. The SDK result text only reflects the last
+    // assistant message, so when the supervisor writes EVALUATION_SUCCESSFUL
+    // early and then continues (e.g. filing issues), we must also check the
+    // flag set by emitLine during streaming.
+    if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
       this.emitSummary({ success: true, turns: 0 });
       return { success: true, turns: 0 };
     }
@@ -106,6 +119,7 @@ export class Supervisor {
       this.currentSource = "supervisor";
       this.currentTurn = turn;
+      this.successSignalSeen = false;
       supervisorResult = await this.supervisorRunner.resume(supervisorPrompt);
       if (supervisorResult.error) {
@@ -113,8 +127,9 @@ export class Supervisor {
         return { success: false, turns: turn };
       }
-      // The supervisor's turn is fully complete — check for success signal.
-      if (isSuccessful(supervisorResult.text)) {
+      // The supervisor's turn is fully complete — check for success signal
+      // in either the SDK result text or streamed messages.
+      if (this.successSignalSeen || isSuccessful(supervisorResult.text)) {
         this.emitSummary({ success: true, turns: turn });
         return { success: true, turns: turn };
       }
@@ -142,6 +157,9 @@ export class Supervisor {
   /**
    * Emit a single NDJSON line tagged with the current source and turn.
    * Called in real-time via the AgentRunner onLine callback.
+   *
+   * When the current source is the supervisor, also scans assistant text
+   * content for the EVALUATION_SUCCESSFUL signal and sets successSignalSeen.
    * @param {string} line - Raw NDJSON line from the runner
    */
   emitLine(line) {
@@ -152,6 +170,21 @@ export class Supervisor {
       event,
     };
     this.output.write(JSON.stringify(tagged) + "\n");
+    // Scan supervisor assistant messages for the success signal in real time.
+    // The SDK result text only reflects the final assistant message, but the
+    // supervisor may write EVALUATION_SUCCESSFUL in an earlier message and
+    // then continue with follow-up tool calls.
+    if (this.currentSource === "supervisor" && event.type === "assistant") {
+      const content = event.message?.content ?? event.content ?? [];
+      if (Array.isArray(content)) {
+        for (const block of content) {
+          if (block.type === "text" && isSuccessful(block.text)) {
+            this.successSignalSeen = true;
+          }
+        }
+      }
+    }
   }
   /**

package/src/trace-collector.js CHANGED Viewed

@@ -38,6 +38,13 @@ export class TraceCollector {
       return;
     }
+    // Unwrap combined supervised trace format {source, turn, event}.
+    // The Supervisor emits this wrapper; when replayed through addLine the
+    // inner event is the one we need.
+    if (event.event && !event.type && typeof event.source === "string") {
+      event = event.event;
+    }
     switch (event.type) {
       case "system":
         this.handleSystem(event);

package/test/supervisor.test.js CHANGED Viewed

@@ -189,6 +189,67 @@ describe("Supervisor", () => {
     assert.strictEqual(result.turns, 1);
   });
+  test("detects EVALUATION_SUCCESSFUL in streamed messages when result text differs", async () => {
+    // Simulates the real failure: supervisor writes EVALUATION_SUCCESSFUL in
+    // an early message, then continues with follow-up work (e.g. filing issues).
+    // The SDK result text reflects only the final message, which does NOT
+    // contain the signal.
+    const agentRunner = createMockRunner([
+      { text: "I installed the packages." },
+    ]);
+    // The supervisor's result text is the Summary (no signal), but messages
+    // include one with EVALUATION_SUCCESSFUL.
+    const supervisorMessages = [
+      undefined, // turn 0: use default
+      [
+        {
+          type: "assistant",
+          message: {
+            content: [
+              {
+                type: "text",
+                text: "Good work.\n\nEVALUATION_SUCCESSFUL\n\nNow filing issues.",
+              },
+            ],
+          },
+        },
+        {
+          type: "assistant",
+          message: {
+            content: [
+              { type: "text", text: "## Summary\n\nAll issues filed." },
+            ],
+          },
+        },
+      ],
+    ];
+    const supervisorRunner = createMockRunner(
+      [
+        { text: "Welcome! Please install the packages." },
+        // Result text is the final message — does NOT contain the signal
+        { text: "## Summary\n\nAll issues filed." },
+      ],
+      supervisorMessages,
+    );
+    const output = new PassThrough();
+    const supervisor = new Supervisor({
+      agentRunner,
+      supervisorRunner,
+      output,
+      maxTurns: 10,
+    });
+    agentRunner.onLine = (line) => supervisor.emitLine(line);
+    supervisorRunner.onLine = (line) => supervisor.emitLine(line);
+    const result = await supervisor.run("Install stuff");
+    assert.strictEqual(result.success, true);
+    assert.strictEqual(result.turns, 1);
+  });
   test("runs multiple turns before completion", async () => {
     const agentRunner = createMockRunner([
       { text: "Started working." },

package/test/trace-collector.test.js CHANGED Viewed

@@ -149,6 +149,102 @@ describe("TraceCollector", () => {
       assert.strictEqual(trace.summary.tokenUsage.inputTokens, 5000);
     });
+    test("unwraps combined supervised trace format {source, turn, event}", () => {
+      const collector = new TraceCollector();
+      // System init wrapped in supervisor envelope
+      collector.addLine(
+        JSON.stringify({
+          source: "agent",
+          turn: 0,
+          event: {
+            type: "system",
+            subtype: "init",
+            session_id: "sess-supervised",
+            model: "claude-opus-4-6",
+            tools: ["Bash"],
+          },
+        }),
+      );
+      // Assistant message wrapped in supervisor envelope
+      collector.addLine(
+        JSON.stringify({
+          source: "agent",
+          turn: 1,
+          event: {
+            type: "assistant",
+            message: {
+              content: [{ type: "text", text: "I ran the tests." }],
+              usage: { input_tokens: 100, output_tokens: 50 },
+            },
+          },
+        }),
+      );
+      // Tool result wrapped in supervisor envelope
+      collector.addLine(
+        JSON.stringify({
+          source: "agent",
+          turn: 1,
+          event: {
+            type: "user",
+            message: {
+              role: "user",
+              content: [
+                {
+                  type: "tool_result",
+                  tool_use_id: "toolu_sup",
+                  content: "All tests passed",
+                },
+              ],
+            },
+          },
+        }),
+      );
+      // Result event wrapped in supervisor envelope
+      collector.addLine(
+        JSON.stringify({
+          source: "supervisor",
+          turn: 1,
+          event: {
+            type: "result",
+            subtype: "success",
+            total_cost_usd: 0.44,
+            duration_ms: 30000,
+            num_turns: 2,
+          },
+        }),
+      );
+      const trace = collector.toJSON();
+      assert.strictEqual(trace.metadata.sessionId, "sess-supervised");
+      assert.strictEqual(trace.turns.length, 2);
+      assert.strictEqual(trace.turns[0].role, "assistant");
+      assert.strictEqual(trace.turns[0].content[0].text, "I ran the tests.");
+      assert.strictEqual(trace.turns[1].role, "tool_result");
+      assert.strictEqual(trace.turns[1].content, "All tests passed");
+      assert.strictEqual(trace.summary.result, "success");
+      assert.strictEqual(trace.summary.totalCostUsd, 0.44);
+    });
+    test("skips orchestrator summary lines from supervised traces", () => {
+      const collector = new TraceCollector();
+      collector.addLine(
+        JSON.stringify({
+          source: "orchestrator",
+          type: "summary",
+          success: true,
+          turns: 3,
+        }),
+      );
+      // Orchestrator summaries carry no inner event, so nothing is unwrapped;
+      // their "summary" type should not produce a turn — silently skipped.
+      assert.strictEqual(collector.toJSON().turns.length, 0);
+    });
     test("skips rate_limit_event and unknown types", () => {
       const collector = new TraceCollector();
       collector.addLine(