npm - little-coder - Versions diffs - 1.4.3 → 1.5.0 - Mend

little-coder 1.4.3 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/.pi/extensions/_shared/intervention.test.ts +13 -0
package/.pi/extensions/_shared/intervention.ts +41 -0
package/.pi/extensions/benchmark-profiles/index.ts +27 -9
package/.pi/extensions/benchmark-profiles/profiles.test.ts +53 -44
package/.pi/extensions/clear-command/index.test.ts +37 -0
package/.pi/extensions/clear-command/index.ts +26 -0
package/.pi/extensions/finalize-warn/index.ts +4 -3
package/.pi/extensions/output-parser/index.ts +4 -3
package/.pi/extensions/quality-monitor/index.ts +15 -8
package/.pi/extensions/quality-monitor/quality.test.ts +68 -2
package/.pi/extensions/quality-monitor/quality.ts +17 -0
package/.pi/extensions/thinking-budget/budget.test.ts +170 -132
package/.pi/extensions/thinking-budget/index.ts +118 -52
package/.pi/extensions/turn-cap/index.ts +4 -3
package/.pi/extensions/write-guard/index.ts +57 -67
package/.pi/extensions/write-guard/write-guard.test.ts +102 -2
package/.pi/settings.json +6 -6
package/CHANGELOG.md +26 -0
package/README.md +8 -2
package/bin/little-coder.mjs +12 -0
package/package.json +4 -2
package/scripts/patch-pi.mjs +113 -0
package/scripts/patch-pi.test.mjs +63 -0

package/.pi/extensions/thinking-budget/budget.test.ts CHANGED

@@ -1,4 +1,4 @@
-import { describe, it, expect, beforeEach } from "vitest";
+import { describe, it, expect, beforeEach, afterEach } from "vitest";
 import setupExtension from "./index.ts";
 // Exercise the char→token conversion (matches local/context_manager.py)
@@ -13,170 +13,208 @@ describe("thinking budget token estimation", () => {
     expect(charsToTokens(7)).toBe(2);
     expect(charsToTokens(3500)).toBe(1000);
   });
-  it("2048 tokens ~ 7168 chars", () => {
-    // Budget trigger boundary: ceil(7169/3.5) = 2049 > 2048
-    expect(charsToTokens(7168)).toBe(2048);
-    expect(charsToTokens(7169)).toBeGreaterThan(2048);
+  it("4096 tokens ~ 14336 chars (the v1.5.0 default budget)", () => {
+    expect(charsToTokens(14336)).toBe(4096);
+    expect(charsToTokens(14337)).toBeGreaterThan(4096);
   });
 });
-// ── Issue #8 regression coverage ────────────────────────────────────────
-// Mock just enough of pi's ExtensionAPI for the handler choreography.
-// We capture every registered handler keyed by event name and drive them
-// directly to assert idempotency / sequencing.
+// ── Issue #8 regression coverage (second reproduction, 1.4.3) ───────────────
+// The bug: recovery (setThinkingLevel("off") + sendUserMessage) was deferred to
+// a `turn_end` handler that ran against the module-scope `pi` AFTER ctx.abort()
+// triggered a session replacement → stale `pi` → throw → thinking never turned
+// off + follow-up never sent.
+//
+// The fix: do the whole recovery synchronously in `message_update`, BEFORE
+// ctx.abort(), while `pi` is still live. These tests pin that choreography:
+//   - no `turn_end` handler exists (nothing can run against a stale pi),
+//   - setThinkingLevel + sendUserMessage are ordered strictly before abort,
+//   - thinking is re-asserted off across the restart turn,
+//   - the prior level is restored on the next genuine user input.
 interface Handler {
   (event: any, ctx: any): Promise<unknown> | unknown;
 }
-interface MockPi {
-  on: (name: string, h: Handler) => void;
-  handlers: Record<string, Handler[]>;
-  followUps: string[];
-  thinkingLevels: string[];
-  setThinkingLevel: (lvl: string) => void;
-  sendUserMessage: (msg: string, opts?: any) => void;
-}
-function makePi(): MockPi {
+function makeHarness(initialLevel = "high") {
+  const calls: string[] = []; // ordered log across pi + ctx
+  const followUps: string[] = [];
+  const notifies: string[] = [];
+  let level = initialLevel;
   const handlers: Record<string, Handler[]> = {};
-  return {
+  const pi = {
     handlers,
-    followUps: [],
-    thinkingLevels: [],
-    on(name, h) {
+    on(name: string, h: Handler) {
       (handlers[name] ??= []).push(h);
     },
-    setThinkingLevel(lvl) {
-      this.thinkingLevels.push(lvl);
+    getThinkingLevel() {
+      return level;
     },
-    sendUserMessage(msg, _opts) {
-      this.followUps.push(msg);
+    setThinkingLevel(l: string) {
+      level = l;
+      calls.push(`set:${l}`);
     },
-  } as MockPi;
-}
-function makeCtx() {
-  const aborts: number[] = [];
+    sendUserMessage(m: string) {
+      followUps.push(m);
+      calls.push("send");
+    },
+  };
+  const ctx = {
+    abort() {
+      calls.push("abort");
+    },
+    ui: {
+      notify(m: string) {
+        notifies.push(m);
+        calls.push("notify");
+      },
+    },
+  };
   return {
-    aborts,
-    abort: () => { aborts.push(1); },
-    ui: { notify: (_m: string, _l?: string) => {} },
+    pi,
+    ctx,
+    calls,
+    followUps,
+    notifies,
+    level: () => level,
+    setLevelExternally: (l: string) => {
+      level = l;
+    },
   };
 }
-async function fire(pi: MockPi, name: string, event: any, ctx: any) {
-  for (const h of pi.handlers[name] ?? []) {
-    await h(event, ctx);
-  }
+async function fire(pi: any, name: string, event: any, ctx: any) {
+  for (const h of pi.handlers[name] ?? []) await h(event, ctx);
 }
 function thinkingDelta(s: string) {
   return { assistantMessageEvent: { type: "thinking_delta", delta: s } };
 }
-describe("thinking-budget idempotency (issue #8)", () => {
+// Always begin from a clean session — resets the extension's module-scoped
+// state so cases don't leak `forcedOff` / `priorLevel` into one another (and
+// mirrors real startup: session_start always precedes the first agent run).
+async function startRun(h: ReturnType<typeof makeHarness>) {
+  await fire(h.pi, "session_start", {}, h.ctx);
+  await fire(h.pi, "agent_start", {}, h.ctx);
+  await fire(h.pi, "before_agent_start", { systemPromptOptions: {} }, h.ctx);
+  await fire(h.pi, "turn_start", {}, h.ctx);
+}
+describe("thinking-budget recovery (issue #8)", () => {
   beforeEach(() => {
-    // Force a small budget so we can trigger with short strings.
-    process.env.LITTLE_CODER_THINKING_BUDGET = "10";
+    process.env.LITTLE_CODER_THINKING_BUDGET = "10"; // tiny budget for short strings
+  });
+  afterEach(() => {
+    delete process.env.LITTLE_CODER_THINKING_BUDGET;
+  });
+  it("registers NO turn_end handler (recovery must not run against a stale pi)", () => {
+    const h = makeHarness();
+    setupExtension(h.pi as any);
+    expect(h.pi.handlers["turn_end"]).toBeUndefined();
   });
-  it("fires exactly one abort + one follow-up for a single budget breach across many bursts", async () => {
-    const pi = makePi();
-    const ctx = makeCtx();
-    setupExtension(pi as any);
-    await fire(pi, "agent_start", {}, ctx);
-    await fire(pi, "before_agent_start", { systemPromptOptions: {} }, ctx);
-    await fire(pi, "turn_start", {}, ctx);
-    // Burst: 1000 chars of thinking, way over 10-token budget.
-    await fire(pi, "message_update", thinkingDelta("x".repeat(1000)), ctx);
-    // Second burst on the same turn — must not double-abort.
-    await fire(pi, "message_update", thinkingDelta("y".repeat(1000)), ctx);
-    // Third burst.
-    await fire(pi, "message_update", thinkingDelta("z".repeat(1000)), ctx);
-    await fire(pi, "turn_end", {}, ctx);
-    expect(ctx.aborts.length).toBe(1);
-    expect(pi.followUps.length).toBe(1);
-    expect(pi.followUps[0]).toMatch(/thinking budget exceeded/i);
-    expect(pi.thinkingLevels).toEqual(["off"]);
+  it("on breach, runs the full recovery BEFORE abort and exactly once", async () => {
+    const h = makeHarness();
+    setupExtension(h.pi as any);
+    await startRun(h);
+    await fire(h.pi, "message_update", thinkingDelta("x".repeat(1000)), h.ctx);
+    // setThinkingLevel("off") and sendUserMessage both happen before abort.
+    expect(h.calls).toEqual(["set:off", "send", "notify", "abort"]);
+    expect(h.level()).toBe("off");
+    expect(h.followUps).toHaveLength(1);
+    expect(h.followUps[0]).toMatch(/thinking budget exceeded/i);
+    expect(h.notifies[0]).toMatch(/harness intervention:.*thought long enough/i);
   });
-  it("fires the recovery follow-up only once even if turn_end is re-emitted", async () => {
-    const pi = makePi();
-    const ctx = makeCtx();
-    setupExtension(pi as any);
-    await fire(pi, "agent_start", {}, ctx);
-    await fire(pi, "before_agent_start", { systemPromptOptions: {} }, ctx);
-    await fire(pi, "turn_start", {}, ctx);
-    await fire(pi, "message_update", thinkingDelta("x".repeat(1000)), ctx);
-    await fire(pi, "turn_end", {}, ctx);
-    // Pi can re-emit turn_end during retry / compaction paths — must be a no-op.
-    await fire(pi, "turn_end", {}, ctx);
-    await fire(pi, "turn_end", {}, ctx);
-    expect(ctx.aborts.length).toBe(1);
-    expect(pi.followUps.length).toBe(1);
-    expect(pi.thinkingLevels.length).toBe(1);
+  it("does not double-abort across multiple bursts in the same turn", async () => {
+    const h = makeHarness();
+    setupExtension(h.pi as any);
+    await startRun(h);
+    await fire(h.pi, "message_update", thinkingDelta("x".repeat(1000)), h.ctx);
+    await fire(h.pi, "message_update", thinkingDelta("y".repeat(1000)), h.ctx);
+    await fire(h.pi, "message_update", thinkingDelta("z".repeat(1000)), h.ctx);
+    expect(h.calls.filter((c) => c === "abort")).toHaveLength(1);
+    expect(h.followUps).toHaveLength(1);
   });
-  it("resets state on agent_start so a prior aborted run does not leak", async () => {
-    const pi = makePi();
-    const ctx1 = makeCtx();
-    setupExtension(pi as any);
-    // Run 1: trigger an abort.
-    await fire(pi, "agent_start", {}, ctx1);
-    await fire(pi, "before_agent_start", { systemPromptOptions: {} }, ctx1);
-    await fire(pi, "turn_start", {}, ctx1);
-    await fire(pi, "message_update", thinkingDelta("x".repeat(1000)), ctx1);
-    await fire(pi, "turn_end", {}, ctx1);
-    // Run 2: fresh agent_start — no abort should fire on the first turn
-    // even though run 1 left state behind.
-    const ctx2 = makeCtx();
-    await fire(pi, "agent_start", {}, ctx2);
-    await fire(pi, "before_agent_start", { systemPromptOptions: {} }, ctx2);
-    await fire(pi, "turn_start", {}, ctx2);
-    // A small thinking delta well under budget.
-    await fire(pi, "message_update", thinkingDelta("ok"), ctx2);
-    await fire(pi, "turn_end", {}, ctx2);
-    expect(ctx2.aborts.length).toBe(0);
-    // Total follow-ups: only the one from run 1.
-    expect(pi.followUps.length).toBe(1);
+  it("does not fire under budget", async () => {
+    const h = makeHarness();
+    setupExtension(h.pi as any);
+    await startRun(h);
+    await fire(h.pi, "message_update", thinkingDelta("ok"), h.ctx); // 2 chars < 10 tokens
+    expect(h.calls).toEqual([]);
+    expect(h.level()).toBe("high");
   });
-  it("yields one tick before sendUserMessage so pi's abort barrier can settle", async () => {
-    // We can only assert this indirectly: turn_end must complete the await
-    // chain (it returns a Promise) AFTER setImmediate fires. If it didn't
-    // yield, sendUserMessage would land synchronously inside the same
-    // microtask as ctx.abort(). Verify ordering by interleaving a marker.
-    const pi = makePi();
-    const ctx = makeCtx();
-    setupExtension(pi as any);
-    await fire(pi, "agent_start", {}, ctx);
-    await fire(pi, "before_agent_start", { systemPromptOptions: {} }, ctx);
-    await fire(pi, "turn_start", {}, ctx);
-    await fire(pi, "message_update", thinkingDelta("x".repeat(1000)), ctx);
-    const order: string[] = [];
-    setImmediate(() => order.push("setImmediate-marker"));
-    const turnEndPromise = (pi.handlers["turn_end"] ?? []).reduce<Promise<unknown>>(
-      (p, h) => p.then(() => h({}, ctx)),
-      Promise.resolve(),
+  it("re-asserts thinking off on the restart turn even if pi re-enables it", async () => {
+    const h = makeHarness();
+    setupExtension(h.pi as any);
+    await startRun(h);
+    await fire(h.pi, "message_update", thinkingDelta("x".repeat(1000)), h.ctx); // breach → off
+    // Simulate the post-abort session replacement re-resolving thinking to the
+    // profile default. The bug was that this stuck; the fix re-asserts off.
+    h.setLevelExternally("high");
+    await fire(h.pi, "agent_start", {}, h.ctx); // restart run after the followUp
+    await fire(h.pi, "before_agent_start", { systemPromptOptions: {} }, h.ctx);
+    await fire(h.pi, "turn_start", {}, h.ctx);
+    expect(h.level()).toBe("off");
+  });
+  it("restores the prior thinking level on the next genuine user input", async () => {
+    const h = makeHarness("medium");
+    setupExtension(h.pi as any);
+    await startRun(h);
+    await fire(h.pi, "message_update", thinkingDelta("x".repeat(1000)), h.ctx); // breach
+    expect(h.level()).toBe("off");
+    // A new user prompt ends the forced-off window and restores the level.
+    await fire(h.pi, "input", { text: "next task" }, h.ctx);
+    expect(h.level()).toBe("medium");
+    // And the force is cleared: a subsequent turn does NOT re-disable thinking.
+    await fire(h.pi, "turn_start", {}, h.ctx);
+    expect(h.level()).toBe("medium");
+  });
+  it("a fresh task (no prior breach) is never forced off", async () => {
+    const h = makeHarness("low");
+    setupExtension(h.pi as any);
+    await fire(h.pi, "input", { text: "task" }, h.ctx);
+    await startRun(h);
+    await fire(h.pi, "message_update", thinkingDelta("ok"), h.ctx);
+    expect(h.level()).toBe("low");
+    expect(h.calls).toEqual([]);
+  });
+});
+describe("thinking-budget resolution", () => {
+  afterEach(() => {
+    delete process.env.LITTLE_CODER_THINKING_BUDGET;
+  });
+  it("a profile budget wins over the env budget", async () => {
+    process.env.LITTLE_CODER_THINKING_BUDGET = "10";
+    const h = makeHarness();
+    setupExtension(h.pi as any);
+    await fire(h.pi, "session_start", {}, h.ctx);
+    await fire(h.pi, "agent_start", {}, h.ctx);
+    // profile budget 100 tokens (~350 chars) overrides env's 10.
+    await fire(
+      h.pi,
+      "before_agent_start",
+      { systemPromptOptions: { littleCoder: { thinkingBudget: 100 } } },
+      h.ctx,
     );
-    order.push("after-call");
-    await turnEndPromise;
-    order.push("after-await");
-    // After-call comes first (sync), then the setImmediate marker fires
-    // (because turn_end yielded), then we resume after the await.
-    expect(order[0]).toBe("after-call");
-    // marker must appear before resolve completes
-    expect(order).toContain("setImmediate-marker");
-    expect(pi.followUps.length).toBe(1);
+    await fire(h.pi, "turn_start", {}, h.ctx);
+    // 200 chars ≈ 58 tokens — under the 100-token profile budget, over env's 10.
+    await fire(h.pi, "message_update", thinkingDelta("x".repeat(200)), h.ctx);
+    expect(h.calls).toEqual([]);
   });
 });

package/.pi/extensions/thinking-budget/index.ts CHANGED

@@ -1,53 +1,118 @@
 import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
+import { harnessIntervention } from "../_shared/intervention.ts";
+// pi's thinking-level union (not re-exported from the package root). Mirrors
+// settings-manager's ThinkingLevel; structurally assignable to pi's own type.
+type ThinkingLevel = "off" | "minimal" | "low" | "medium" | "high" | "xhigh";
 // Port of the thinking-budget cap + partial-trace reuse logic from
 // providers.py. little-coder's Python implementation aborts the stream
 // mid-flight when thinking tokens cross the budget, re-injects the partial
 // trace as assistant context, and retries with thinking disabled. Pi's
-// AgentSession doesn't expose mid-stream abort-and-replace, so we implement
-// the between-turn fallback documented in the plan:
+// AgentSession doesn't expose mid-stream abort-and-replace, so we approximate
+// it: count thinking tokens, and on breach disable thinking + queue a
+// commit-to-an-implementation nudge, then abort the over-long turn.
 //
-//  1. Count thinking_delta tokens during message_update
-//  2. On budget exceed, call ctx.abort() to end the turn
-//  3. On turn_end after abort, flip thinking level to "off" and queue a
-//     correction follow-up nudging the model to commit to an implementation
+// ── Issue #8, second reproduction (1.4.3) ───────────────────────────────────
+// The v1.0.0 fix deferred the recovery (`setThinkingLevel("off")` +
+// `sendUserMessage`) to a `turn_end` handler, after a `setImmediate` yield, and
+// ran it against the module-scope `pi`. But `ctx.abort()` makes pi's `agent_end`
+// run auto-retry / auto-compaction (both enabled in .pi/settings.json), which
+// REPLACES the session (dispose() → ExtensionRunner.invalidate()). The
+// setImmediate yield is exactly what let that replacement land *before* the
+// deferred recovery, so the recovery touched a now-stale `pi` and threw
+// ("This extension ctx is stale after session replacement or reload"). Net
+// effect: thinking was never turned off (the next step kept thinking) and the
+// follow-up never reached the model (the agent appeared to stop) — the #8
+// symptom, on a different mechanism than the original.
 //
-// The behavioral effect matches the whitepaper's claim that the budget cap
-// "forces the model out of open-ended deliberation and back into committing
-// to an implementation" — the concrete savings of preserving the partial
-// trace are lost, but the commit-to-action pressure is the same.
+// Fix: do the whole recovery SYNCHRONOUSLY inside `message_update`, BEFORE
+// `ctx.abort()`, while `pi` is still live and the session hasn't been replaced.
+// No `turn_end` handler, no `setImmediate` — nothing runs against a stale ref.
 //
-// Idempotency notes (issue #8 fix):
-//   - State is reset on `agent_start` AND `turn_start` so a previous run
-//     leaving `aborted=true` cannot leak into the next conversation.
-//   - `recoveryPending` gates re-entry: while a recovery is mid-flight,
-//     message_update / turn_start cannot re-arm the abort.
-//   - The recovery sequence yields one tick (setImmediate) so pi's async
-//     abort barrier settles before we queue the follow-up message; without
-//     this, fast-streaming local backends drop the follow-up silently and
-//     the agent appears to stop.
+//   1. Count thinking_delta tokens during message_update.
+//   2. On breach: capture the current thinking level, flip thinking to "off",
+//      queue the commit nudge as a follow-up, surface one harness-intervention
+//      line, THEN ctx.abort().
+//   3. Keep thinking off across the restart turn(s): `forcedOff` re-asserts
+//      "off" on every turn_start until the user submits a genuinely new prompt
+//      (`input` event), at which point the prior level is restored so the next
+//      task can think again. (A new task should not inherit "off" just because
+//      a previous one over-thought.)
-const DEFAULT_BUDGET = 2048;
+const DEFAULT_BUDGET = 4096;
-// Per-run rolling state (reset on agent_start)
+// Per-run rolling state.
 let thinkingChars = 0;
 let budgetForTurn = DEFAULT_BUDGET;
 let aborted = false;
-let recoveryPending = false;
+// True from a budget breach until the next genuine user input. While set, we
+// re-assert thinking "off" at the start of every turn so the restart turn (and
+// any follow-on turns of the same task) can't silently come back with thinking
+// re-enabled by the post-replacement profile resolution.
+let forcedOff = false;
+// The thinking level in effect when we first forced it off, restored on the
+// next user input so a new task is unaffected.
+let priorLevel: ThinkingLevel | undefined;
 function charsToTokens(chars: number): number {
   // Matches local/context_manager.estimate_tokens (len/3.5)
   return Math.ceil(chars / 3.5);
 }
+// setThinkingLevel / getThinkingLevel are guarded: a stale-ctx throw must never
+// escape (pi reports an uncaught extension throw as a hard "Extension error"),
+// and older SDK builds may lack the getter.
+function safeGetThinkingLevel(pi: ExtensionAPI): ThinkingLevel | undefined {
+  try {
+    return typeof pi.getThinkingLevel === "function" ? pi.getThinkingLevel() : undefined;
+  } catch {
+    return undefined;
+  }
+}
+function safeSetThinkingLevel(pi: ExtensionAPI, level: ThinkingLevel): void {
+  try {
+    pi.setThinkingLevel(level);
+  } catch {
+    // Stale ctx / unsupported — leave the level alone rather than crash the run.
+  }
+}
 export default function (pi: ExtensionAPI) {
-  // Hard reset between conversations. agent_start fires once per /run; if a
-  // previous run aborted, `aborted` and `recoveryPending` would otherwise
-  // leak into the next conversation.
+  // A new session (startup, /clear, resume, reload) is a clean slate — clear
+  // everything, including the forced-off window. The recovery restart does NOT
+  // fire session_start (it's a follow-up within the same session), so this
+  // never clobbers the re-assertion. Also stops module-scoped state leaking
+  // across sessions in-process.
+  pi.on("session_start", async () => {
+    thinkingChars = 0;
+    aborted = false;
+    forcedOff = false;
+    priorLevel = undefined;
+  });
+  // Hard reset of per-turn counters between agent runs. `forcedOff` /
+  // `priorLevel` are deliberately NOT reset here: agent_start ALSO fires for
+  // the recovery restart turn, and clearing the force there would let thinking
+  // come straight back on — exactly the bug. They are cleared on `input`
+  // (a genuinely new user task) or `session_start`.
   pi.on("agent_start", async () => {
     thinkingChars = 0;
     aborted = false;
-    recoveryPending = false;
+  });
+  // A genuinely new user prompt ends the "forced off" window: restore the
+  // level the user actually had before the breach. Programmatic follow-ups
+  // (our nudge) do not emit an `input` event, so the restart turn stays off.
+  pi.on("input", async () => {
+    if (forcedOff) {
+      if (priorLevel !== undefined) safeSetThinkingLevel(pi, priorLevel);
+      forcedOff = false;
+      priorLevel = undefined;
+    }
+    thinkingChars = 0;
+    aborted = false;
   });
   pi.on("before_agent_start", async (event) => {
@@ -63,9 +128,11 @@ export default function (pi: ExtensionAPI) {
   pi.on("turn_start", async () => {
     thinkingChars = 0;
-    // Don't clear `aborted` if a recovery is mid-flight — the recovery
-    // turn_end handler clears it once the follow-up has been queued.
-    if (!recoveryPending) aborted = false;
+    aborted = false;
+    // Re-assert "off" for the restart turn (and any follow-on turns of the same
+    // task). After the session replacement triggered by the abort, the new
+    // run can otherwise resolve thinking back to the profile default.
+    if (forcedOff) safeSetThinkingLevel(pi, "off");
   });
   pi.on("message_update", async (event, ctx) => {
@@ -74,32 +141,31 @@ export default function (pi: ExtensionAPI) {
     if (ev.type !== "thinking_delta") return;
     const delta = typeof ev.delta === "string" ? ev.delta : "";
     thinkingChars += delta.length;
-    if (aborted || recoveryPending) return;
+    if (aborted) return;
     const tokens = charsToTokens(thinkingChars);
-    if (tokens > budgetForTurn) {
-      aborted = true;
-      recoveryPending = true;
-      ctx.ui.notify(
-        `thinking-budget: ${tokens} > ${budgetForTurn} — aborting turn, will retry with thinking off`,
-        "warning",
+    if (tokens <= budgetForTurn) return;
+    // Breach. Do the entire recovery now, while `pi` is still live — BEFORE
+    // ctx.abort() triggers the session replacement that would make `pi` stale.
+    aborted = true;
+    if (!forcedOff) {
+      priorLevel = safeGetThinkingLevel(pi);
+      forcedOff = true;
+    }
+    safeSetThinkingLevel(pi, "off");
+    try {
+      pi.sendUserMessage(
+        "[thinking budget exceeded] Please commit to an implementation now. " +
+          "Stop deliberating and use your tools to make progress.",
+        { deliverAs: "followUp" },
       );
-      ctx.abort();
+    } catch {
+      // SDK without sendUserMessage — abort still forces the turn to end.
     }
-  });
-  pi.on("turn_end", async (_event, _ctx) => {
-    if (!recoveryPending) return;
-    // Yield one tick so pi's abort barrier settles before we queue the
-    // follow-up. On fast-streaming local backends (qwen3.6 / llama.cpp)
-    // queuing immediately after ctx.abort() drops the follow-up silently
-    // and the agent appears to stop with no message — issue #8.
-    await new Promise<void>((r) => setImmediate(r));
-    pi.setThinkingLevel("off");
-    pi.sendUserMessage(
-      "[thinking budget exceeded] Please commit to an implementation now. Stop deliberating and use your tools to make progress.",
-      { deliverAs: "followUp" },
+    harnessIntervention(
+      ctx,
+      "the model has thought long enough — forcing it to start implementing.",
     );
-    recoveryPending = false;
-    aborted = false;
+    ctx.abort();
   });
 }

package/.pi/extensions/turn-cap/index.ts CHANGED

@@ -1,4 +1,5 @@
 import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
+import { harnessIntervention } from "../_shared/intervention.ts";
 // Port of agent.py's max_turns early-break. Counts turn_start events per
 // agent_start span; when the count exceeds LITTLE_CODER_MAX_TURNS (or the
@@ -27,9 +28,9 @@ export default function (pi: ExtensionAPI) {
     if (capForRun <= 0) return;
     turnsThisRun++;
     if (turnsThisRun > capForRun) {
-      ctx.ui.notify(
-        `turn-cap: reached max_turns=${capForRun}, aborting`,
-        "warning",
+      harnessIntervention(
+        ctx,
+        `the model hit the turn limit (${capForRun}) — stopping the run.`,
       );
       ctx.abort();
     }