npm - little-coder - Versions diffs - 1.4.3 → 1.5.0 - Mend

little-coder 1.4.3 → 1.5.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (23)

package/.pi/extensions/_shared/intervention.test.ts +13 -0
package/.pi/extensions/_shared/intervention.ts +41 -0
package/.pi/extensions/benchmark-profiles/index.ts +27 -9
package/.pi/extensions/benchmark-profiles/profiles.test.ts +53 -44
package/.pi/extensions/clear-command/index.test.ts +37 -0
package/.pi/extensions/clear-command/index.ts +26 -0
package/.pi/extensions/finalize-warn/index.ts +4 -3
package/.pi/extensions/output-parser/index.ts +4 -3
package/.pi/extensions/quality-monitor/index.ts +15 -8
package/.pi/extensions/quality-monitor/quality.test.ts +68 -2
package/.pi/extensions/quality-monitor/quality.ts +17 -0
package/.pi/extensions/thinking-budget/budget.test.ts +170 -132
package/.pi/extensions/thinking-budget/index.ts +118 -52
package/.pi/extensions/turn-cap/index.ts +4 -3
package/.pi/extensions/write-guard/index.ts +57 -67
package/.pi/extensions/write-guard/write-guard.test.ts +102 -2
package/.pi/settings.json +6 -6
package/CHANGELOG.md +26 -0
package/README.md +8 -2
package/bin/little-coder.mjs +12 -0
package/package.json +4 -2
package/scripts/patch-pi.mjs +113 -0
package/scripts/patch-pi.test.mjs +63 -0

package/.pi/extensions/_shared/intervention.test.ts ADDED Viewed

@@ -0,0 +1,13 @@
+import { describe, it, expect } from "vitest";
+import { harnessIntervention } from "./intervention.ts";
+describe("harnessIntervention", () => {
+  it("prefixes the message and uses a single info-level notification", () => {
+    const calls: Array<[string, string | undefined]> = [];
+    const ctx = { ui: { notify: (m: string, t?: any) => calls.push([m, t]) } };
+    harnessIntervention(ctx, "the model did X — doing Y.");
+    expect(calls).toEqual([
+      ["harness intervention: the model did X — doing Y.", "info"],
+    ]);
+  });
+});

package/.pi/extensions/_shared/intervention.ts ADDED Viewed

@@ -0,0 +1,41 @@
+// Shared presentation for "harness intervention" events — the moments where
+// little-coder's scaffolding overrides or redirects the model rather than the
+// model deciding for itself (thinking-budget cap, write-guard redirect,
+// turn-cap, finalize-warn, quality-monitor corrections, output-parser nudges).
+//
+// Before this helper each extension emitted its own free-form `ctx.ui.notify`
+// in a different voice and severity, so a single harness decision (e.g. a
+// thinking-budget abort) surfaced as several stacked warnings plus pi's own
+// "Operation aborted" marker. Routing every such message through one helper
+// gives the user a single, consistently-worded line:
+//
+//     harness intervention: the model has thought long enough — forcing it to
+//     start implementing.
+//
+// This dir intentionally has no `index.ts`, so the launcher's extension
+// auto-discovery (bin/little-coder.mjs: requires `<subdir>/index.ts`) skips
+// it — it is a library imported by the real extensions, not an extension.
+// Structurally typed so this helper has no hard dependency on pi's type
+// surface and stays trivially mockable in unit tests.
+export interface InterventionUI {
+  notify(message: string, type?: "info" | "warning" | "error"): void;
+}
+export interface InterventionCtx {
+  ui: InterventionUI;
+}
+/**
+ * Surface a single, uniformly-formatted harness-intervention line to the user.
+ *
+ * @param ctx     Any object exposing `ui.notify` (the event-handler ctx).
+ * @param message The human explanation of what the harness did and why,
+ *                phrased as a continuation of "harness intervention: ".
+ *                Lead with the consequence, e.g.
+ *                "the model has thought long enough — forcing it to start
+ *                implementing."
+ */
+export function harnessIntervention(ctx: InterventionCtx, message: string): void {
+  ctx.ui.notify(`harness intervention: ${message}`, "info");
+}

package/.pi/extensions/benchmark-profiles/index.ts CHANGED Viewed

@@ -60,25 +60,38 @@ function loadSettings(): void {
   }
 }
-function resolveProfile(providerSlashModel: string): ModelProfile {
-  loadSettings();
-  if (!settings) return {};
-  const profiles = settings.model_profiles ?? {};
-  const bench = process.env.LITTLE_CODER_BENCHMARK;
+// Normalize the separator between model-name segments so a profile key written
+// with hyphens (`llamacpp/qwen3.6-35b-a3b`) matches a runtime model id that uses
+// a colon (`llamacpp/qwen3.6:35b-a3b`) and vice-versa. Without this the prefix
+// match silently fails and EVERY model falls back to default_model_profile —
+// per-model thinking_budget / context_limit / temperature are skipped (the
+// quirk surfaced in issue #8's reproduction). Dots (`qwen3.6`) are preserved.
+export function normKey(s: string): string {
+  return s.replace(/:/g, "-");
+}
+// Pure resolver, exported for testing. Exact match → separator-insensitive
+// prefix match → default_model_profile, then benchmark_overrides if `bench` set.
+export function resolveProfileFrom(
+  s: LittleCoderSettings | null,
+  providerSlashModel: string,
+  bench?: string,
+): ModelProfile {
+  if (!s) return {};
+  const profiles = s.model_profiles ?? {};
+  const target = normKey(providerSlashModel);
-  // Exact match first, then prefix match (mirrors get_model_profile)
   let base: ModelProfile | undefined = profiles[providerSlashModel];
   if (!base) {
     for (const [pattern, p] of Object.entries(profiles)) {
-      if (providerSlashModel.startsWith(pattern)) {
+      if (target === normKey(pattern) || target.startsWith(normKey(pattern))) {
         base = p;
         break;
       }
     }
   }
-  if (!base) base = settings.default_model_profile ?? {};
+  if (!base) base = s.default_model_profile ?? {};
-  // Strip + apply benchmark_overrides if set
   const { benchmark_overrides, ...basePlain } = { ...base };
   if (bench && benchmark_overrides && benchmark_overrides[bench]) {
     return { ...basePlain, ...benchmark_overrides[bench] };
@@ -86,6 +99,11 @@ function resolveProfile(providerSlashModel: string): ModelProfile {
   return basePlain;
 }
+function resolveProfile(providerSlashModel: string): ModelProfile {
+  loadSettings();
+  return resolveProfileFrom(settings, providerSlashModel, process.env.LITTLE_CODER_BENCHMARK);
+}
 // Per-benchmark tools that should always have skill cards present on turn 1,
 // even before the agent has used them. Without this, skill-inject relies on
 // recency / error-recovery / intent-matching, none of which fire on the

package/.pi/extensions/benchmark-profiles/profiles.test.ts CHANGED Viewed

@@ -1,78 +1,87 @@
-import { describe, it, expect, beforeEach, afterEach } from "vitest";
+import { describe, it, expect } from "vitest";
 import { readFileSync } from "node:fs";
 import { dirname, join } from "node:path";
 import { fileURLToPath } from "node:url";
+import { resolveProfileFrom, normKey } from "./index.ts";
 const here = dirname(fileURLToPath(import.meta.url));
 const settingsPath = join(here, "..", "..", "settings.json");
-// Mirror the resolution logic so we can test it as a pure function without
-// instantiating the extension.
-interface ModelProfile {
-  thinking_budget?: number;
-  max_turns?: number;
-  temperature?: number;
-  context_limit?: number;
-  benchmark_overrides?: Record<string, Partial<ModelProfile>>;
-}
-function resolveProfile(
-  settings: { model_profiles?: Record<string, ModelProfile>; default_model_profile?: ModelProfile },
-  key: string,
-  benchmark?: string,
-): ModelProfile {
-  const profiles = settings.model_profiles ?? {};
-  let base: ModelProfile | undefined = profiles[key];
-  if (!base) {
-    for (const [pattern, p] of Object.entries(profiles)) {
-      if (key.startsWith(pattern)) { base = p; break; }
-    }
-  }
-  if (!base) base = settings.default_model_profile ?? {};
-  const { benchmark_overrides, ...basePlain } = { ...base };
-  if (benchmark && benchmark_overrides && benchmark_overrides[benchmark]) {
-    return { ...basePlain, ...benchmark_overrides[benchmark] };
-  }
-  return basePlain;
-}
 describe("benchmark-profiles resolution against real settings.json", () => {
   const settings = JSON.parse(readFileSync(settingsPath, "utf-8")).little_coder;
-  it("resolves base profile for llamacpp/qwen3.6-35b-a3b", () => {
-    const p = resolveProfile(settings, "llamacpp/qwen3.6-35b-a3b");
-    expect(p.thinking_budget).toBe(2048);
+  it("resolves base profile for llamacpp/qwen3.6-35b-a3b (budget bumped to 4096)", () => {
+    const p = resolveProfileFrom(settings, "llamacpp/qwen3.6-35b-a3b");
+    expect(p.thinking_budget).toBe(4096);
     expect(p.context_limit).toBe(32768);
     expect(p.max_turns).toBeUndefined();
   });
   it("applies terminal_bench overrides", () => {
-    const p = resolveProfile(settings, "llamacpp/qwen3.6-35b-a3b", "terminal_bench");
-    expect(p.thinking_budget).toBe(3000);
+    const p = resolveProfileFrom(settings, "llamacpp/qwen3.6-35b-a3b", "terminal_bench");
+    expect(p.thinking_budget).toBe(3000); // benchmark override kept
     expect(p.temperature).toBe(0.2);
     expect(p.max_turns).toBe(40);
-    // Non-overridden fields fall through from base
     expect(p.context_limit).toBe(32768);
   });
   it("applies gaia overrides", () => {
-    const p = resolveProfile(settings, "llamacpp/qwen3.6-35b-a3b", "gaia");
+    const p = resolveProfileFrom(settings, "llamacpp/qwen3.6-35b-a3b", "gaia");
     expect(p.thinking_budget).toBe(2000);
     expect(p.temperature).toBe(0.4);
     expect(p.max_turns).toBe(40);
     expect(p.context_limit).toBe(65536);
   });
-  it("unknown model falls back to default_model_profile", () => {
-    const p = resolveProfile(settings, "fake-provider/fake-model");
-    // Default profile defined in settings.json
-    expect(p.thinking_budget).toBe(2048);
+  it("unknown model falls back to default_model_profile (also 4096)", () => {
+    const p = resolveProfileFrom(settings, "fake-provider/fake-model");
+    expect(p.thinking_budget).toBe(4096);
     expect(p.context_limit).toBe(32768);
   });
   it("unknown benchmark name yields base profile unchanged", () => {
-    const p = resolveProfile(settings, "llamacpp/qwen3.6-35b-a3b", "totally_made_up");
-    expect(p.thinking_budget).toBe(2048);
+    const p = resolveProfileFrom(settings, "llamacpp/qwen3.6-35b-a3b", "totally_made_up");
+    expect(p.thinking_budget).toBe(4096);
     expect(p.max_turns).toBeUndefined();
   });
+  it("every shipped per-model profile carries the 4096 budget", () => {
+    for (const key of Object.keys(settings.model_profiles)) {
+      expect(resolveProfileFrom(settings, key).thinking_budget, key).toBe(4096);
+    }
+  });
+});
+describe("separator-insensitive model-key matching (issue #8 quirk)", () => {
+  // The reproduction noted runtime ids using a colon (`qwen3.6:35b-a3b`) never
+  // matched the hyphenated profile key, so per-model profiles were silently
+  // skipped and everything fell back to default.
+  const settings = {
+    default_model_profile: { thinking_budget: 4096 },
+    model_profiles: {
+      "llamacpp/qwen3.6-35b-a3b": { thinking_budget: 1234, temperature: 0.3 },
+    },
+  };
+  it("normKey collapses ':' to '-'", () => {
+    expect(normKey("llamacpp/qwen3.6:35b-a3b")).toBe("llamacpp/qwen3.6-35b-a3b");
+  });
+  it("matches a colon runtime id to a hyphenated profile key", () => {
+    const p = resolveProfileFrom(settings, "llamacpp/qwen3.6:35b-a3b");
+    expect(p.thinking_budget).toBe(1234); // per-model profile, NOT the default
+  });
+  it("still matches the exact hyphenated id", () => {
+    expect(resolveProfileFrom(settings, "llamacpp/qwen3.6-35b-a3b").thinking_budget).toBe(1234);
+  });
+  it("matches via prefix when the runtime id has a tag suffix", () => {
+    const p = resolveProfileFrom(settings, "llamacpp/qwen3.6:35b-a3b:Q4_K_M");
+    expect(p.thinking_budget).toBe(1234);
+  });
+  it("an unrelated model still falls back to default", () => {
+    expect(resolveProfileFrom(settings, "ollama/llama3").thinking_budget).toBe(4096);
+  });
 });

package/.pi/extensions/clear-command/index.test.ts ADDED Viewed

@@ -0,0 +1,37 @@
+import { describe, it, expect } from "vitest";
+import setupClear from "./index.ts";
+describe("/clear command", () => {
+  function register() {
+    let reg: { name: string; opts: any } | undefined;
+    const pi = {
+      registerCommand(name: string, opts: any) {
+        reg = { name, opts };
+      },
+    };
+    setupClear(pi as any);
+    if (!reg) throw new Error("no command registered");
+    return reg;
+  }
+  it("registers a command named 'clear' with a description", () => {
+    const reg = register();
+    expect(reg.name).toBe("clear");
+    expect(typeof reg.opts.description).toBe("string");
+    expect(reg.opts.description.length).toBeGreaterThan(0);
+    expect(typeof reg.opts.handler).toBe("function");
+  });
+  it("starts a new session when invoked", async () => {
+    const reg = register();
+    let calls = 0;
+    const ctx = {
+      newSession: async () => {
+        calls++;
+        return { cancelled: false };
+      },
+    };
+    await reg.opts.handler("", ctx);
+    expect(calls).toBe(1);
+  });
+});

package/.pi/extensions/clear-command/index.ts ADDED Viewed

@@ -0,0 +1,26 @@
+import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
+// Adds a `/clear` command (the name users expect from other coding agents) that
+// resets the session as if little-coder had been closed and relaunched.
+//
+// `ctx.newSession()` drives pi's full session-replacement lifecycle:
+//   session_before_switch → session_shutdown → session_start{reason:"new"}
+//   → resources_discover{reason:"startup"-equivalent}
+// which:
+//   - re-renders little-coder's branding header (branding ext hooks session_start),
+//   - rebuilds the harness system prompt / AGENTS.md context from scratch,
+//   - resets every session_start-scoped extension's module state
+//     (quality-monitor counters, evidence store, etc.).
+//
+// pi already ships `/new` for this; we register `/clear` as an alias so the
+// muscle-memory command works, and so the help/branding line can advertise it.
+export default function (pi: ExtensionAPI) {
+  pi.registerCommand("clear", {
+    description: "Start a fresh session — clears history and reloads context, like relaunching",
+    handler: async (_args, ctx) => {
+      // newSession() handles the confirm/cancel flow and the full lifecycle.
+      // Returns { cancelled } if the user backed out; nothing else to do here.
+      await ctx.newSession();
+    },
+  });
+}

package/.pi/extensions/finalize-warn/index.ts CHANGED Viewed

@@ -1,4 +1,5 @@
 import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
+import { harnessIntervention } from "../_shared/intervention.ts";
 // Pre-cap finalize-warn: when the agent has WARN_REMAINING turns left
 // (this turn included), inject a follow-up user message reminding it to
@@ -60,9 +61,9 @@ export default function (pi: ExtensionAPI) {
       `Do not start new tool chains; if you need a fact you don't have, ` +
       `answer with your best supported guess from EvidenceList rather than ` +
       `leaving it blank.`;
-    ctx.ui.notify(
-      `finalize-warn: ${WARN_REMAINING} turns left at ${turnsThisRun}/${capForRun}`,
-      "info",
+    harnessIntervention(
+      ctx,
+      `${WARN_REMAINING} turns left — telling the model to finalize its answer now.`,
     );
     try {
       pi.sendUserMessage(msg, { deliverAs: "followUp" });

package/.pi/extensions/output-parser/index.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
 import { parseTextToolCalls } from "./parser.ts";
+import { harnessIntervention } from "../_shared/intervention.ts";
 // Detects malformed/fenced tool calls in assistant text and nudges the model
 // back onto native tool-calling. Active-repair (executing extracted calls
@@ -37,9 +38,9 @@ export default function (pi: ExtensionAPI) {
     if (calls.length === 0) return;
     const names = calls.map((c) => c.name).join(", ");
-    ctx.ui.notify(
-      `Detected ${calls.length} text-embedded tool call(s) [${names}] — nudging model to native tool calling`,
-      "warning",
+    harnessIntervention(
+      ctx,
+      `the model wrote ${calls.length} tool call(s) as text [${names}] — nudging it back to native tool calls.`,
     );
     // Queue a follow-up that will be delivered after the agent finishes.

package/.pi/extensions/quality-monitor/index.ts CHANGED Viewed

@@ -1,5 +1,6 @@
 import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
-import { assessResponse, buildCorrectionMessage, type ToolCall } from "./quality.ts";
+import { assessResponse, buildCorrectionMessage, phraseForUser, type ToolCall } from "./quality.ts";
+import { harnessIntervention } from "../_shared/intervention.ts";
 // Port of local/quality.py. Hooks turn_end, inspects the assistant message
 // + previous turn's tool calls, and — if we detect a failure mode — sends
@@ -30,6 +31,15 @@ export default function (pi: ExtensionAPI) {
     const message = (event as any).message;
     if (!message) return;
+    // Skip turns that were interrupted/aborted — by the user pressing ESC OR by
+    // a harness abort (thinking-budget, turn-cap). pi marks these with
+    // stopReason "aborted"; their content is legitimately partial/empty, so
+    // assessing them spuriously fires `empty_response` and steers a "your
+    // previous response was empty" correction onto the user's NEXT prompt
+    // (the escape-interrupt bug, and the second false warning in the
+    // thinking-budget cascade). An aborted turn is not a model quality failure.
+    if (message.stopReason === "aborted") return;
     // Extract assistant text + tool calls from pi's content-block format
     const content = Array.isArray(message.content) ? message.content : [];
     const text = content
@@ -53,18 +63,15 @@ export default function (pi: ExtensionAPI) {
     // Cap corrections so we don't burn turns in a correction loop
     consecutiveFailures++;
     if (consecutiveFailures > MAX_CONSECUTIVE_CORRECTIONS) {
-      ctx.ui.notify(
-        `quality-monitor: ${verdict.reason} (suppressed after ${consecutiveFailures} in a row)`,
-        "warning",
+      harnessIntervention(
+        ctx,
+        `${phraseForUser(verdict.reason)} — backing off after ${consecutiveFailures} in a row.`,
       );
       return;
     }
     const correction = buildCorrectionMessage(verdict.reason);
-    ctx.ui.notify(
-      `quality-monitor: ${verdict.reason} → injecting correction`,
-      "warning",
-    );
+    harnessIntervention(ctx, `${phraseForUser(verdict.reason)} — redirecting the model.`);
     // "steer" delivers the correction promptly to the in-flight loop. The
     // prior "followUp" mode parked the message until the *next* user input,
     // by which point it was no longer relevant (issue #16).

package/.pi/extensions/quality-monitor/quality.test.ts CHANGED Viewed

@@ -1,5 +1,6 @@
-import { describe, it, expect } from "vitest";
-import { assessResponse, buildCorrectionMessage } from "./quality.ts";
+import { describe, it, expect, beforeEach } from "vitest";
+import { assessResponse, buildCorrectionMessage, phraseForUser } from "./quality.ts";
+import setupQualityMonitor from "./index.ts";
 const known = new Set(["Read", "Write", "Edit", "Bash", "Glob", "Grep"]);
@@ -73,3 +74,68 @@ describe("buildCorrectionMessage", () => {
     expect(buildCorrectionMessage("weird_thing")).toContain("weird_thing");
   });
 });
+describe("phraseForUser", () => {
+  it("phrases known reasons in plain language", () => {
+    expect(phraseForUser("empty_response")).toMatch(/empty response/i);
+    expect(phraseForUser("repeated_tool_call")).toMatch(/repeated/i);
+  });
+  it("includes the tool name for parameterized reasons", () => {
+    expect(phraseForUser("unknown_tool:Frobnicate")).toContain("Frobnicate");
+    expect(phraseForUser("malformed_args:Edit")).toContain("Edit");
+  });
+});
+// ── turn_end handler: must skip interrupted/aborted turns ───────────────────
+function harness() {
+  const handlers: Record<string, ((e: any, c: any) => any)[]> = {};
+  const followUps: { msg: string; opts: any }[] = [];
+  const pi = {
+    handlers,
+    on(name: string, h: (e: any, c: any) => any) {
+      (handlers[name] ??= []).push(h);
+    },
+    sendUserMessage(msg: string, opts: any) {
+      followUps.push({ msg, opts });
+    },
+  };
+  const notifies: string[] = [];
+  const ctx = { ui: { notify: (m: string) => notifies.push(m) } };
+  setupQualityMonitor(pi as any);
+  return { pi, ctx, followUps, notifies };
+}
+async function fire(h: any, name: string, event: any) {
+  for (const fn of h.pi.handlers[name] ?? []) await fn(event, h.ctx);
+}
+describe("quality-monitor turn_end", () => {
+  let h: ReturnType<typeof harness>;
+  beforeEach(async () => {
+    h = harness();
+    await fire(h, "session_start", {}); // reset session-scoped counters
+  });
+  it("skips an aborted/interrupted turn — no empty_response correction", async () => {
+    // An ESC interrupt or harness abort produces a partial/empty message with
+    // stopReason "aborted". This is the escape-interrupt bug: it must NOT steer
+    // a 'your previous response was empty' correction onto the next prompt.
+    await fire(h, "turn_end", { message: { stopReason: "aborted", content: [] } });
+    expect(h.followUps).toHaveLength(0);
+    expect(h.notifies).toHaveLength(0);
+  });
+  it("flags a genuinely empty COMPLETED turn and steers a correction", async () => {
+    await fire(h, "turn_end", { message: { stopReason: "stop", content: [] } });
+    expect(h.followUps).toHaveLength(1);
+    expect(h.followUps[0].opts).toEqual({ deliverAs: "steer" });
+    expect(h.notifies[0]).toMatch(/harness intervention:/i);
+  });
+  it("passes a normal text turn without intervention", async () => {
+    await fire(h, "turn_end", {
+      message: { stopReason: "stop", content: [{ type: "text", text: "done." }] },
+    });
+    expect(h.followUps).toHaveLength(0);
+    expect(h.notifies).toHaveLength(0);
+  });
+});

package/.pi/extensions/quality-monitor/quality.ts CHANGED Viewed

@@ -82,3 +82,20 @@ export function buildCorrectionMessage(reason: string): string {
   return corrections[reason] ?? `Issue detected: ${reason}. Please try again.`;
 }
+// Short, user-facing phrasing for the harness-intervention line (distinct from
+// buildCorrectionMessage, which is the verbose text sent to the model).
+export function phraseForUser(reason: string): string {
+  if (reason.startsWith("unknown_tool:")) {
+    return `the model called a tool that doesn't exist (${reason.slice("unknown_tool:".length)})`;
+  }
+  if (reason.startsWith("malformed_args:")) {
+    return `the model's tool arguments were malformed (${reason.slice("malformed_args:".length)})`;
+  }
+  const phrases: Record<string, string> = {
+    empty_response: "the model returned an empty response",
+    empty_tool_name: "the model emitted a tool call with no name",
+    repeated_tool_call: "the model repeated its previous tool call verbatim",
+  };
+  return phrases[reason] ?? `quality issue (${reason})`;
+}