npm - little-coder - Versions diffs - 1.4.3 → 1.5.1 - Mend

little-coder 1.4.3 → 1.5.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (24)

package/.pi/extensions/_shared/intervention.test.ts +13 -0
package/.pi/extensions/_shared/intervention.ts +41 -0
package/.pi/extensions/benchmark-profiles/index.ts +27 -9
package/.pi/extensions/benchmark-profiles/profiles.test.ts +53 -44
package/.pi/extensions/branding/index.ts +15 -1
package/.pi/extensions/clear-command/index.test.ts +37 -0
package/.pi/extensions/clear-command/index.ts +26 -0
package/.pi/extensions/finalize-warn/index.ts +4 -3
package/.pi/extensions/output-parser/index.ts +4 -3
package/.pi/extensions/quality-monitor/index.ts +15 -8
package/.pi/extensions/quality-monitor/quality.test.ts +68 -2
package/.pi/extensions/quality-monitor/quality.ts +17 -0
package/.pi/extensions/thinking-budget/budget.test.ts +170 -132
package/.pi/extensions/thinking-budget/index.ts +118 -52
package/.pi/extensions/turn-cap/index.ts +4 -3
package/.pi/extensions/write-guard/index.ts +57 -67
package/.pi/extensions/write-guard/write-guard.test.ts +102 -2
package/.pi/settings.json +6 -6
package/CHANGELOG.md +39 -0
package/README.md +9 -3
package/bin/little-coder.mjs +12 -0
package/package.json +4 -2
package/scripts/patch-pi.mjs +113 -0
package/scripts/patch-pi.test.mjs +63 -0

package/.pi/extensions/write-guard/index.ts CHANGED Viewed

@@ -1,10 +1,10 @@
 import type { ExtensionAPI } from "@earendil-works/pi-coding-agent";
-import { Type } from "@sinclair/typebox";
-import { existsSync, mkdirSync, writeFileSync } from "node:fs";
-import { dirname, isAbsolute, join } from "node:path";
+import { existsSync } from "node:fs";
+import { isAbsolute, join } from "node:path";
+import { harnessIntervention } from "../_shared/intervention.ts";
 /**
- * Resolve the Write tool's `file_path` argument to a concrete on-disk path.
+ * Resolve a write `path` argument to a concrete on-disk path.
  *
  * Two deterministic rewrites:
  *
@@ -17,8 +17,6 @@ import { dirname, isAbsolute, join } from "node:path";
  *    accidentally writing to `/`.
  *
  * 2. Bare filename / relative path (no leading slash) → resolved against cwd.
- *    Node's `fs` APIs already do this implicitly, but resolving here makes
- *    the success message report the real absolute path that was written.
  *
  * Anything else (absolute path with at least one intermediate directory) is
  * left untouched.
@@ -36,68 +34,60 @@ export function normalizeWritePath(
   return { path: filePath };
 }
-// Port of tools.py::_write. Preserves the exact Edit-recipe error string so
-// the model recovers to Edit on its next turn. The whitepaper's benchmark
-// result depends on Write refusing whole-file rewrites of existing files
-// (fires on ~57% of Polyglot exercises).
+// Read whichever key carries the destination path. pi's built-in `write` uses
+// `path`; older little-coder builds and some prompts use `file_path`. We accept
+// both so the guard is independent of which write implementation is in play.
+function pathKey(input: Record<string, unknown>): "path" | "file_path" | undefined {
+  if (typeof input.path === "string") return "path";
+  if (typeof input.file_path === "string") return "file_path";
+  return undefined;
+}
+function editRecipe(resolved: string): string {
+  return (
+    `Write refused — ${resolved} already exists.\n` +
+    `\n` +
+    `Write is for creating NEW files only. To change an existing file, use Edit:\n` +
+    `  {"name": "edit", "input": {"path": "${resolved}", ` +
+    `"edits": [{"oldText": "<exact text currently in the file>", ` +
+    `"newText": "<replacement text>"}]}}\n` +
+    `\n` +
+    `If you do not already know the file's current content, Read it first to get the ` +
+    `exact text for oldText (whitespace and indentation must match). Include enough ` +
+    `surrounding context (2-3 lines) to make oldText unique in the file.\n` +
+    `\n` +
+    `For multiple changes, pass multiple entries in edits[] — one per location. Do NOT ` +
+    `retry Write; it will be refused again.`
+  );
+}
+// Port of tools.py::_write's guard. The whitepaper's benchmark result depends
+// on Write refusing whole-file rewrites of existing files (fires on ~57% of
+// Polyglot exercises). The earlier implementation registered a *custom* `write`
+// tool to enforce this — but pi ships its own built-in `write`
+// (`core/tools/write.js`), which overwrites existing files and shadowed the custom
+// one, so on current pi the guard never fired and existing files were silently
+// rewritten. We now enforce at the `tool_call` event instead, which fires for
+// whichever `write` implementation runs and lets us both normalize the path in
+// place and block the call before it executes.
 export default function (pi: ExtensionAPI) {
-  pi.registerTool({
-    name: "write",
-    label: "Write",
-    description:
-      "Create a NEW file with the given content. Refuses if the file already exists — use edit to modify existing files. " +
-      "Parent directories are created automatically. " +
-      "Pass either a path relative to the working directory (e.g. `notes/plan.md`) or a full absolute path. " +
-      "A bare filename like `foo.md` resolves to <cwd>/foo.md. " +
-      "A path of the form `/<filename>` with no intermediate directories is treated as cwd-relative " +
-      "(use `/etc/hosts` etc. if you really mean the filesystem root).",
-    parameters: Type.Object({
-      file_path: Type.String({ description: "File path (relative to cwd, or absolute)" }),
-      content: Type.String({ description: "Full file content" }),
-    }),
-    async execute(_id, { file_path, content }) {
-      const { path: resolved, rewrittenFrom } = normalizeWritePath(file_path);
-      if (existsSync(resolved)) {
-        const recipe =
-          `Error: Write refused — ${resolved} already exists.\n` +
-          `\n` +
-          `Write is only for creating NEW files. To change an existing file, use Edit:\n` +
-          `  {"name": "Edit", "input": {"file_path": "${resolved}", ` +
-          `"old_string": "<exact text currently in the file>", ` +
-          `"new_string": "<replacement text>"}}\n` +
-          `\n` +
-          `If you do not already know the file's current content, Read it first to ` +
-          `get the exact text for old_string. Include enough surrounding context ` +
-          `(2-3 lines) to make old_string unique in the file.\n` +
-          `\n` +
-          `For multiple changes, emit multiple Edit calls — one per location. Do NOT ` +
-          `retry Write; it will be refused again.`;
-        return {
-          content: [{ type: "text", text: recipe }],
-          details: {},
-          isError: true,
-        };
-      }
+  pi.on("tool_call", async (event, ctx) => {
+    if (String((event as any).toolName ?? "").toLowerCase() !== "write") return;
+    const input = ((event as any).input ?? {}) as Record<string, unknown>;
+    const key = pathKey(input);
+    if (!key) return;
+    const { path: resolved } = normalizeWritePath(String(input[key]), ctx.cwd);
+    // Normalize in place so the executing write (built-in or custom) lands on
+    // the resolved path even when we don't block (e.g. the `/foo.md` → cwd fix).
+    input[key] = resolved;
+    if (!existsSync(resolved)) return; // new file — allow the write through
-      try {
-        mkdirSync(dirname(resolved), { recursive: true });
-        writeFileSync(resolved, content, { encoding: "utf-8" });
-        const lc = content.split("\n").length - (content.endsWith("\n") ? 1 : 0) +
-          (content.length > 0 && !content.endsWith("\n") ? 1 : 0);
-        const suffix = rewrittenFrom
-          ? ` (rewrote ${rewrittenFrom} → cwd; root-path single-segment write redirected)`
-          : "";
-        return {
-          content: [{ type: "text", text: `Created ${resolved} (${lc} lines)${suffix}` }],
-          details: {},
-        };
-      } catch (e) {
-        return {
-          content: [{ type: "text", text: `Error: ${(e as Error).message}` }],
-          details: {},
-          isError: true,
-        };
-      }
-    },
+    harnessIntervention(
+      ctx,
+      "small models can't rewrite whole files — redirected the model to Edit.",
+    );
+    return { block: true, reason: editRecipe(resolved) };
   });
 }

package/.pi/extensions/write-guard/write-guard.test.ts CHANGED Viewed

@@ -1,5 +1,8 @@
-import { describe, it, expect } from "vitest";
-import { normalizeWritePath } from "./index.ts";
+import { describe, it, expect, beforeEach, afterEach } from "vitest";
+import { mkdtempSync, writeFileSync, rmSync } from "node:fs";
+import { tmpdir } from "node:os";
+import { join } from "node:path";
+import setupWriteGuard, { normalizeWritePath } from "./index.ts";
 describe("normalizeWritePath", () => {
   const cwd = "/home/me/proj";
@@ -49,3 +52,100 @@ describe("normalizeWritePath", () => {
     });
   });
 });
+// ── tool_call interceptor: the actual existing-file guard ───────────────────
+// pi ships a built-in `write` that overwrites existing files and shadowed our
+// old custom tool, so the guard never fired. We now enforce on the `tool_call`
+// event, which catches whichever write implementation runs.
+function getToolCallHandler() {
+  let handler: ((event: any, ctx: any) => any) | undefined;
+  const pi = {
+    on(name: string, h: (event: any, ctx: any) => any) {
+      if (name === "tool_call") handler = h;
+    },
+  };
+  setupWriteGuard(pi as any);
+  if (!handler) throw new Error("write-guard did not register a tool_call handler");
+  return handler;
+}
+function makeCtx(cwd: string) {
+  const notifies: string[] = [];
+  return { cwd, notifies, ui: { notify: (m: string) => notifies.push(m) } };
+}
+describe("write-guard tool_call interceptor", () => {
+  let dir: string;
+  let existing: string;
+  beforeEach(() => {
+    dir = mkdtempSync(join(tmpdir(), "wg-"));
+    existing = join(dir, "already.md");
+    writeFileSync(existing, "old content\n");
+  });
+  afterEach(() => {
+    rmSync(dir, { recursive: true, force: true });
+  });
+  it("blocks a write to an existing file with an Edit recipe", async () => {
+    const handler = getToolCallHandler();
+    const ctx = makeCtx(dir);
+    const event = { toolName: "write", input: { path: existing, content: "new" } };
+    const result = await handler(event, ctx);
+    expect(result?.block).toBe(true);
+    expect(result.reason).toContain("already exists");
+    expect(result.reason).toContain('"name": "edit"'); // correct pi edit recipe
+    expect(result.reason).toContain("oldText");
+    expect(ctx.notifies[0]).toMatch(/harness intervention:.*redirected the model to Edit/i);
+  });
+  it("allows a write to a NEW file (no block) and normalizes the path in place", async () => {
+    const handler = getToolCallHandler();
+    const ctx = makeCtx(dir);
+    const input: any = { path: "fresh.md", content: "hi" };
+    const event = { toolName: "write", input };
+    const result = await handler(event, ctx);
+    expect(result).toBeUndefined();
+    expect(input.path).toBe(join(dir, "fresh.md")); // normalized relative → cwd
+    expect(ctx.notifies).toHaveLength(0);
+  });
+  it("rewrites a root-anchored /<bare> path to cwd in place", async () => {
+    const handler = getToolCallHandler();
+    const ctx = makeCtx(dir);
+    const input: any = { path: "/fresh.md", content: "hi" };
+    await handler({ toolName: "write", input }, ctx);
+    expect(input.path).toBe(join(dir, "fresh.md"));
+  });
+  it("honors the file_path arg key as well as path", async () => {
+    const handler = getToolCallHandler();
+    const ctx = makeCtx(dir);
+    const result = await handler(
+      { toolName: "write", input: { file_path: existing, content: "x" } },
+      ctx,
+    );
+    expect(result?.block).toBe(true);
+  });
+  it("is case-insensitive on the tool name", async () => {
+    const handler = getToolCallHandler();
+    const ctx = makeCtx(dir);
+    const result = await handler({ toolName: "Write", input: { path: existing } }, ctx);
+    expect(result?.block).toBe(true);
+  });
+  it("ignores non-write tools", async () => {
+    const handler = getToolCallHandler();
+    const ctx = makeCtx(dir);
+    const result = await handler({ toolName: "read", input: { path: existing } }, ctx);
+    expect(result).toBeUndefined();
+  });
+  it("ignores a write call with no path argument", async () => {
+    const handler = getToolCallHandler();
+    const ctx = makeCtx(dir);
+    const result = await handler({ toolName: "write", input: { content: "x" } }, ctx);
+    expect(result).toBeUndefined();
+  });
+});

package/.pi/settings.json CHANGED Viewed

@@ -6,7 +6,7 @@
     "default_model_profile": {
       "context_limit": 32768,
       "max_tokens": 4096,
-      "thinking_budget": 2048,
+      "thinking_budget": 4096,
       "skill_token_budget": 300,
       "knowledge_token_budget": 200,
       "system_prompt_budget": 0,
@@ -17,7 +17,7 @@
       "llamacpp/qwen3.6-27b": {
         "context_limit": 32768,
         "max_tokens": 4096,
-        "thinking_budget": 2048,
+        "thinking_budget": 4096,
         "skill_token_budget": 300,
         "knowledge_token_budget": 200,
         "temperature": 0.3,
@@ -38,7 +38,7 @@
       "llamacpp/qwen3.6-35b-a3b": {
         "context_limit": 32768,
         "max_tokens": 4096,
-        "thinking_budget": 2048,
+        "thinking_budget": 4096,
         "skill_token_budget": 300,
         "knowledge_token_budget": 200,
         "temperature": 0.3,
@@ -59,7 +59,7 @@
       "llamacpp/qwen3.5-9b": {
         "context_limit": 32768,
         "max_tokens": 4096,
-        "thinking_budget": 2048,
+        "thinking_budget": 4096,
         "skill_token_budget": 300,
         "knowledge_token_budget": 200,
         "temperature": 0.3
@@ -67,7 +67,7 @@
       "ollama/qwen3.5": {
         "context_limit": 32768,
         "max_tokens": 4096,
-        "thinking_budget": 2048,
+        "thinking_budget": 4096,
         "skill_token_budget": 300,
         "knowledge_token_budget": 200,
         "temperature": 0.3
@@ -75,7 +75,7 @@
       "lmstudio/local-model": {
         "context_limit": 32768,
         "max_tokens": 4096,
-        "thinking_budget": 2048,
+        "thinking_budget": 4096,
         "skill_token_budget": 300,
         "knowledge_token_budget": 200,
         "temperature": 0.3

package/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,45 @@
 All notable changes to little-coder are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and little-coder's public interface (CLI, providers, tools, skills) follows semver starting at `v0.0.1` post-rename.
+## [v1.5.1] — 2026-05-22
+A branding release — no behaviour changes. little-coder now wears the v1.0 brand book: the warm **paper / ink / honey** palette (`#F2EBDC` · `#1A1410` · `#E15A1F`), the `lc▌` block-cursor mark, and IBM Plex Mono. The "ready to type" cursor is the punchline — it ties the CLI heritage into the identity without saying so.
+### Changed
+- **README hero is now the brand-book terminal banner.** A single self-contained SVG (`assets/banner.svg`, recreating the brand book's "github readme · hero" slide) replaces the old startup screenshot: ink terminal card, `lc▌` monogram in honey, the wordmark + tagline, and the verifiable headline numbers (`qwen3.6-35b-a3b`, terminal-bench 2.0 24.6%, aider polyglot 45.56%). IBM Plex Mono is embedded so it renders in-face on GitHub, with a `ui-monospace` fallback.
+- **TUI header adopts the honey "prompt lockup."** The interactive startup header (`.pi/extensions/branding/index.ts`) now renders `> little-coder▌` with a honey prompt caret and block cursor — the brand's variant for terminals and dark surfaces. Honey is emitted as a 24-bit truecolor SGR so it matches `#E15A1F` exactly regardless of the active pi theme.
+### Removed
+- The stale purple (`#7c3aed`) `docs/assets/startup.svg` mockup (`v0.0.1` / `ollama/qwen3.5`), now superseded by the on-brand banner.
+---
+## [v1.5.0] — 2026-05-22
+A reliability + UX release centered on the harness's intervention machinery. Issue [#8](https://github.com/itayinbarr/little-coder/issues/8) reproduced on 1.4.3 through a *new* mechanism, and chasing it down fixed a cluster of related symptoms: thinking never actually turning off after a budget breach, a spurious "empty response" nag after interrupts, and a noisy stack of warnings around every harness decision. Harness interventions now speak with one voice, and the thinking-budget cap is more generous.
+### Fixed
+- **Thinking-budget recovery no longer dies on a stale `pi` ([#8](https://github.com/itayinbarr/little-coder/issues/8), second reproduction).** The v1.0.0 fix deferred recovery (`setThinkingLevel("off")` + the commit-to-an-implementation follow-up) to a `turn_end` handler that ran, after a `setImmediate` yield, against the module-scope `pi` (`ExtensionAPI`). But the over-budget `ctx.abort()` makes pi's `agent_end` run auto-retry / auto-compaction (both enabled in `.pi/settings.json`; `agent-session.js:761` "compact before sending — catches aborted responses"), which **replaces the session** — `dispose()` → `ExtensionRunner.invalidate()` (`agent-session.js:516`) marks the captured `pi` stale. The `setImmediate` yield was exactly what let that replacement land *before* the deferred recovery, so the recovery touched a stale `pi` and threw (`"This extension ctx is stale after session replacement or reload"`). Net effect: thinking was never disabled (so the *next* step kept thinking) and the follow-up never reached the model (so the agent appeared to stop). The fix does the entire recovery **synchronously inside `message_update`, before `ctx.abort()`**, while `pi` is still live — no deferred handler, no `setImmediate`, nothing that can run against a stale reference. Thanks to the reporter on #8 for the minimal repro and the stale-`ctx` diagnosis.
+- **Thinking stays off across the forced restart turn.** Even with recovery firing, the post-abort run could re-resolve the thinking level back to the profile default. A `forcedOff` latch now re-asserts `"off"` at the start of every turn from a budget breach until your *next* genuine prompt (the `input` event), at which point the level you actually had is restored — so a new task thinks normally and we don't leave thinking globally disabled. State is also cleared on `session_start` (a new session / `/clear` is a clean slate).
+- **No more spurious "your previous response was empty" after an interrupt.** `quality-monitor` assessed *every* `turn_end`, including turns the user interrupted with ESC or that the harness aborted (thinking-budget, turn-cap) — which carry partial/empty content and `stopReason: "aborted"`. It then steered an `empty_response` correction onto your *next* prompt. It now skips `stopReason: "aborted"` turns entirely; genuinely-empty *completed* turns are still flagged.
+- **Per-model profiles are no longer silently skipped on colon-style model ids.** `benchmark-profiles` prefix-matched model keys literally, so a hyphenated profile key (`llamacpp/qwen3.6-35b-a3b`) never matched a runtime id using a colon (`llamacpp/qwen3.6:35b-a3b`) and every such model fell back to `default_model_profile`. Matching is now separator-insensitive (`:` ≡ `-`).
+- **Existing files can no longer be silently overwritten via Write.** pi ships a built-in `write` tool that overwrites existing files (`core/tools/write.js`) and shadowed little-coder's custom guarded `write`, so the whole-file-rewrite guard the benchmark results depend on had stopped firing. The guard now runs on the `tool_call` event — it catches whichever `write` implementation executes, normalizes the path in place, and blocks writes to existing files with a corrected Edit recipe (pi's `edit` takes `edits: [{oldText, newText}]`, not `old_string`/`new_string`).
+### Added
+- **`/clear` command.** Starts a fresh session as if little-coder were closed and relaunched — re-renders the banner, rebuilds the AGENTS.md/system-prompt context, and resets session-scoped extension state — via `ctx.newSession()`. (pi's built-in equivalent is `/new`; `/clear` is the alias muscle-memory expects.)
+- **One-line "harness intervention" UX.** Every moment the scaffolding overrides or redirects the model — thinking-budget cap, write-guard redirect, turn-cap, finalize-warn, quality-monitor corrections, output-parser nudges — now surfaces a single, uniformly-worded line (`harness intervention: …`) instead of each extension's own ad-hoc warning. Helper at `.pi/extensions/_shared/intervention.ts`.
+- **pi's bare "Operation aborted" marker is suppressed.** With harness interventions carrying their own line and a user ESC being self-evident, the stacked red marker was noise. pi is a normal dependency (not vendored), so this ships as an idempotent, dependency-free source patch (`scripts/patch-pi.mjs`) applied on `postinstall` **and** re-applied on every launch by the launcher — it self-heals if install scripts were skipped or pi was reinstalled, and **fails safe**: if a future pi changes that code the patch silently no-ops (you'd just see the marker again) rather than breaking install or launch. A test (`scripts/patch-pi.test.mjs`) fails loudly the moment the installed pi no longer matches, so a pi bump is a caught CI signal to refresh one string — never a silent regression.
+### Changed
+- **Thinking-budget cap raised 2048 → 4096 tokens** across `default_model_profile` and every per-model profile (the `terminal_bench` / `gaia` benchmark overrides keep their tuned values). The hardcoded fallback in the `thinking-budget` extension matches.
+### Notes for upgraders
+- No CLI flag, `models.json` shape, or per-model-profile *schema* changes. The only `.pi/settings.json` value change is `thinking_budget` (2048 → 4096); if you'd pinned it lower on purpose, re-set it in your own settings.
+- The custom `write` tool the `write-guard` extension used to register is gone — writes go through pi's built-in `write`, guarded at the `tool_call` event. If you depended on the old tool's `file_path` arg name in a fork, note pi's built-in uses `path` (both are accepted by the guard).
+- The pi source patch targets `@earendil-works/pi-coding-agent` 0.75.x. If you bundle a newer pi and the abort marker reappears, run `npx vitest run scripts/patch-pi.test.mjs` — a failure tells you to refresh the find/replace in `scripts/patch-pi.mjs`.
+---
 ## [v1.4.3] — 2026-05-19
 Follow-up to v1.4.2: clean up two cosmetic regressions that the @earendil-works scope migration surfaced.

package/README.md CHANGED Viewed

@@ -2,7 +2,7 @@
 **A coding agent tuned for small local models, built on top of [pi](https://pi.dev).**
-![little-coder startup view](docs/assets/startup.svg)
+![little-coder — a coding agent for the laptop in front of you](assets/banner.svg)
 The research story behind all this — why scaffold–model fit matters, how a 9.7 B Qwen beat frontier entries on Aider Polyglot, and what the load-bearing mechanisms actually do — is written up on Substack: **[*Honey, I Shrunk the Coding Agent*](https://open.substack.com/pub/itayinbarr/p/honey-i-shrunk-the-coding-agent)**. Start there if you want the "why"; stay here for the "how".
@@ -242,9 +242,15 @@ All runs used a consumer laptop: i9-14900HX, 32 GB RAM, **8 GB VRAM** on RTX 507
 That spans short coding exercises (Polyglot), interactive shell-bound tasks (Terminal-Bench), and tool-using research (GAIA), all on the same scaffold. The data needed to choose what to fix next is now in hand.
-**Phase 2 — iterative improvement on real-world tasks: starting now.** The motivating question shifts from *how wide is the impact radius?* to *which scaffolding changes compound on long-horizon real work?* The signal we have already points at concrete things to try — thinking-budget / quality-monitor behavior on long-horizon tasks, deliberate.py-style parallel branches on failure, better shell-session recovery for interactive-process traps, evidence-handling on multi-document GAIA L3 tasks — but the priority order comes from real-world use, not from a benchmark suite. Expect smaller, more frequent releases driven by what little-coder actually struggles with on day-to-day coding work.
+**Phase 2 — operating real knowledge bases as day-to-day work: the current focus.** The motivating question shifts from *how wide is the impact radius?* to *can a small local model reliably operate and traverse a large, messy knowledge base?* little-coder's day-to-day target is now real work over **many markdown files at once** — reading, cross-referencing, and updating sprawling note/log collections in the most token-efficient way a small local model can manage. Features are being implemented and tested across several real pipelines in parallel:
-**Future benchmarks (deferred).** New benchmarks like **ProgramBench**, SWE-bench Verified (multi-file real-world patches), and a GAIA test-split run come back into scope after Phase 2 has produced enough scaffolding signal to make a fresh measurement worth running. Re-benchmarking before the next round of changes lands would mostly re-measure the same baseline.
+- **Domains** — medical, athletic, and educational knowledge bases, each with its own structure, vocabulary, and citation needs.
+- **Scale** — 10+ years of logs, tens of thousands of entries of varied kinds, stressing retrieval, compaction, and the context-budgeting extensions on histories far longer than any single benchmark task.
+- **Messy real-world inputs** — validation against conflicting OCR extractions of the same source, and multilingual content where the same fact recurs across languages.
+This is where the scaffolding work now compounds: knowledge injection/selection, evidence handling, compaction fidelity, and the harness-intervention behaviors. Expect smaller, more frequent releases driven by what little-coder actually struggles with on this work rather than by a benchmark suite.
+**Benchmarks (deferred).** The four-benchmark baseline above stands as the scaffold-fit reference point. Fresh runs — **ProgramBench**, SWE-bench Verified (multi-file real-world patches), a GAIA test split — come back into scope once the knowledge-base work has produced enough scaffolding signal to make a new measurement worth running.
 ---

package/bin/little-coder.mjs CHANGED Viewed

@@ -73,6 +73,18 @@ if (!existsSync(piEntry)) {
   process.exit(1);
 }
+// ---- 3b. Re-apply little-coder's pi-runtime patches (best-effort) ----
+// pi is a normal dependency, so we can't ship a modified copy; instead we
+// re-apply small source edits (e.g. suppressing pi's bare "Operation aborted"
+// marker) on every launch. This self-heals when npm install scripts were
+// skipped or pi was reinstalled. Cosmetic only — never block launch.
+try {
+  const { applyPiPatches } = await import("../scripts/patch-pi.mjs");
+  applyPiPatches(piPkgRoot);
+} catch {
+  // patches are non-essential; ignore (missing file, read-only FS, etc.)
+}
 // ---- 4. Auto-discover bundled extensions ----
 const extDir = join(pkgRoot, ".pi", "extensions");
 const extArgs = [];

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "little-coder",
-  "version": "1.4.3",
+  "version": "1.5.1",
   "description": "A pi-based coding agent optimized for small local language models. Reproduces the whitepaper's scaffold-model-fit adaptations as pi extensions.",
   "homepage": "https://github.com/itayinbarr/little-coder",
   "repository": {
@@ -18,6 +18,7 @@
   },
   "files": [
     "bin/",
+    "scripts/",
     "AGENTS.md",
     "skills/",
     ".pi/extensions/",
@@ -33,7 +34,8 @@
     "pi": "pi",
     "test": "vitest run",
     "test:py": "python3 -m pytest benchmarks/test_rpc_client.py -q",
-    "typecheck": "tsc --noEmit"
+    "typecheck": "tsc --noEmit",
+    "postinstall": "node scripts/patch-pi.mjs"
   },
   "dependencies": {
     "@earendil-works/pi-coding-agent": "^0.75.3",

package/scripts/patch-pi.mjs ADDED Viewed

@@ -0,0 +1,113 @@
+#!/usr/bin/env node
+// Idempotent, dependency-free, best-effort patches to the installed pi runtime
+// for things little-coder can't express through pi's extension API.
+//
+// little-coder treats pi as a substrate it owns, not a boundary — but pi is a
+// normal npm dependency, so we can't ship a modified copy of it. Instead we
+// re-apply small source edits to the installed pi after install AND on every
+// launch (the launcher calls applyPiPatches). Running on launch makes it
+// self-heal if npm install scripts were skipped, if pi was reinstalled, or if
+// the global/hoisted layout defeated the postinstall — the launcher always
+// resolves pi's real location, so it can patch wherever pi actually lives.
+//
+// Contract: NEVER throw, NEVER exit non-zero. A failed patch must not break
+// `npm install` or a launch — the only consequence is the un-patched UI.
+//
+// Current patches:
+//   1. Suppress pi's bare "Operation aborted" assistant-message marker. Harness
+//      interventions surface their own single "harness intervention: …" line,
+//      and a user ESC is self-evident; the stacked red marker was noise. A
+//      genuine custom errorMessage (not the default abort string) is preserved.
+import { readFileSync, writeFileSync, existsSync } from "node:fs";
+import { dirname, join } from "node:path";
+import { fileURLToPath } from "node:url";
+import { createRequire } from "node:module";
+const PI_PKG = "@earendil-works/pi-coding-agent";
+const ABORT_MARKER_PATCH = {
+  rel: "dist/modes/interactive/components/assistant-message.js",
+  // Skip if our edit is already present (idempotency).
+  applied: 'little-coder patch: suppress the bare "Operation aborted" marker',
+  // Exact original block shipped by pi 0.75.x. If it doesn't match (pi changed),
+  // we skip silently rather than guess.
+  find:
+    '                const abortMessage = message.errorMessage && message.errorMessage !== "Request was aborted"\n' +
+    "                    ? message.errorMessage\n" +
+    '                    : "Operation aborted";\n' +
+    "                if (hasVisibleContent) {\n" +
+    "                    this.contentContainer.addChild(new Spacer(1));\n" +
+    "                }\n" +
+    "                else {\n" +
+    "                    this.contentContainer.addChild(new Spacer(1));\n" +
+    "                }\n" +
+    "                this.contentContainer.addChild(new Text(theme.fg(\"error\", abortMessage), 1, 0));",
+  replace:
+    '                // little-coder patch: suppress the bare "Operation aborted" marker.\n' +
+    "                // Harness interventions surface their own single\n" +
+    '                // "harness intervention: …" line, and a user ESC is self-evident.\n' +
+    "                // A genuine custom errorMessage is still shown.\n" +
+    '                const abortMessage = message.errorMessage && message.errorMessage !== "Request was aborted"\n' +
+    "                    ? message.errorMessage\n" +
+    "                    : null;\n" +
+    "                if (abortMessage) {\n" +
+    "                    this.contentContainer.addChild(new Spacer(1));\n" +
+    "                    this.contentContainer.addChild(new Text(theme.fg(\"error\", abortMessage), 1, 0));\n" +
+    "                }",
+};
+export const PATCHES = [ABORT_MARKER_PATCH];
+export function resolvePiRoot(piRootOverride) {
+  if (piRootOverride && existsSync(join(piRootOverride, "package.json"))) {
+    return piRootOverride;
+  }
+  // 1) Module resolution (respects npm hoisting).
+  try {
+    const require = createRequire(import.meta.url);
+    return dirname(require.resolve(`${PI_PKG}/package.json`));
+  } catch {
+    // pi may not export package.json — fall through.
+  }
+  // 2) Nested node_modules next to this package root (scripts/ -> ..).
+  try {
+    const here = dirname(fileURLToPath(import.meta.url));
+    const nested = join(here, "..", "node_modules", ...PI_PKG.split("/"));
+    if (existsSync(join(nested, "package.json"))) return nested;
+  } catch {
+    // ignore
+  }
+  return null;
+}
+/**
+ * Apply all pi patches in place. Best-effort and idempotent.
+ * @param {string} [piRootOverride] Known pi package root (the launcher passes
+ *   its already-resolved path; postinstall omits it and we resolve).
+ */
+export function applyPiPatches(piRootOverride) {
+  const piRoot = resolvePiRoot(piRootOverride);
+  if (!piRoot) return;
+  for (const p of PATCHES) {
+    try {
+      const file = join(piRoot, p.rel);
+      if (!existsSync(file)) continue;
+      const src = readFileSync(file, "utf8");
+      if (src.includes(p.applied)) continue; // already patched
+      if (!src.includes(p.find)) continue; // pi changed — skip silently
+      writeFileSync(file, src.replace(p.find, p.replace));
+    } catch {
+      // best-effort: never break install or launch
+    }
+  }
+}
+// Run directly as a postinstall hook (but not when imported by the launcher).
+let invokedDirectly = false;
+try {
+  invokedDirectly = process.argv[1] != null && fileURLToPath(import.meta.url) === process.argv[1];
+} catch {
+  invokedDirectly = false;
+}
+if (invokedDirectly) applyPiPatches();

package/scripts/patch-pi.test.mjs ADDED Viewed

@@ -0,0 +1,63 @@
+import { describe, it, expect } from "vitest";
+import { readFileSync, existsSync } from "node:fs";
+import { join } from "node:path";
+import { applyPiPatches, resolvePiRoot, PATCHES } from "./patch-pi.mjs";
+// These tests are the upgrade safety-net for our pi source patches.
+//
+// A source patch can silently stop suppressing pi's UI marker when pi is
+// upgraded (the surrounding code shifts and the patch no-ops). We never want
+// that to be a silent production regression — so this test FAILS the moment the
+// installed pi no longer matches what a patch expects, telling us to refresh
+// exactly one string in patch-pi.mjs. A pi bump becomes a loud CI failure, not
+// a quiet cosmetic regression for users.
+describe("pi runtime patches", () => {
+  it("resolves the installed pi package root", () => {
+    expect(resolvePiRoot(), "could not locate @earendil-works/pi-coding-agent").toBeTruthy();
+  });
+  it("applies cleanly and is idempotent", () => {
+    // Idempotent: safe to run repeatedly (postinstall + every launch + tests).
+    applyPiPatches();
+    applyPiPatches();
+    const piRoot = resolvePiRoot();
+    for (const p of PATCHES) {
+      const file = join(piRoot, p.rel);
+      if (!existsSync(file)) continue;
+      const src = readFileSync(file, "utf8");
+      expect(
+        src.includes(p.applied),
+        `expected patch marker in ${p.rel} after applying`,
+      ).toBe(true);
+    }
+  });
+  it("leaves no un-suppressed original block (loud signal to refresh on pi upgrade)", () => {
+    applyPiPatches();
+    const piRoot = resolvePiRoot();
+    for (const p of PATCHES) {
+      const file = join(piRoot, p.rel);
+      if (!existsSync(file)) continue;
+      const src = readFileSync(file, "utf8");
+      // After applying, the original block must be gone — either we replaced it,
+      // or this pi version no longer ships it. If pi changed *around* the block
+      // so our patch silently no-op'd, the original is still present → fail.
+      expect(
+        src.includes(p.find),
+        `pi patch for "${p.rel}" no longer applies — pi likely changed. ` +
+          `Refresh the find/replace in scripts/patch-pi.mjs for the new pi version.`,
+      ).toBe(false);
+    }
+  });
+  it("the patched file no longer renders the bare \"Operation aborted\" string", () => {
+    applyPiPatches();
+    const piRoot = resolvePiRoot();
+    const file = join(piRoot, PATCHES[0].rel);
+    const src = readFileSync(file, "utf8");
+    // The literal only survives inside our explanatory comment, never as the
+    // rendered fallback (`: "Operation aborted";`).
+    expect(src.includes(': "Operation aborted";')).toBe(false);
+  });
+});