npm - little-coder - Versions diffs - 1.2.0 → 1.3.0 - Mend

little-coder 1.2.0 → 1.3.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (9) hide show

package/.pi/extensions/permission-gate/index.ts +4 -0
package/.pi/extensions/permission-gate/permission.test.ts +13 -1
package/.pi/extensions/write-guard/index.ts +51 -9
package/.pi/extensions/write-guard/write-guard.test.ts +51 -0
package/CHANGELOG.md +36 -0
package/README.md +49 -12
package/bin/little-coder.mjs +12 -1
package/models.json +1 -1
package/package.json +1 -1

package/.pi/extensions/permission-gate/index.ts CHANGED Viewed

@@ -21,6 +21,10 @@ const BUILTIN_SAFE_PREFIXES: readonly string[] = [
   "pip show", "pip list", "npm list", "cargo metadata",
   "df ", "du ", "free ", "top -bn", "ps ",
   "curl -I", "curl --head",
+  // Routine filesystem scaffolding. Trailing space = word boundary, so
+  // "cp " matches "cp a b" but not "cpufetch". rm stays off the list by
+  // design; use LITTLE_CODER_BASH_ALLOW=rm if a deployment needs it.
+  "cp ", "mv ", "mkdir ", "touch ",
 ];
 // Trailing whitespace is meaningful — it acts as a word boundary in startsWith

package/.pi/extensions/permission-gate/permission.test.ts CHANGED Viewed

@@ -9,10 +9,22 @@ describe("isSafeBash", () => {
     expect(isSafeBash("grep -r pattern .")).toBe(true);
     expect(isSafeBash("rg pattern src/")).toBe(true);
   });
+  it("allows routine filesystem scaffolding (cp/mv/mkdir/touch)", () => {
+    expect(isSafeBash("cp a b")).toBe(true);
+    expect(isSafeBash("mv old new")).toBe(true);
+    expect(isSafeBash("mkdir -p sub/dir")).toBe(true);
+    expect(isSafeBash("touch foo.md")).toBe(true);
+  });
+  it("preserves trailing-whitespace word boundary on fs prefixes", () => {
+    // Without the trailing space, "cp" would match "cpufetch". With it, these stay blocked.
+    expect(isSafeBash("cpufetch")).toBe(false);
+    expect(isSafeBash("mvtool")).toBe(false);
+    expect(isSafeBash("mkdiroops")).toBe(false);
+    expect(isSafeBash("touchscreen")).toBe(false);
+  });
   it("blocks non-whitelisted commands", () => {
     expect(isSafeBash("rm -rf /")).toBe(false);
     expect(isSafeBash("npm install foo")).toBe(false);
-    expect(isSafeBash("cp a b")).toBe(false);
     expect(isSafeBash("sudo anything")).toBe(false);
   });
   it("handles leading whitespace", () => {

package/.pi/extensions/write-guard/index.ts CHANGED Viewed

@@ -1,7 +1,40 @@
 import type { ExtensionAPI } from "@mariozechner/pi-coding-agent";
 import { Type } from "@sinclair/typebox";
 import { existsSync, mkdirSync, writeFileSync } from "node:fs";
-import { dirname } from "node:path";
+import { dirname, isAbsolute, join } from "node:path";
+/**
+ * Resolve the Write tool's `file_path` argument to a concrete on-disk path.
+ *
+ * Two deterministic rewrites:
+ *
+ * 1. `"/<single-segment>"` (e.g. `/foo.md`) → `<cwd>/<single-segment>`.
+ *    Background: the model has been seen to anchor at filesystem root when
+ *    given an "Absolute file path" schema and no obvious directory context.
+ *    Genuine system-path writes always include at least one intermediate
+ *    directory (`/etc/X`, `/tmp/Y/Z`), so a root + bare filename is almost
+ *    always a mistake. Rewriting to cwd matches user intent and avoids
+ *    accidentally writing to `/`.
+ *
+ * 2. Bare filename / relative path (no leading slash) → resolved against cwd.
+ *    Node's `fs` APIs already do this implicitly, but resolving here makes
+ *    the success message report the real absolute path that was written.
+ *
+ * Anything else (absolute path with at least one intermediate directory) is
+ * left untouched.
+ */
+export function normalizeWritePath(
+  filePath: string,
+  cwd: string = process.cwd(),
+): { path: string; rewrittenFrom?: string } {
+  if (/^\/[^/]+$/.test(filePath)) {
+    return { path: join(cwd, filePath.slice(1)), rewrittenFrom: filePath };
+  }
+  if (!isAbsolute(filePath)) {
+    return { path: join(cwd, filePath) };
+  }
+  return { path: filePath };
+}
 // Port of tools.py::_write. Preserves the exact Edit-recipe error string so
 // the model recovers to Edit on its next turn. The whitepaper's benchmark
@@ -12,18 +45,24 @@ export default function (pi: ExtensionAPI) {
     name: "write",
     label: "Write",
     description:
-      "Create a NEW file with the given content. Refuses if the file already exists — use edit to modify existing files. Parent directories are created automatically.",
+      "Create a NEW file with the given content. Refuses if the file already exists — use edit to modify existing files. " +
+      "Parent directories are created automatically. " +
+      "Pass either a path relative to the working directory (e.g. `notes/plan.md`) or a full absolute path. " +
+      "A bare filename like `foo.md` resolves to <cwd>/foo.md. " +
+      "A path of the form `/<filename>` with no intermediate directories is treated as cwd-relative " +
+      "(use `/etc/hosts` etc. if you really mean the filesystem root).",
     parameters: Type.Object({
-      file_path: Type.String({ description: "Absolute file path" }),
+      file_path: Type.String({ description: "File path (relative to cwd, or absolute)" }),
       content: Type.String({ description: "Full file content" }),
     }),
     async execute(_id, { file_path, content }) {
-      if (existsSync(file_path)) {
+      const { path: resolved, rewrittenFrom } = normalizeWritePath(file_path);
+      if (existsSync(resolved)) {
         const recipe =
-          `Error: Write refused — ${file_path} already exists.\n` +
+          `Error: Write refused — ${resolved} already exists.\n` +
           `\n` +
           `Write is only for creating NEW files. To change an existing file, use Edit:\n` +
-          `  {"name": "Edit", "input": {"file_path": "${file_path}", ` +
+          `  {"name": "Edit", "input": {"file_path": "${resolved}", ` +
           `"old_string": "<exact text currently in the file>", ` +
           `"new_string": "<replacement text>"}}\n` +
           `\n` +
@@ -41,12 +80,15 @@ export default function (pi: ExtensionAPI) {
       }
       try {
-        mkdirSync(dirname(file_path), { recursive: true });
-        writeFileSync(file_path, content, { encoding: "utf-8" });
+        mkdirSync(dirname(resolved), { recursive: true });
+        writeFileSync(resolved, content, { encoding: "utf-8" });
         const lc = content.split("\n").length - (content.endsWith("\n") ? 1 : 0) +
           (content.length > 0 && !content.endsWith("\n") ? 1 : 0);
+        const suffix = rewrittenFrom
+          ? ` (rewrote ${rewrittenFrom} → cwd; root-path single-segment write redirected)`
+          : "";
         return {
-          content: [{ type: "text", text: `Created ${file_path} (${lc} lines)` }],
+          content: [{ type: "text", text: `Created ${resolved} (${lc} lines)${suffix}` }],
           details: {},
         };
       } catch (e) {

package/.pi/extensions/write-guard/write-guard.test.ts ADDED Viewed

@@ -0,0 +1,51 @@
+import { describe, it, expect } from "vitest";
+import { normalizeWritePath } from "./index.ts";
+describe("normalizeWritePath", () => {
+  const cwd = "/home/me/proj";
+  it("rewrites /<bare-filename> to <cwd>/<bare-filename>", () => {
+    // The model anchoring at filesystem root is the bug we're fixing.
+    expect(normalizeWritePath("/foo.md", cwd)).toEqual({
+      path: "/home/me/proj/foo.md",
+      rewrittenFrom: "/foo.md",
+    });
+    expect(normalizeWritePath("/person.md", cwd)).toEqual({
+      path: "/home/me/proj/person.md",
+      rewrittenFrom: "/person.md",
+    });
+  });
+  it("resolves bare filenames against cwd (no rewrite flag — already cwd-relative)", () => {
+    expect(normalizeWritePath("foo.md", cwd)).toEqual({
+      path: "/home/me/proj/foo.md",
+    });
+  });
+  it("resolves nested relative paths against cwd", () => {
+    expect(normalizeWritePath("sub/foo.md", cwd)).toEqual({
+      path: "/home/me/proj/sub/foo.md",
+    });
+    expect(normalizeWritePath("a/b/c.md", cwd)).toEqual({
+      path: "/home/me/proj/a/b/c.md",
+    });
+  });
+  it("leaves genuine absolute paths alone (path has an intermediate directory)", () => {
+    // /etc/hosts has an intermediate directory, so it's a legitimate
+    // absolute path. We don't rewrite it.
+    expect(normalizeWritePath("/etc/hosts", cwd)).toEqual({
+      path: "/etc/hosts",
+    });
+    expect(normalizeWritePath("/tmp/foo.log", cwd)).toEqual({
+      path: "/tmp/foo.log",
+    });
+  });
+  it("leaves deep absolute paths in cwd untouched", () => {
+    // Model handing back its own cwd-prefixed path: unchanged.
+    expect(normalizeWritePath("/home/me/proj/notes/plan.md", cwd)).toEqual({
+      path: "/home/me/proj/notes/plan.md",
+    });
+  });
+});

package/CHANGELOG.md CHANGED Viewed

@@ -2,6 +2,42 @@
 All notable changes to little-coder are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and little-coder's public interface (CLI, providers, tools, skills) follows semver starting at `v0.0.1` post-rename.
+## [v1.3.0] — 2026-05-16
+First functional release of Phase 2 (iterative improvement on real-world coding tasks). It addresses three concrete sharp edges that surfaced while actually using the Mac → Linux LAN setup, plus a quality-of-life cleanup on the pi update banner. This is a minor version bump because three of the four changes are new behavior, all backwards-compatible.
+### Added
+- **`cp`, `mv`, `mkdir`, `touch` are now on the built-in bash whitelist.** The permission-gate's `BUILTIN_SAFE_PREFIXES` previously covered only read-only inspection (`ls`, `cat`, `git log`, `find`, `grep`, …), so the model couldn't move or copy a file it just created without flipping `LITTLE_CODER_PERMISSION_MODE=accept-all`. These four were the most common false-positive blocks on day-to-day editing work. Trailing-whitespace word-boundary convention preserved — `cp ` allows `cp a b` but not `cpufetch`. `rm` and `sudo` stay off the list by design; per-deployment escape hatch is still `LITTLE_CODER_BASH_ALLOW`. New positive + negative-boundary assertions in `.pi/extensions/permission-gate/permission.test.ts`.
+- **Image input on `llamacpp/qwen3.6-35b-a3b`.** `models.json` now declares `input: ["text", "image"]` for this entry, so pi's TUI no longer rejects clipboard / drag-and-drop screenshots. Pi already ships the full image-conversion / resize / OpenAI-format encoding stack (`@mariozechner/pi-coding-agent/dist/utils/{clipboard-image,image-resize,image-convert,mime}.js`); the gate was purely the capability flag on the model. README's *Option A — llama.cpp* now folds the vision projector into the canonical setup: an extra `hf download unsloth/Qwen3.6-35B-A3B-GGUF mmproj-F16.gguf` line and `--mmproj ~/models/mmproj-F16.gguf` on the `llama-server` command. Skip both lines if you want a text-only deployment.
+### Fixed
+- **Write tool no longer writes to filesystem root when the model emits `/<filename>`.** Previously the tool's schema described `file_path` as *"Absolute file path"*, so models that had no obvious working-directory context dutifully wrote `/person.md` — landing the file at the filesystem root instead of under cwd. `.pi/extensions/write-guard/index.ts` now runs a deterministic `normalizeWritePath()` before any filesystem call: a path matching `/^\/[^/]+$/` (root + single segment, no intermediate dirs) is rewritten to `<cwd>/<segment>` and the success message says so explicitly; bare filenames / relative paths are resolved against cwd up-front so the returned path is absolute; genuine system writes (`/etc/X`, `/tmp/Y/Z`) are passed through untouched. Tool description updated to give the model the right mental model. New unit-test module `.pi/extensions/write-guard/write-guard.test.ts` covers the five distinct path shapes.
+### Changed
+- **Pi's "Update Available" banner is suppressed by default.** `bin/little-coder.mjs` now defaults `PI_SKIP_VERSION_CHECK=1` unless you've explicitly set it. little-coder bundles `@mariozechner/pi-coding-agent` as an internal dependency pinned per release, so the in-session nag about updating pi was telling users to do something they shouldn't (and couldn't usefully) do — `npm install -g @mariozechner/pi-coding-agent@latest` doesn't affect the bundled copy. Opt back in with `PI_SKIP_VERSION_CHECK=0` if you want the banner. (The broader `PI_OFFLINE=1` is still your hammer for killing pi's other startup network calls — package-update check, tool auto-fetch, install telemetry.)
+### Notes for upgraders
+- No CLI flag, settings.json, or skill-pack breaks. Existing `LITTLE_CODER_BASH_ALLOW` overrides continue to compose on top of the (now-wider) built-in list. Existing `models.json` user-override files for the llamacpp provider continue to work unchanged; if you'd hand-rolled an override entry for `qwen3.6-35b-a3b` you'll keep its old `input` value until you redeclare it. Tool descriptions changed on Write, which the model sees as a system-prompt diff — no API surface change for you.
+---
+## [v1.2.1] — 2026-05-16
+Docs-only release marking two milestones: **Terminal-Bench 2.0 leaderboard acceptance** and the **end of the Phase 1 benchmark baseline**. No CLI, settings, or skill-pack changes — the env-var path for remote inference (`LLAMACPP_BASE_URL` / `OLLAMA_BASE_URL` / `LMSTUDIO_BASE_URL` pointing at a non-loopback host) has worked since v1.1.0 / v1.2.0, but it was undocumented for the LAN-server case until now.
+### Added
+- **README "Serving from another machine on your LAN" section** under *Local model setup → Option C*. Covers all three local providers (llama.cpp `--host 0.0.0.0`, LM Studio's *Serve on local network*, `OLLAMA_HOST=0.0.0.0:11434 ollama serve`), the corresponding `*_BASE_URL` env on the client, a `curl /v1/models` reachability check, and a note on opening port 1234 / 8888 / 11434 in `ufw`. Validated against this repo's own benchmark hardware: `LLAMACPP_BASE_URL=http://<lan-ip>:8888/v1` against `llama-server --host 0.0.0.0` serves Qwen3.6-35B-A3B to a different machine over WiFi at the same per-token throughput as loopback.
+### Changed
+- **Benchmark table — Terminal-Bench 2.0 rows.** Replaced the *"awaiting maintainer merge"* status (HuggingFace PRs [#158](https://huggingface.co/datasets/harborframework/terminal-bench-2-leaderboard/discussions/158) and [#163](https://huggingface.co/datasets/harborframework/terminal-bench-2-leaderboard/discussions/163)) with the accepted leaderboard placements published at [tbench.ai/leaderboard/terminal-bench/2.0](https://www.tbench.ai/leaderboard/terminal-bench/2.0): **Qwen3.6-35B-A3B at 24.6 % ± 3.2 (rank 120)** and **Qwen3.5-9B at 9.2 % ± 2.4 (rank 142)**. The mean shifted slightly from the originally-submitted point estimates (23.82 % → 24.6 %, 9.21 % → 9.2 %) once the leaderboard recomputed across all five trials with a confidence interval; the underlying runs are unchanged.
+- **Roadmap reframed.** Phase 1 (build a wide benchmark baseline across short coding exercises, interactive shell tasks, and tool-using research) is now marked **complete**: Aider Polyglot ✓, Terminal-Bench-Core v0.1.1 ✓, Terminal-Bench 2.0 ✓, GAIA validation ✓. Phase 2 opens now: **iterative improvement driven by real-world coding tasks**, not by the benchmark suite. New benchmarks (ProgramBench, SWE-bench Verified, GAIA test-split) are deferred until Phase 2 produces enough scaffolding signal to be worth re-measuring — re-benchmarking before the next round of changes lands would mostly re-measure the same baseline.
+### Notes for upgraders
+- No CLI flag, settings, or skill-pack breaks. Existing `LMSTUDIO_BASE_URL` / `LLAMACPP_BASE_URL` / `OLLAMA_BASE_URL` users on either loopback or remote hosts keep working with no changes; the only thing that changed is that the remote-host case is now documented.
+- No `models.json` or `.pi/settings.json` shape change. Per-model profiles (context limit, thinking budget, temperature) continue to apply regardless of where the inference server lives — they're keyed by `<provider>/<model-id>`, not by host.
+---
 ## [v1.2.0] — 2026-05-10
 Issue-cleanup release that also ships built-in LM Studio support. Closes [#17](https://github.com/itayinbarr/little-coder/issues/17) (Windows), [#19](https://github.com/itayinbarr/little-coder/issues/19) (phantom Agent tool), [#21](https://github.com/itayinbarr/little-coder/issues/21) (skill param mismatch).

package/README.md CHANGED Viewed

@@ -81,16 +81,21 @@ git clone https://github.com/ggml-org/llama.cpp && cd llama.cpp
 cmake -B build -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=120 -DLLAMA_CURL=ON
 cmake --build build --config Release -j
-# Fetch a GGUF
+# Fetch the model GGUF and the matching vision projector.
+# The mmproj (~900 MB) is what lets the model see attached screenshots.
 pip install -U "huggingface_hub[cli]"
 hf download unsloth/Qwen3.6-35B-A3B-GGUF Qwen3.6-35B-A3B-UD-Q4_K_M.gguf --local-dir ~/models
+hf download unsloth/Qwen3.6-35B-A3B-GGUF mmproj-F16.gguf            --local-dir ~/models
 # Serve it (MoE trick: experts in RAM, attention on GPU → 22 GB model on 8 GB VRAM)
 build/bin/llama-server -m ~/models/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf \
+   --mmproj ~/models/mmproj-F16.gguf \
    --host 127.0.0.1 --port 8888 --jinja \
    -c 16384 -ngl 99 --n-cpu-moe 999 --flash-attn on
 ```
+If you only need text and want to skip the projector download, drop the second `hf download` line and the `--mmproj` flag — little-coder still works text-only, but the TUI's image attachment will be rejected by the server with a 4xx.
 **Option B — Ollama** (simpler, but slower on MoE):
 ```bash
@@ -110,6 +115,31 @@ ollama pull qwen3.5        # 9.7B — the paper's model
    ```
    The shipped `lmstudio/local-model` id routes to whatever model LM Studio currently has loaded — no extra config needed for the single-model case. If you serve on a non-default port, set `LMSTUDIO_BASE_URL=http://127.0.0.1:<port>/v1`. To target a specific model when you have several loaded, add an entry to `~/.config/little-coder/models.json` (see **Configuring models** below).
+**Serving from another machine on your LAN.** Each provider's `*_BASE_URL` env var accepts any host, not just `127.0.0.1`, so you can run inference on a beefier box and connect from a laptop or another device on the same WiFi.
+On the **server** (the box with the GPU):
+- *llama.cpp*: start `llama-server` with `--host 0.0.0.0` (or your specific LAN interface) instead of `127.0.0.1`. Everything else from Option A unchanged.
+- *LM Studio*: in the Server tab, enable **Serve on local network** so it binds `0.0.0.0:1234` instead of `127.0.0.1:1234`.
+- *Ollama*: `OLLAMA_HOST=0.0.0.0:11434 ollama serve` (or set `OLLAMA_HOST=0.0.0.0` in the user systemd unit).
+- If `ufw` / `firewalld` is active, allow your LAN subnet to the relevant port (e.g. `sudo ufw allow from 192.168.0.0/16 to any port 8888 proto tcp`).
+- Find the LAN IP with `hostname -I` (Linux) or `ipconfig getifaddr en0` (macOS).
+On the **client** (the machine running little-coder):
+```bash
+# Pick the env vars matching whichever provider is running on the server
+export LLAMACPP_API_KEY=noop
+export LLAMACPP_BASE_URL=http://<server-lan-ip>:8888/v1
+# Sanity check reachability before launching the agent
+curl -s http://<server-lan-ip>:8888/v1/models | head
+little-coder --model llamacpp/qwen3.6-35b-a3b
+```
+The streaming chat-completions adapter works over a local network the same way it does over loopback — no client code change, no proxy needed. The per-model profile in `.pi/settings.json` (context/thinking-budget/temperature) still applies because it's keyed by `<provider>/<model-id>`, which the client picks regardless of where the server lives.
 All small-model-specific extensions auto-disable for large/cloud models so they don't interfere.
 ---
@@ -161,7 +191,7 @@ Then verify with `little-coder --list-models` — you should see your overridden
 ## Permissions
-little-coder gates `Bash` tool calls against a built-in safe-prefix whitelist (`ls`, `cat`, `git log/status/diff`, `find`, `grep`, etc.) before pi's own confirmation flow ever sees them.
+little-coder gates `Bash` tool calls against a built-in safe-prefix whitelist (`ls`, `cat`, `head`, `tail`, `git log/status/diff`, `find`, `grep`, `cp`, `mv`, `mkdir`, `touch`, etc.) before pi's own confirmation flow ever sees them. `rm` and `sudo` are intentionally not on the list — add them via `LITTLE_CODER_BASH_ALLOW` per deployment if you really need them.
 Two env vars control the gate:
@@ -191,8 +221,8 @@ Write/Edit confirmations are pi's responsibility; little-coder doesn't intercept
 | [**v0.0.2**](https://github.com/itayinbarr/little-coder/releases/tag/v0.0.2) (commit `1d62bde`) — the paper | Qwen3.5-9B via Ollama | Aider Polyglot (225 exercises) | **45.56 %** mean of two runs; matched-model vanilla Aider baseline 19.11 %. Paper: [*Honey, I Shrunk the Coding Agent* on Substack](https://open.substack.com/pub/itayinbarr/p/honey-i-shrunk-the-coding-agent). |
 | [**v0.0.5**](https://github.com/itayinbarr/little-coder/releases/tag/v0.0.5) — pre-pi Python | Qwen3.6-35B-A3B via llama.cpp | Aider Polyglot | **78.67 %**. [Full narrative](docs/benchmark-qwen3.6-35b-a3b.md). |
 | [**v0.1.4**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.4) — on pi | Qwen3.6-35B-A3B via llama.cpp | Terminal-Bench-Core v0.1.1 (80 tasks) | **40.0 %** in 6 h 50 min. [Write-up](docs/benchmark-terminal-bench-v0.1.1.md). |
-| [**v0.1.13**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.13) — on pi, TB 2.0 leaderboard | Qwen3.6-35B-A3B via llama.cpp | Terminal-Bench 2.0 (89 tasks × 5 trials = 445) | **23.82 %** (106 / 445). [PR #158](https://huggingface.co/datasets/harborframework/terminal-bench-2-leaderboard/discussions/158) — awaiting maintainer merge. |
-| [**v0.1.24**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.24) — on pi, TB 2.0 leaderboard, smaller model | Qwen3.5-9B (Q4_K_M) via llama.cpp (5.3 GB on GPU, 2× faster per-token than the 35B-A3B) | Terminal-Bench 2.0 (89 tasks × 5 trials = 445) | **9.21 %** (41 / 445). [PR #163](https://huggingface.co/datasets/harborframework/terminal-bench-2-leaderboard/discussions/163) — awaiting maintainer merge. |
+| [**v0.1.13**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.13) — on pi, TB 2.0 leaderboard | Qwen3.6-35B-A3B via llama.cpp | Terminal-Bench 2.0 (89 tasks × 5 trials = 445) | **24.6 % ± 3.2** — accepted to the [Terminal-Bench 2.0 leaderboard](https://www.tbench.ai/leaderboard/terminal-bench/2.0) (rank 120). |
+| [**v0.1.24**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.24) — on pi, TB 2.0 leaderboard, smaller model | Qwen3.5-9B (Q4_K_M) via llama.cpp (5.3 GB on GPU, 2× faster per-token than the 35B-A3B) | Terminal-Bench 2.0 (89 tasks × 5 trials = 445) | **9.2 % ± 2.4** — accepted to the [Terminal-Bench 2.0 leaderboard](https://www.tbench.ai/leaderboard/terminal-bench/2.0) (rank 142). |
 | [**v0.1.27**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.27) — on pi, GAIA validation | Qwen3.6-35B-A3B via llama.cpp | GAIA validation set (165 tasks) | **40.00 %** (66 / 165). L1 60.4 % / L2 37.2 % / L3 7.7 %. Test-split run pending. |
 All runs used a consumer laptop: i9-14900HX, 32 GB RAM, **8 GB VRAM** on RTX 5070 Laptop (Blackwell). No cloud inference at any point.
@@ -201,17 +231,18 @@ All runs used a consumer laptop: i9-14900HX, 32 GB RAM, **8 GB VRAM** on RTX 507
 ## Roadmap
-The near-term focus is **benchmarking**, not new features. The paper established that scaffold–model fit moves a 9.7 B model from 19 % to 45 % on Aider Polyglot. The open question is: **how wide is the impact radius?** Does the same set of adaptations — Write-vs-Edit invariant, per-turn skill injection, thinking-budget cap, output-repair, quality monitor — help on tasks that *aren't* self-contained coding exercises? What breaks? What compounds?
+**Phase 1 — wide benchmark baseline: complete.** The paper established that scaffold–model fit moves a 9.7 B model from 19 % to 45 % on Aider Polyglot, and the goal of Phase 1 was to find out how wide that impact radius is. We now have a four-benchmark baseline on a single laptop-class GPU:
-The plan is to establish a wide baseline before any further scaffolding changes:
+1. **Aider Polyglot** — 45.56 % (paper, Qwen3.5-9B) and 78.67 % (v0.0.5, Qwen3.6-35B-A3B).
+2. **Terminal-Bench-Core v0.1.1** — 40.0 % (v0.1.4).
+3. **Terminal-Bench 2.0** — accepted to the [official leaderboard](https://www.tbench.ai/leaderboard/terminal-bench/2.0): Qwen3.6-35B-A3B at **24.6 % ± 3.2** (rank 120) and Qwen3.5-9B at **9.2 % ± 2.4** (rank 142). The v0.1.24 prompt-repetition fix (re-add tool descriptions + concision guideline, validated by a 4 / 4 pilot on the previously-regressing `prove-plus-comm` task) was the prompt for both submissions.
+4. **GAIA** — validation set at v0.1.27: **40.00 %** (66 / 165) on Qwen3.6-35B-A3B. Per-level L1 60.4 % / L2 37.2 % / L3 7.7 %.
-1. **Aider Polyglot** — done. 45.56 % (paper, Qwen3.5-9B) and 78.67 % (v0.0.5, Qwen3.6-35B-A3B).
-2. **Terminal-Bench-Core v0.1.1** — done. 40.0 % (v0.1.4).
-3. **Terminal-Bench 2.0** — done. Qwen3.6-35B-A3B at **23.82 %** ([PR #158](https://huggingface.co/datasets/harborframework/terminal-bench-2-leaderboard/discussions/158)) and Qwen3.5-9B at **9.21 %** ([PR #163](https://huggingface.co/datasets/harborframework/terminal-bench-2-leaderboard/discussions/163)), both awaiting maintainer merge. The v0.1.24 prompt-repetition fix (re-add tool descriptions + concision guideline, validated by a 4 / 4 pilot on the previously-regressing `prove-plus-comm` task) was the prompt for both submissions.
-4. **GAIA** — validation set done at v0.1.27: **40.00 %** (66 / 165) on Qwen3.6-35B-A3B. Per-level L1 60.4 % / L2 37.2 % / L3 7.7 %. Test-split run (301 tasks) pending → leaderboard submission to follow.
-5. **SWE-bench Verified** — after GAIA. Multi-file real-world patches; the longest-horizon test of whether the scaffolding generalizes past exercise-scale tasks.
+That spans short coding exercises (Polyglot), interactive shell-bound tasks (Terminal-Bench), and tool-using research (GAIA), all on the same scaffold. The data needed to choose what to fix next is now in hand.
-**After that baseline is in place**, the next phase starts: improvement experiments targeted at the specific failure patterns we've seen (thinking-budget / quality-monitor behavior on long-horizon tasks, deliberate.py-style parallel branches on failure, better shell-session recovery for interactive-process traps). No scaffold changes until the data says which ones are worth running.
+**Phase 2 — iterative improvement on real-world tasks: starting now.** The motivating question shifts from *how wide is the impact radius?* to *which scaffolding changes compound on long-horizon real work?* The signal we have already points at concrete things to try — thinking-budget / quality-monitor behavior on long-horizon tasks, deliberate.py-style parallel branches on failure, better shell-session recovery for interactive-process traps, evidence-handling on multi-document GAIA L3 tasks — but the priority order comes from real-world use, not from a benchmark suite. Expect smaller, more frequent releases driven by what little-coder actually struggles with on day-to-day coding work.
+**Future benchmarks (deferred).** New benchmarks like **ProgramBench**, SWE-bench Verified (multi-file real-world patches), and a GAIA test-split run come back into scope after Phase 2 has produced enough scaffolding signal to make a fresh measurement worth running. Re-benchmarking before the next round of changes lands would mostly re-measure the same baseline.
 ---
@@ -221,8 +252,14 @@ The plan is to establish a wide baseline before any further scaffolding changes:
 **`ECONNREFUSED 127.0.0.1:8888`** — llama.cpp isn't running. Start `llama-server` first, or switch `--model` to an Ollama/cloud ID.
+**LAN client times out (no `RST`, just hangs)** — the inference box's firewall is dropping the SYN. The usual cause is `ufw` with a default-deny policy that allow-lists only SSH / a few dev ports. From the server: `sudo ufw status verbose` to confirm; `sudo ufw allow from <your-lan-subnet>/24 to any port 8888 proto tcp` to fix (scoped to the LAN so you're not exposing the box). Docker-published ports bypass `ufw` via `PREROUTING` NAT, which is why a Docker container can be reachable while a plain `llama-server` on the same host isn't.
+**Image attachment is accepted but the request returns 4xx** — your llama-server is running without a vision projector. Re-launch it with `--mmproj ~/models/mmproj-F16.gguf` (or another mmproj variant from the same GGUF repo). The `--list-models` `images` column reflects what the client *will attempt to send*, not what the server can answer; the projector is what gives the model eyes.
 **No API key env var warning** — pi expects *some* key even for local providers. Export `LLAMACPP_API_KEY=noop` (or `OLLAMA_API_KEY=noop`) before launching.
+**No pi "Update Available" banner** — that's intentional. little-coder defaults `PI_SKIP_VERSION_CHECK=1` so the bundled pi runtime doesn't nag about updating itself; little-coder pins pi to a known-good version per release. If you actually want the banner back, `export PI_SKIP_VERSION_CHECK=0` before launching.
 **Extension load failures on startup** — run `little-coder --list-models --verbose`; extension errors surface there. If the install looks corrupt: `npm uninstall -g little-coder && npm install -g little-coder`.
 **Node version too old** — little-coder needs Node ≥ 20.6.0. Check with `node --version`. Easiest fix: `nvm install 20 && nvm use 20`.

package/bin/little-coder.mjs CHANGED Viewed

@@ -87,7 +87,18 @@ const piArgs = [
   ...userArgs,
 ];
-// ---- 7. Spawn pi in the user's cwd ----
+// ---- 7. Suppress pi's own version-banner by default ----
+// pi is an internal dependency here; users install `little-coder` and shouldn't
+// see in-session nags about updating the underlying coding-agent package.
+// PI_SKIP_VERSION_CHECK is the surgical pi switch (interactive-mode.js:525)
+// that gates the "Update Available" banner without touching pi's other
+// network-dependent startup paths. An explicit user value is honored: the
+// default below applies only when the variable is unset (set it to "0" or
+// anything else to re-enable the banner; PI_OFFLINE=1 overrides that again).
+if (process.env.PI_SKIP_VERSION_CHECK === undefined) {
+  process.env.PI_SKIP_VERSION_CHECK = "1";
+}
+// ---- 8. Spawn pi in the user's cwd ----
 const [spawnCmd, spawnArgs] = isWindows
   ? ["cmd.exe", ["/c", piBin, ...piArgs]]
   : [piBin, piArgs];

package/models.json CHANGED Viewed

@@ -18,7 +18,7 @@
           "id": "qwen3.6-35b-a3b",
           "name": "Qwen3.6-35B-A3B (MoE, local llama.cpp)",
           "reasoning": true,
-          "input": ["text"],
+          "input": ["text", "image"],
           "contextWindow": 32768,
           "maxTokens": 4096,
           "cost": { "input": 0, "output": 0, "cacheRead": 0, "cacheWrite": 0 }

package/package.json CHANGED Viewed

@@ -1,6 +1,6 @@
 {
   "name": "little-coder",
-  "version": "1.2.0",
+  "version": "1.3.0",
   "description": "A pi-based coding agent optimized for small local language models. Reproduces the whitepaper's scaffold-model-fit adaptations as pi extensions.",
   "homepage": "https://github.com/itayinbarr/little-coder",
   "repository": {