little-coder 1.1.0 → 1.2.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/.pi/extensions/llama-cpp-provider/config.test.ts +27 -1
- package/.pi/extensions/llama-cpp-provider/config.ts +5 -2
- package/.pi/extensions/skill-inject/index.ts +0 -1
- package/.pi/extensions/skill-inject/selector.test.ts +2 -3
- package/.pi/settings.json +8 -0
- package/CHANGELOG.md +37 -0
- package/README.md +53 -14
- package/bin/little-coder.mjs +8 -2
- package/models.json +16 -0
- package/package.json +1 -1
- package/skills/tools/edit.md +16 -10
- package/skills/tools/grep.md +5 -4
- package/skills/tools/read.md +3 -3
- package/skills/tools/write.md +4 -4
- package/skills/tools/agent.md +0 -24
|
@@ -1,7 +1,8 @@
|
|
|
1
1
|
import { describe, it, expect, beforeEach, afterEach } from "vitest";
|
|
2
2
|
import { mkdtempSync, rmSync, writeFileSync, mkdirSync } from "node:fs";
|
|
3
3
|
import { tmpdir } from "node:os";
|
|
4
|
-
import { join } from "node:path";
|
|
4
|
+
import { dirname, join, resolve } from "node:path";
|
|
5
|
+
import { fileURLToPath } from "node:url";
|
|
5
6
|
import { applyEnvOverrides, loadProviders, mergeProviders, resolveOverridePath, type ProviderEntry } from "./config.ts";
|
|
6
7
|
|
|
7
8
|
const sampleProvider = (baseUrl: string, modelId: string): ProviderEntry => ({
|
|
@@ -76,6 +77,11 @@ describe("applyEnvOverrides", () => {
|
|
|
76
77
|
const out = applyEnvOverrides(providers, { OLLAMA_BASE_URL: "http://env/v1" });
|
|
77
78
|
expect(out.ollama.baseUrl).toBe("http://env/v1");
|
|
78
79
|
});
|
|
80
|
+
it("LMSTUDIO_BASE_URL overrides lmstudio baseUrl", () => {
|
|
81
|
+
const providers = { lmstudio: sampleProvider("http://127.0.0.1:1234/v1", "local-model") };
|
|
82
|
+
const out = applyEnvOverrides(providers, { LMSTUDIO_BASE_URL: "http://127.0.0.1:5678/v1" });
|
|
83
|
+
expect(out.lmstudio.baseUrl).toBe("http://127.0.0.1:5678/v1");
|
|
84
|
+
});
|
|
79
85
|
it("does not alter providers without a known env knob", () => {
|
|
80
86
|
const providers = { custom: sampleProvider("http://file/v1", "m") };
|
|
81
87
|
const out = applyEnvOverrides(providers, { LLAMACPP_BASE_URL: "http://env/v1" });
|
|
@@ -159,3 +165,23 @@ describe("loadProviders (filesystem)", () => {
|
|
|
159
165
|
expect(result.providers.llamacpp.models[0].id).toBe("via-xdg");
|
|
160
166
|
});
|
|
161
167
|
});
|
|
168
|
+
|
|
169
|
+
describe("shipped models.json", () => {
|
|
170
|
+
const here = dirname(fileURLToPath(import.meta.url));
|
|
171
|
+
const pkgRoot = resolve(here, "..", "..", "..");
|
|
172
|
+
|
|
173
|
+
it("registers lmstudio/local-model on http://127.0.0.1:1234/v1", () => {
|
|
174
|
+
const result = loadProviders(pkgRoot, {});
|
|
175
|
+
const lmstudio = result.providers.lmstudio;
|
|
176
|
+
expect(lmstudio, "lmstudio provider should be present in shipped models.json").toBeDefined();
|
|
177
|
+
expect(lmstudio.baseUrl).toBe("http://127.0.0.1:1234/v1");
|
|
178
|
+
expect(lmstudio.api).toBe("openai-completions");
|
|
179
|
+
expect(lmstudio.apiKey).toBe("LMSTUDIO_API_KEY");
|
|
180
|
+
expect(lmstudio.models.find((m) => m.id === "local-model")).toBeDefined();
|
|
181
|
+
});
|
|
182
|
+
|
|
183
|
+
it("still registers llamacpp and ollama alongside lmstudio", () => {
|
|
184
|
+
const result = loadProviders(pkgRoot, {});
|
|
185
|
+
expect(Object.keys(result.providers).sort()).toEqual(["llamacpp", "lmstudio", "ollama"]);
|
|
186
|
+
});
|
|
187
|
+
});
|
|
@@ -43,11 +43,14 @@ export interface LoadResult {
|
|
|
43
43
|
sources: { path: string; status: "ok" | "missing" | "invalid"; error?: string }[];
|
|
44
44
|
}
|
|
45
45
|
|
|
46
|
-
/** Provider env knob: if set, overrides the provider's baseUrl.
|
|
47
|
-
* for the two providers we shipped before the data-driven
|
|
46
|
+
/** Provider env knob: if set, overrides the provider's baseUrl. Originally a
|
|
47
|
+
* back-compat shim for the two providers we shipped before the data-driven
|
|
48
|
+
* refactor; kept as the per-provider env-override pattern for any provider
|
|
49
|
+
* whose baseUrl changes between deployments. */
|
|
48
50
|
const LEGACY_BASE_URL_ENV: Record<string, string> = {
|
|
49
51
|
llamacpp: "LLAMACPP_BASE_URL",
|
|
50
52
|
ollama: "OLLAMA_BASE_URL",
|
|
53
|
+
lmstudio: "LMSTUDIO_BASE_URL",
|
|
51
54
|
};
|
|
52
55
|
|
|
53
56
|
/** Resolution order for the user-override file. First existing path wins. */
|
|
@@ -21,7 +21,6 @@ const INTENT_MAP: Record<string, string[]> = {
|
|
|
21
21
|
grep: ["Grep"], glob: ["Glob"],
|
|
22
22
|
fetch: ["WebFetch"], download: ["WebFetch"], url: ["WebFetch"],
|
|
23
23
|
web: ["WebSearch"],
|
|
24
|
-
agent: ["Agent"], delegate: ["Agent"], spawn: ["Agent"],
|
|
25
24
|
};
|
|
26
25
|
|
|
27
26
|
function predictTools(userText: string): string[] {
|
|
@@ -61,10 +60,10 @@ describe("skills directory loads from repo", () => {
|
|
|
61
60
|
const here = dirname(fileURLToPath(import.meta.url));
|
|
62
61
|
const toolsDir = join(here, "..", "..", "..", "skills", "tools");
|
|
63
62
|
|
|
64
|
-
it("exists and has 14 markdown files", () => {
|
|
63
|
+
it("exists and has 13 markdown files", () => {
|
|
65
64
|
expect(existsSync(toolsDir)).toBe(true);
|
|
66
65
|
const files = readdirSync(toolsDir).filter((f) => f.endsWith(".md"));
|
|
67
|
-
expect(files.length).toBe(14);
|
|
66
|
+
expect(files.length).toBe(13);
|
|
68
67
|
});
|
|
69
68
|
|
|
70
69
|
it("every tool skill has target_tool in frontmatter", () => {
|
package/.pi/settings.json
CHANGED
|
@@ -70,6 +70,14 @@
|
|
|
70
70
|
"skill_token_budget": 300,
|
|
71
71
|
"knowledge_token_budget": 200,
|
|
72
72
|
"temperature": 0.3
|
|
73
|
+
},
|
|
74
|
+
"lmstudio/local-model": {
|
|
75
|
+
"context_limit": 32768,
|
|
76
|
+
"max_tokens": 4096,
|
|
77
|
+
"thinking_budget": 2048,
|
|
78
|
+
"skill_token_budget": 300,
|
|
79
|
+
"knowledge_token_budget": 200,
|
|
80
|
+
"temperature": 0.3
|
|
73
81
|
}
|
|
74
82
|
}
|
|
75
83
|
}
|
package/CHANGELOG.md
CHANGED
|
@@ -2,6 +2,43 @@
|
|
|
2
2
|
|
|
3
3
|
All notable changes to little-coder are documented here. The format follows [Keep a Changelog](https://keepachangelog.com/en/1.1.0/), and little-coder's public interface (CLI, providers, tools, skills) follows semver starting at `v0.0.1` post-rename.
|
|
4
4
|
|
|
5
|
+
## [v1.2.1] — 2026-05-16
|
|
6
|
+
|
|
7
|
+
Docs-only release marking two milestones: **Terminal-Bench 2.0 leaderboard acceptance** and the **end of the Phase 1 benchmark baseline**. No CLI, settings, or skill-pack changes — the env-var path for remote inference (`LLAMACPP_BASE_URL` / `OLLAMA_BASE_URL` / `LMSTUDIO_BASE_URL` pointing at a non-loopback host) has worked since v1.1.0 / v1.2.0, but it was undocumented for the LAN-server case until now.
|
|
8
|
+
|
|
9
|
+
### Added
|
|
10
|
+
- **README "Serving from another machine on your LAN" section** under *Local model setup → Option C*. Covers all three local providers (llama.cpp `--host 0.0.0.0`, LM Studio's *Serve on local network*, `OLLAMA_HOST=0.0.0.0:11434 ollama serve`), the corresponding `*_BASE_URL` env on the client, a `curl /v1/models` reachability check, and a note on opening port 1234 / 8888 / 11434 in `ufw`. Validated against this repo's own benchmark hardware: `LLAMACPP_BASE_URL=http://<lan-ip>:8888/v1` against `llama-server --host 0.0.0.0` serves Qwen3.6-35B-A3B to a different machine over WiFi at the same per-token throughput as loopback.
|
|
11
|
+
|
|
12
|
+
### Changed
|
|
13
|
+
- **Benchmark table — Terminal-Bench 2.0 rows.** Replaced the *"awaiting maintainer merge"* status (HuggingFace PRs [#158](https://huggingface.co/datasets/harborframework/terminal-bench-2-leaderboard/discussions/158) and [#163](https://huggingface.co/datasets/harborframework/terminal-bench-2-leaderboard/discussions/163)) with the accepted leaderboard placements published at [tbench.ai/leaderboard/terminal-bench/2.0](https://www.tbench.ai/leaderboard/terminal-bench/2.0): **Qwen3.6-35B-A3B at 24.6 % ± 3.2 (rank 120)** and **Qwen3.5-9B at 9.2 % ± 2.4 (rank 142)**. The mean shifted slightly from the originally-submitted point estimates (23.82 % → 24.6 %, 9.21 % → 9.2 %) once the leaderboard recomputed across all five trials with a confidence interval; the underlying runs are unchanged.
|
|
14
|
+
- **Roadmap reframed.** Phase 1 (build a wide benchmark baseline across short coding exercises, interactive shell tasks, and tool-using research) is now marked **complete**: Aider Polyglot ✓, Terminal-Bench-Core v0.1.1 ✓, Terminal-Bench 2.0 ✓, GAIA validation ✓. Phase 2 opens now: **iterative improvement driven by real-world coding tasks**, not by the benchmark suite. New benchmarks (ProgramBench, SWE-bench Verified, GAIA test-split) are deferred until Phase 2 produces enough scaffolding signal to be worth re-measuring — re-benchmarking before the next round of changes lands would mostly re-measure the same baseline.
|
|
15
|
+
|
|
16
|
+
### Notes for upgraders
|
|
17
|
+
- No CLI flag, settings, or skill-pack breaks. Existing `LMSTUDIO_BASE_URL` / `LLAMACPP_BASE_URL` / `OLLAMA_BASE_URL` users on either loopback or remote hosts keep working with no changes; the only thing that changed is that the remote-host case is now documented.
|
|
18
|
+
- No `models.json` or `.pi/settings.json` shape change. Per-model profiles (context limit, thinking budget, temperature) continue to apply regardless of where the inference server lives — they're keyed by `<provider>/<model-id>`, not by host.
|
|
19
|
+
|
|
20
|
+
---
|
|
21
|
+
|
|
22
|
+
## [v1.2.0] — 2026-05-10
|
|
23
|
+
|
|
24
|
+
Issue-cleanup release that also ships built-in LM Studio support. Closes [#17](https://github.com/itayinbarr/little-coder/issues/17) (Windows), [#19](https://github.com/itayinbarr/little-coder/issues/19) (phantom Agent tool), [#21](https://github.com/itayinbarr/little-coder/issues/21) (skill param mismatch).
|
|
25
|
+
|
|
26
|
+
### Added
|
|
27
|
+
- **Built-in `lmstudio/local-model` provider.** [LM Studio](https://lmstudio.ai/) exposes an OpenAI-compatible server on `http://127.0.0.1:1234/v1` by default, and previously the only way to use it was to overload `LLAMACPP_BASE_URL`. Now you can run `little-coder --model lmstudio/local-model` and it routes to whatever model LM Studio currently has loaded — no extra config for the single-model case. New env knobs `LMSTUDIO_BASE_URL` (overrides baseUrl, parity with `LLAMACPP_BASE_URL`/`OLLAMA_BASE_URL`) and `LMSTUDIO_API_KEY` (any value; LM Studio ignores it locally but pi requires the env var to exist). README has a new **Option C — LM Studio** under *Local model setup*. `.pi/settings.json` ships a `lmstudio/local-model` profile so the same context/thinking-budget tuning as the llamacpp profiles applies.
|
|
28
|
+
|
|
29
|
+
### Fixed
|
|
30
|
+
- **Windows launch ([#17](https://github.com/itayinbarr/little-coder/issues/17), thanks @Grogger for [PR #18](https://github.com/itayinbarr/little-coder/pull/18)).** On Windows, `node_modules/.bin/pi` is a `.cmd` shim that Node 20's `spawn()` can't execute directly without `shell: true`, and `shell: true` reintroduces the CVE-2024-27980 / DEP0190 shell-injection class. The launcher now resolves `pi.cmd` on Windows and invokes `cmd.exe /c pi.cmd ...` with args as an array — works on Windows 11, no Linux/macOS regression.
|
|
31
|
+
- **Edit skill documentation ([#21](https://github.com/itayinbarr/little-coder/issues/21)).** `skills/tools/edit.md` advertised `old_string` / `new_string`, but pi's Edit tool only accepts `oldText` / `newText` (single-edit form) or `edits: [{oldText, newText}]` (array form). Rewritten to show the canonical array form *and* the single-edit back-compat form. While in there, also corrected `skills/tools/read.md` and `skills/tools/write.md` (`file_path` → `path` — pi aliases both, but the canonical name is now in the docs) and `skills/tools/grep.md` (`include` → `glob`, `max_results` → `limit`; pi does not alias these, so the old skill could genuinely produce tool-call errors on the grep path the same way Edit did).
|
|
32
|
+
|
|
33
|
+
### Changed
|
|
34
|
+
- **Removed phantom `Agent` skill ([#19](https://github.com/itayinbarr/little-coder/issues/19)).** `skills/tools/agent.md` documented an `Agent` tool that little-coder never actually registered — pi ships `examples/extensions/subagent/` as a reference impl, but it was not wired up by default. Deleted the skill card and the `agent` / `delegate` / `spawn` keys from `.pi/extensions/skill-inject/index.ts`'s `INTENT_MAP` so the model is no longer told it has a delegation tool. The `skills/protocols/task_decomposition.md` cheatsheet is untouched — decomposition guidance does not depend on a delegation tool.
|
|
35
|
+
|
|
36
|
+
### Notes for upgraders
|
|
37
|
+
- No CLI flag, settings, or skill-pack breaks. `--model lmstudio/local-model` works out of the box if LM Studio is serving on its default port 1234 with a model loaded.
|
|
38
|
+
- If you'd been overloading `LLAMACPP_BASE_URL=http://127.0.0.1:1234/v1` to point at LM Studio, that keeps working — but the cleaner path is now `--model lmstudio/local-model` with no env tweaking.
|
|
39
|
+
|
|
40
|
+
---
|
|
41
|
+
|
|
5
42
|
## [v1.1.0] — 2026-05-03
|
|
6
43
|
|
|
7
44
|
Issue-cleanup release. Three small features and one bug fix, driven by GitHub issues #12 / #13 / #15 / #16.
|
package/README.md
CHANGED
|
@@ -51,19 +51,21 @@ Cloud models work the same way:
|
|
|
51
51
|
little-coder --model anthropic/claude-haiku-4-5
|
|
52
52
|
little-coder --model openai/gpt-4o-mini "What does this codebase do?"
|
|
53
53
|
little-coder --model ollama/qwen3.5 # local Ollama
|
|
54
|
+
little-coder --model lmstudio/local-model # local LM Studio (whatever model you have loaded)
|
|
54
55
|
little-coder --list-models # see everything pi knows about
|
|
55
56
|
```
|
|
56
57
|
|
|
57
58
|
The agent uses the directory you launched it from as its working directory — `Read` / `Write` / `Edit` / `Bash` operate on your project, not on little-coder's install path.
|
|
58
59
|
|
|
59
|
-
For local providers (llama.cpp, Ollama) pi expects *some* value in the API-key env even though local servers ignore it:
|
|
60
|
+
For local providers (llama.cpp, Ollama, LM Studio) pi expects *some* value in the API-key env even though local servers ignore it:
|
|
60
61
|
|
|
61
62
|
```bash
|
|
62
63
|
export LLAMACPP_API_KEY=noop
|
|
63
64
|
export OLLAMA_API_KEY=noop
|
|
65
|
+
export LMSTUDIO_API_KEY=noop
|
|
64
66
|
```
|
|
65
67
|
|
|
66
|
-
`LLAMACPP_BASE_URL` and `OLLAMA_BASE_URL` override the defaults (`http://127.0.0.1:8888/v1`, `http://127.0.0.1:11434/v1`).
|
|
68
|
+
`LLAMACPP_BASE_URL`, `OLLAMA_BASE_URL`, and `LMSTUDIO_BASE_URL` override the defaults (`http://127.0.0.1:8888/v1`, `http://127.0.0.1:11434/v1`, `http://127.0.0.1:1234/v1`).
|
|
67
69
|
|
|
68
70
|
For cloud providers, set the standard env (`ANTHROPIC_API_KEY`, `OPENAI_API_KEY`, etc.) and pi will discover it.
|
|
69
71
|
|
|
@@ -97,6 +99,42 @@ ollama pull qwen3.5 # 9.7B — the paper's model
|
|
|
97
99
|
# or: ollama pull qwen3.6-35b-a3b
|
|
98
100
|
```
|
|
99
101
|
|
|
102
|
+
**Option C — LM Studio** (GUI; OpenAI-compatible server on port 1234):
|
|
103
|
+
|
|
104
|
+
1. Install [LM Studio](https://lmstudio.ai/) and download a model (e.g. Qwen3.6 35B A3B GGUF).
|
|
105
|
+
2. Open the **Developer** / **Local Server** tab, load the model, and click **Start Server** (default `http://127.0.0.1:1234`).
|
|
106
|
+
3. Run little-coder:
|
|
107
|
+
```bash
|
|
108
|
+
export LMSTUDIO_API_KEY=noop
|
|
109
|
+
little-coder --model lmstudio/local-model
|
|
110
|
+
```
|
|
111
|
+
The shipped `lmstudio/local-model` id routes to whatever model LM Studio currently has loaded — no extra config needed for the single-model case. If you serve on a non-default port, set `LMSTUDIO_BASE_URL=http://127.0.0.1:<port>/v1`. To target a specific model when you have several loaded, add an entry to `~/.config/little-coder/models.json` (see **Configuring models** below).
|
|
112
|
+
|
|
113
|
+
**Serving from another machine on your LAN.** Each provider's `*_BASE_URL` env var accepts any host, not just `127.0.0.1`, so you can run inference on a beefier box and connect from a laptop or another device on the same WiFi.
|
|
114
|
+
|
|
115
|
+
On the **server** (the box with the GPU):
|
|
116
|
+
|
|
117
|
+
- *llama.cpp*: start `llama-server` with `--host 0.0.0.0` (or your specific LAN interface) instead of `127.0.0.1`. Everything else from Option A unchanged.
|
|
118
|
+
- *LM Studio*: in the Server tab, enable **Serve on local network** so it binds `0.0.0.0:1234` instead of `127.0.0.1:1234`.
|
|
119
|
+
- *Ollama*: `OLLAMA_HOST=0.0.0.0:11434 ollama serve` (or set `OLLAMA_HOST=0.0.0.0` in the user systemd unit).
|
|
120
|
+
- If `ufw` / `firewalld` is active, allow your LAN subnet to the relevant port (e.g. `sudo ufw allow from 192.168.0.0/16 to any port 8888 proto tcp`).
|
|
121
|
+
- Find the LAN IP with `hostname -I` (Linux) or `ipconfig getifaddr en0` (macOS).
|
|
122
|
+
|
|
123
|
+
On the **client** (the machine running little-coder):
|
|
124
|
+
|
|
125
|
+
```bash
|
|
126
|
+
# Pick the env vars matching whichever provider is running on the server
|
|
127
|
+
export LLAMACPP_API_KEY=noop
|
|
128
|
+
export LLAMACPP_BASE_URL=http://<server-lan-ip>:8888/v1
|
|
129
|
+
|
|
130
|
+
# Sanity check reachability before launching the agent
|
|
131
|
+
curl -s http://<server-lan-ip>:8888/v1/models | head
|
|
132
|
+
|
|
133
|
+
little-coder --model llamacpp/qwen3.6-35b-a3b
|
|
134
|
+
```
|
|
135
|
+
|
|
136
|
+
The streaming chat-completions adapter works over a local network the same way it does over loopback — no client code change, no proxy needed. The per-model profile in `.pi/settings.json` (context/thinking-budget/temperature) still applies because it's keyed by `<provider>/<model-id>`, which the client picks regardless of where the server lives.
|
|
137
|
+
|
|
100
138
|
All small-model-specific extensions auto-disable for large/cloud models so they don't interfere.
|
|
101
139
|
|
|
102
140
|
---
|
|
@@ -140,7 +178,7 @@ Example — switch the llama.cpp port and bump `qwen3.6-35b-a3b` to a 150K conte
|
|
|
140
178
|
|
|
141
179
|
Then verify with `little-coder --list-models` — you should see your overridden entry.
|
|
142
180
|
|
|
143
|
-
`LLAMACPP_BASE_URL` and `OLLAMA_BASE_URL` env vars still beat both files for those two providers.
|
|
181
|
+
`LLAMACPP_BASE_URL`, `OLLAMA_BASE_URL`, and `LMSTUDIO_BASE_URL` env vars still beat both files for those three providers.
|
|
144
182
|
|
|
145
183
|
`.pi/settings.json` is a separate concern: it controls per-model **profiles** (context_limit, thinking_budget, temperature, benchmark_overrides) referenced by the `<provider>/<id>` key. Profiles don't register or describe models — they only tune how little-coder runs against models that are already registered.
|
|
146
184
|
|
|
@@ -178,8 +216,8 @@ Write/Edit confirmations are pi's responsibility; little-coder doesn't intercept
|
|
|
178
216
|
| [**v0.0.2**](https://github.com/itayinbarr/little-coder/releases/tag/v0.0.2) (commit `1d62bde`) — the paper | Qwen3.5-9B via Ollama | Aider Polyglot (225 exercises) | **45.56 %** mean of two runs; matched-model vanilla Aider baseline 19.11 %. Paper: [*Honey, I Shrunk the Coding Agent* on Substack](https://open.substack.com/pub/itayinbarr/p/honey-i-shrunk-the-coding-agent). |
|
|
179
217
|
| [**v0.0.5**](https://github.com/itayinbarr/little-coder/releases/tag/v0.0.5) — pre-pi Python | Qwen3.6-35B-A3B via llama.cpp | Aider Polyglot | **78.67 %**. [Full narrative](docs/benchmark-qwen3.6-35b-a3b.md). |
|
|
180
218
|
| [**v0.1.4**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.4) — on pi | Qwen3.6-35B-A3B via llama.cpp | Terminal-Bench-Core v0.1.1 (80 tasks) | **40.0 %** in 6 h 50 min. [Write-up](docs/benchmark-terminal-bench-v0.1.1.md). |
|
|
181
|
-
| [**v0.1.13**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.13) — on pi, TB 2.0 leaderboard | Qwen3.6-35B-A3B via llama.cpp | Terminal-Bench 2.0 (89 tasks × 5 trials = 445) | **
|
|
182
|
-
| [**v0.1.24**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.24) — on pi, TB 2.0 leaderboard, smaller model | Qwen3.5-9B (Q4_K_M) via llama.cpp (5.3 GB on GPU, 2× faster per-token than the 35B-A3B) | Terminal-Bench 2.0 (89 tasks × 5 trials = 445) | **9.
|
|
219
|
+
| [**v0.1.13**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.13) — on pi, TB 2.0 leaderboard | Qwen3.6-35B-A3B via llama.cpp | Terminal-Bench 2.0 (89 tasks × 5 trials = 445) | **24.6 % ± 3.2** — accepted to the [Terminal-Bench 2.0 leaderboard](https://www.tbench.ai/leaderboard/terminal-bench/2.0) (rank 120). |
|
|
220
|
+
| [**v0.1.24**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.24) — on pi, TB 2.0 leaderboard, smaller model | Qwen3.5-9B (Q4_K_M) via llama.cpp (5.3 GB on GPU, 2× faster per-token than the 35B-A3B) | Terminal-Bench 2.0 (89 tasks × 5 trials = 445) | **9.2 % ± 2.4** — accepted to the [Terminal-Bench 2.0 leaderboard](https://www.tbench.ai/leaderboard/terminal-bench/2.0) (rank 142). |
|
|
183
221
|
| [**v0.1.27**](https://github.com/itayinbarr/little-coder/releases/tag/v0.1.27) — on pi, GAIA validation | Qwen3.6-35B-A3B via llama.cpp | GAIA validation set (165 tasks) | **40.00 %** (66 / 165). L1 60.4 % / L2 37.2 % / L3 7.7 %. Test-split run pending. |
|
|
184
222
|
|
|
185
223
|
All runs used a consumer laptop: i9-14900HX, 32 GB RAM, **8 GB VRAM** on RTX 5070 Laptop (Blackwell). No cloud inference at any point.
|
|
@@ -188,17 +226,18 @@ All runs used a consumer laptop: i9-14900HX, 32 GB RAM, **8 GB VRAM** on RTX 507
|
|
|
188
226
|
|
|
189
227
|
## Roadmap
|
|
190
228
|
|
|
191
|
-
|
|
229
|
+
**Phase 1 — wide benchmark baseline: complete.** The paper established that scaffold–model fit moves a 9.7 B model from 19 % to 45 % on Aider Polyglot, and the goal of Phase 1 was to find out how wide that impact radius is. We now have a four-benchmark baseline on a single laptop-class GPU:
|
|
230
|
+
|
|
231
|
+
1. **Aider Polyglot** — 45.56 % (paper, Qwen3.5-9B) and 78.67 % (v0.0.5, Qwen3.6-35B-A3B).
|
|
232
|
+
2. **Terminal-Bench-Core v0.1.1** — 40.0 % (v0.1.4).
|
|
233
|
+
3. **Terminal-Bench 2.0** — accepted to the [official leaderboard](https://www.tbench.ai/leaderboard/terminal-bench/2.0): Qwen3.6-35B-A3B at **24.6 % ± 3.2** (rank 120) and Qwen3.5-9B at **9.2 % ± 2.4** (rank 142). The v0.1.24 prompt-repetition fix (re-add tool descriptions + concision guideline, validated by a 4 / 4 pilot on the previously-regressing `prove-plus-comm` task) was the prompt for both submissions.
|
|
234
|
+
4. **GAIA** — validation set at v0.1.27: **40.00 %** (66 / 165) on Qwen3.6-35B-A3B. Per-level L1 60.4 % / L2 37.2 % / L3 7.7 %.
|
|
192
235
|
|
|
193
|
-
The
|
|
236
|
+
That spans short coding exercises (Polyglot), interactive shell-bound tasks (Terminal-Bench), and tool-using research (GAIA), all on the same scaffold. The data needed to choose what to fix next is now in hand.
|
|
194
237
|
|
|
195
|
-
|
|
196
|
-
2. **Terminal-Bench-Core v0.1.1** — done. 40.0 % (v0.1.4).
|
|
197
|
-
3. **Terminal-Bench 2.0** — done. Qwen3.6-35B-A3B at **23.82 %** ([PR #158](https://huggingface.co/datasets/harborframework/terminal-bench-2-leaderboard/discussions/158)) and Qwen3.5-9B at **9.21 %** ([PR #163](https://huggingface.co/datasets/harborframework/terminal-bench-2-leaderboard/discussions/163)), both awaiting maintainer merge. The v0.1.24 prompt-repetition fix (re-add tool descriptions + concision guideline, validated by a 4 / 4 pilot on the previously-regressing `prove-plus-comm` task) was the prompt for both submissions.
|
|
198
|
-
4. **GAIA** — validation set done at v0.1.27: **40.00 %** (66 / 165) on Qwen3.6-35B-A3B. Per-level L1 60.4 % / L2 37.2 % / L3 7.7 %. Test-split run (301 tasks) pending → leaderboard submission to follow.
|
|
199
|
-
5. **SWE-bench Verified** — after GAIA. Multi-file real-world patches; the longest-horizon test of whether the scaffolding generalizes past exercise-scale tasks.
|
|
238
|
+
**Phase 2 — iterative improvement on real-world tasks: starting now.** The motivating question shifts from *how wide is the impact radius?* to *which scaffolding changes compound on long-horizon real work?* The signal we have already points at concrete things to try — thinking-budget / quality-monitor behavior on long-horizon tasks, deliberate.py-style parallel branches on failure, better shell-session recovery for interactive-process traps, evidence-handling on multi-document GAIA L3 tasks — but the priority order comes from real-world use, not from a benchmark suite. Expect smaller, more frequent releases driven by what little-coder actually struggles with on day-to-day coding work.
|
|
200
239
|
|
|
201
|
-
**
|
|
240
|
+
**Future benchmarks (deferred).** New benchmarks like **ProgramBench**, SWE-bench Verified (multi-file real-world patches), and a GAIA test-split run come back into scope after Phase 2 has produced enough scaffolding signal to make a fresh measurement worth running. Re-benchmarking before the next round of changes lands would mostly re-measure the same baseline.
|
|
202
241
|
|
|
203
242
|
---
|
|
204
243
|
|
|
@@ -241,7 +280,7 @@ little-coder/
|
|
|
241
280
|
├── .pi/
|
|
242
281
|
│ ├── settings.json # per-model profiles + benchmark_overrides (terminal_bench, gaia)
|
|
243
282
|
│ └── extensions/ # 20 TypeScript extensions, auto-discovered by pi
|
|
244
|
-
│ ├── llama-cpp-provider/ # data-driven provider registration from models.json (+ user override file)
|
|
283
|
+
│ ├── llama-cpp-provider/ # data-driven provider registration from models.json — ships llamacpp, ollama, lmstudio (+ user override file)
|
|
245
284
|
│ ├── write-guard/ # Write refuses on existing files — the whitepaper invariant
|
|
246
285
|
│ ├── extra-tools/ # glob, webfetch, websearch (pi ships grep/find)
|
|
247
286
|
│ ├── skill-inject/ # per-turn tool-skill selection (error > recency > intent)
|
package/bin/little-coder.mjs
CHANGED
|
@@ -29,7 +29,9 @@ const here = dirname(fileURLToPath(import.meta.url));
|
|
|
29
29
|
const pkgRoot = resolve(here, "..");
|
|
30
30
|
|
|
31
31
|
// ---- 3. Resolve the bundled pi binary ----
|
|
32
|
-
const piBin = join(pkgRoot, "node_modules", ".bin", "pi");
|
|
32
|
+
const isWindows = process.platform === "win32";
|
|
33
|
+
const piBinBase = join(pkgRoot, "node_modules", ".bin", "pi");
|
|
34
|
+
const piBin = isWindows && existsSync(`${piBinBase}.cmd`) ? `${piBinBase}.cmd` : piBinBase;
|
|
33
35
|
if (!existsSync(piBin)) {
|
|
34
36
|
console.error(
|
|
35
37
|
`little-coder: cannot find pi at ${piBin}.\n` +
|
|
@@ -86,7 +88,11 @@ const piArgs = [
|
|
|
86
88
|
];
|
|
87
89
|
|
|
88
90
|
// ---- 7. Spawn pi in the user's cwd ----
|
|
89
|
-
const child = spawn(piBin, piArgs, {
|
|
91
|
+
const [spawnCmd, spawnArgs] = isWindows
|
|
92
|
+
? ["cmd.exe", ["/c", piBin, ...piArgs]]
|
|
93
|
+
: [piBin, piArgs];
|
|
94
|
+
|
|
95
|
+
const child = spawn(spawnCmd, spawnArgs, {
|
|
90
96
|
stdio: "inherit",
|
|
91
97
|
cwd: process.cwd(),
|
|
92
98
|
env: process.env,
|
package/models.json
CHANGED
|
@@ -49,6 +49,22 @@
|
|
|
49
49
|
"cost": { "input": 0, "output": 0, "cacheRead": 0, "cacheWrite": 0 }
|
|
50
50
|
}
|
|
51
51
|
]
|
|
52
|
+
},
|
|
53
|
+
"lmstudio": {
|
|
54
|
+
"api": "openai-completions",
|
|
55
|
+
"baseUrl": "http://127.0.0.1:1234/v1",
|
|
56
|
+
"apiKey": "LMSTUDIO_API_KEY",
|
|
57
|
+
"models": [
|
|
58
|
+
{
|
|
59
|
+
"id": "local-model",
|
|
60
|
+
"name": "LM Studio (currently-loaded local model)",
|
|
61
|
+
"reasoning": true,
|
|
62
|
+
"input": ["text"],
|
|
63
|
+
"contextWindow": 32768,
|
|
64
|
+
"maxTokens": 4096,
|
|
65
|
+
"cost": { "input": 0, "output": 0, "cacheRead": 0, "cacheWrite": 0 }
|
|
66
|
+
}
|
|
67
|
+
]
|
|
52
68
|
}
|
|
53
69
|
}
|
|
54
70
|
}
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "little-coder",
|
|
3
|
-
"version": "1.1.0",
|
|
3
|
+
"version": "1.2.1",
|
|
4
4
|
"description": "A pi-based coding agent optimized for small local language models. Reproduces the whitepaper's scaffold-model-fit adaptations as pi extensions.",
|
|
5
5
|
"homepage": "https://github.com/itayinbarr/little-coder",
|
|
6
6
|
"repository": {
|
package/skills/tools/edit.md
CHANGED
|
@@ -9,22 +9,28 @@ user-invocable: false
|
|
|
9
9
|
## Edit Tool
|
|
10
10
|
Replace exact text in a file. This is the **default tool for changing any existing file** — prefer it over Write for anything except creating a new file from scratch.
|
|
11
11
|
|
|
12
|
-
REQUIRED:
|
|
13
|
-
OPTIONAL:
|
|
12
|
+
REQUIRED: path (absolute), edits (array of {oldText, newText})
|
|
13
|
+
OPTIONAL: none
|
|
14
14
|
|
|
15
15
|
RULES:
|
|
16
|
-
-
|
|
17
|
-
-
|
|
18
|
-
-
|
|
19
|
-
- To delete text: set
|
|
16
|
+
- Each `oldText` must match EXACTLY (whitespace, indentation, line endings all matter)
|
|
17
|
+
- Each `oldText` must be unique in the file — include 2-3 lines of surrounding context if needed
|
|
18
|
+
- `edits` is matched against the **original** file, not after earlier edits apply — do not overlap or nest
|
|
19
|
+
- To delete text: set `newText` to ""
|
|
20
20
|
- Read the file first if you do not already have its current content
|
|
21
|
+
- Batch multiple disjoint changes in one call by passing multiple `edits[]` entries
|
|
21
22
|
|
|
22
|
-
EXAMPLE:
|
|
23
|
+
EXAMPLE (single change):
|
|
23
24
|
```tool
|
|
24
|
-
{"name": "Edit", "input": {"
|
|
25
|
+
{"name": "Edit", "input": {"path": "/absolute/path/file.py", "edits": [{"oldText": "def hello():\n return 1", "newText": "def hello():\n return 2"}]}}
|
|
26
|
+
```
|
|
27
|
+
|
|
28
|
+
EXAMPLE (two changes in one call):
|
|
29
|
+
```tool
|
|
30
|
+
{"name": "Edit", "input": {"path": "/absolute/path/file.py", "edits": [{"oldText": "MAX = 10", "newText": "MAX = 20"}, {"oldText": "TIMEOUT = 5", "newText": "TIMEOUT = 30"}]}}
|
|
25
31
|
```
|
|
26
32
|
|
|
27
33
|
RECOVERY WHEN Edit FAILS:
|
|
28
34
|
- "String not found" → Read the file to get the exact current content (whitespace often differs), then retry Edit with the exact string
|
|
29
|
-
- "Found multiple times" → include more surrounding context so
|
|
30
|
-
- Do NOT fall back to Write just because Edit failed once — re-read, fix
|
|
35
|
+
- "Found multiple times" → include more surrounding context so `oldText` is unique, then retry Edit
|
|
36
|
+
- Do NOT fall back to Write just because Edit failed once — re-read, fix `oldText`, retry. Write is almost always the wrong recovery here for an existing file.
|
package/skills/tools/grep.md
CHANGED
|
@@ -10,17 +10,18 @@ user-invocable: false
|
|
|
10
10
|
Search file contents with regex. Uses ripgrep.
|
|
11
11
|
|
|
12
12
|
REQUIRED: pattern (regex pattern)
|
|
13
|
-
OPTIONAL: path (directory/file),
|
|
13
|
+
OPTIONAL: path (directory/file), glob (file glob filter like "*.py"), ignoreCase (bool), literal (bool — treat pattern as literal text), context (lines of context before/after), limit (max matches, default 100)
|
|
14
14
|
|
|
15
15
|
RULES:
|
|
16
|
-
- Supports full regex syntax
|
|
17
|
-
- Use
|
|
16
|
+
- Supports full regex syntax (unless `literal: true`)
|
|
17
|
+
- Use `glob` to filter by file type (e.g. "*.py", "*.js")
|
|
18
|
+
- Use `limit` to cap results; default 100
|
|
18
19
|
- Returns matching lines with file path and line number
|
|
19
20
|
- Good for finding function definitions, imports, references
|
|
20
21
|
|
|
21
22
|
EXAMPLE:
|
|
22
23
|
```tool
|
|
23
|
-
{"name": "Grep", "input": {"pattern": "def main", "include": "*.py"}}
|
|
24
|
+
{"name": "Grep", "input": {"pattern": "def main", "glob": "*.py"}}
|
|
24
25
|
```
|
|
25
26
|
|
|
26
27
|
EXAMPLE with path:
|
package/skills/tools/read.md
CHANGED
|
@@ -9,7 +9,7 @@ user-invocable: false
|
|
|
9
9
|
## Read Tool
|
|
10
10
|
Read a file's contents with line numbers.
|
|
11
11
|
|
|
12
|
-
REQUIRED:
|
|
12
|
+
REQUIRED: path (absolute path)
|
|
13
13
|
OPTIONAL: limit (max lines), offset (start line, 0-indexed)
|
|
14
14
|
|
|
15
15
|
RULES:
|
|
@@ -19,10 +19,10 @@ RULES:
|
|
|
19
19
|
|
|
20
20
|
EXAMPLE:
|
|
21
21
|
```tool
|
|
22
|
-
{"name": "Read", "input": {"file_path": "/absolute/path/to/file.py"}}
|
|
22
|
+
{"name": "Read", "input": {"path": "/absolute/path/to/file.py"}}
|
|
23
23
|
```
|
|
24
24
|
|
|
25
25
|
EXAMPLE with range:
|
|
26
26
|
```tool
|
|
27
|
-
{"name": "Read", "input": {"file_path": "/absolute/path/to/file.py", "limit": 50, "offset": 100}}
|
|
27
|
+
{"name": "Read", "input": {"path": "/absolute/path/to/file.py", "limit": 50, "offset": 100}}
|
|
28
28
|
```
|
package/skills/tools/write.md
CHANGED
|
@@ -9,7 +9,7 @@ user-invocable: false
|
|
|
9
9
|
## Write Tool
|
|
10
10
|
Create a **new** file with the given content. Creates parent directories automatically.
|
|
11
11
|
|
|
12
|
-
REQUIRED:
|
|
12
|
+
REQUIRED: path (absolute), content (full file content)
|
|
13
13
|
|
|
14
14
|
**Write is for creating new files only.** If the file already exists, Write will be **refused** by the tool and return an error telling you to use Edit instead. Do not retry Write on the same path — it will be refused again.
|
|
15
15
|
|
|
@@ -17,13 +17,13 @@ WHEN TO USE Write:
|
|
|
17
17
|
- The file does not exist yet and you are creating it from scratch
|
|
18
18
|
|
|
19
19
|
WHEN TO USE Edit INSTEAD:
|
|
20
|
-
- ANY change to an existing file — bug fixes, refactors, format tweaks, adding a function, renaming a variable, everything. Edit takes
|
|
20
|
+
- ANY change to an existing file — bug fixes, refactors, format tweaks, adding a function, renaming a variable, everything. Edit takes `path` + `edits: [{oldText, newText}]` and patches in place.
|
|
21
21
|
- Iterating after a failed test — never retype the whole file
|
|
22
22
|
|
|
23
|
-
If you need to completely replace an existing file's content, Edit can still do that: pass the entire current content as
|
|
23
|
+
If you need to completely replace an existing file's content, Edit can still do that: pass the entire current content as `oldText` and the full new content as `newText`. Read the file first if you don't already have its current content.
|
|
24
24
|
|
|
25
25
|
EXAMPLE:
|
|
26
26
|
```tool
|
|
27
|
-
{"name": "Write", "input": {"file_path": "/tmp/example/new_module.py", "content": "def hello():\n return 'hi'\n"}}
|
|
27
|
+
{"name": "Write", "input": {"path": "/tmp/example/new_module.py", "content": "def hello():\n return 'hi'\n"}}
|
|
28
28
|
```
|
|
29
29
|
NOTE: Always use the EXACT file path given in the task, never a placeholder.
|
package/skills/tools/agent.md
DELETED
|
@@ -1,24 +0,0 @@
|
|
|
1
|
-
---
|
|
2
|
-
name: agent-guidance
|
|
3
|
-
type: tool-guidance
|
|
4
|
-
target_tool: Agent
|
|
5
|
-
priority: 6
|
|
6
|
-
token_cost: 120
|
|
7
|
-
user-invocable: false
|
|
8
|
-
---
|
|
9
|
-
## Agent Tool
|
|
10
|
-
Spawn a sub-agent to handle a task autonomously.
|
|
11
|
-
|
|
12
|
-
REQUIRED: prompt (task description for the sub-agent)
|
|
13
|
-
OPTIONAL: subagent_type (coder/reviewer/researcher/tester/general-purpose), name (for messaging), isolation ("worktree" for git isolation)
|
|
14
|
-
|
|
15
|
-
RULES:
|
|
16
|
-
- Use for independent tasks that don't need your direct attention
|
|
17
|
-
- Sub-agents get their own context and can use all tools
|
|
18
|
-
- Use subagent_type to get specialized behavior
|
|
19
|
-
- Use isolation="worktree" when the agent needs to modify files independently
|
|
20
|
-
|
|
21
|
-
EXAMPLE:
|
|
22
|
-
```tool
|
|
23
|
-
{"name": "Agent", "input": {"prompt": "Find all Python files that import requests and list them", "subagent_type": "researcher"}}
|
|
24
|
-
```
|