localcoder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- localcoder/__init__.py +2 -0
- localcoder/__main__.py +2 -0
- localcoder/agent.py +35 -0
- localcoder/backends.py +2470 -0
- localcoder/bench.py +335 -0
- localcoder/cli.py +827 -0
- localcoder/gemma4coder_display.py +583 -0
- localcoder/setup.py +321 -0
- localcoder/tui.py +276 -0
- localcoder/voice.py +187 -0
- localcoder-0.1.0.dist-info/METADATA +187 -0
- localcoder-0.1.0.dist-info/RECORD +15 -0
- localcoder-0.1.0.dist-info/WHEEL +4 -0
- localcoder-0.1.0.dist-info/entry_points.txt +2 -0
- localcoder-0.1.0.dist-info/licenses/LICENSE +4 -0
localcoder/bench.py
ADDED
|
@@ -0,0 +1,335 @@
|
|
|
1
|
+
"""localcoder bench — local model arena. Download, test, rank models automatically."""
|
|
2
|
+
import json, os, time, subprocess, urllib.request, urllib.parse
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
from rich.console import Console
|
|
5
|
+
from rich.table import Table
|
|
6
|
+
from rich.panel import Panel
|
|
7
|
+
from rich.markup import escape
|
|
8
|
+
|
|
9
|
+
# Shared rich console used for every message this module prints.
console = Console()

# JSON file in the user's home directory where benchmark results are kept.
BENCH_FILE = Path.home().joinpath(".localcoder", "benchmarks.json")

# First location tried for the llama-server binary.
LLAMA_SERVER = Path.home().joinpath(".unsloth/llama.cpp/llama-server")

PORT = 8099  # dedicated bench port
|
|
13
|
+
|
|
14
|
+
# ── Test Suite ──
|
|
15
|
+
def _check_code_function(reply):
    """Score 0-3: one point each for a ``def``, a type hint and a docstring."""
    has_def = "def " in reply
    has_hint = "->" in reply or ": str" in reply
    has_docstring = '"""' in reply or "'''" in reply
    return has_def + has_hint + has_docstring


def _check_code_debug(reply):
    """Score 0-3: the ``mid + 1`` fix, an explanation keyword, and code or 'fix'."""
    lowered = reply.lower()
    has_fix = "left = mid + 1" in reply or "left = mid+1" in reply
    has_explanation = any(word in lowered for word in ("infinite", "loop", "bug"))
    has_code = "def " in reply or "fix" in lowered
    return has_fix + has_explanation + has_code


def _check_reasoning(reply):
    """Score 0-4: each item mentioned, plus a movement verb used at least twice."""
    lowered = reply.lower()
    has_trips = any(lowered.count(verb) >= 2 for verb in ("cross", "take", "bring"))
    return ("chicken" in lowered) + ("fox" in lowered) + ("grain" in lowered) + has_trips


def _check_tool_json(reply):
    """Score 0-4: quoted tool name, quoted parameter name, 'paris', and a brace."""
    has_tool = '"search"' in reply or "'search'" in reply
    has_param = '"query"' in reply or "'query'" in reply
    has_city = "paris" in reply.lower()
    has_brace = "{" in reply
    return has_tool + has_param + has_city + has_brace


def _check_follow_instructions(reply):
    """Score 0-3: mentions Python, at most 7 non-blank lines, another P language."""
    lowered = reply.lower()
    non_blank = sum(1 for line in reply.strip().splitlines() if line.strip())
    has_other = any(lang in lowered for lang in ("perl", "php", "prolog"))
    return ("python" in lowered) + (non_blank <= 7) + has_other


# Each entry holds the prompt sent to the model and a "check" callable that
# turns the reply text into an integer score between 0 and "max_score".
TESTS = [
    {
        "id": "code_function",
        "name": "Write Function",
        "category": "coding",
        "prompt": "Write a Python function that finds the longest palindromic substring. Include type hints and docstring.",
        "check": _check_code_function,
        "max_score": 3,
    },
    {
        "id": "code_debug",
        "name": "Debug Code",
        "category": "coding",
        "prompt": (
            "This Python code has a bug. Fix it and explain:\n\n"
            "def binary_search(arr, target):\n"
            " left, right = 0, len(arr)\n"
            " while left < right:\n"
            " mid = (left + right) // 2\n"
            " if arr[mid] == target:\n"
            " return mid\n"
            " elif arr[mid] < target:\n"
            " left = mid\n"
            " else:\n"
            " right = mid\n"
            " return -1"
        ),
        "check": _check_code_debug,
        "max_score": 3,
    },
    {
        "id": "reasoning",
        "name": "Logic Puzzle",
        "category": "reasoning",
        "prompt": "A farmer has a fox, a chicken, and a sack of grain. He must cross a river in a boat that can only carry him and one item. If left alone, the fox will eat the chicken and the chicken will eat the grain. How does he get everything across?",
        "check": _check_reasoning,
        "max_score": 4,
    },
    {
        "id": "tool_json",
        "name": "Tool Call JSON",
        "category": "tool_use",
        "prompt": 'You have a tool called "search" with parameter "query" (string). The user says: "find the weather in Paris". Respond with a JSON tool call.',
        "check": _check_tool_json,
        "max_score": 4,
    },
    {
        "id": "follow_instructions",
        "name": "Follow Instructions",
        "category": "instruction",
        "prompt": "List exactly 5 programming languages that start with the letter P. Output only the list, nothing else.",
        "check": _check_follow_instructions,
        "max_score": 3,
    },
]
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def _load_results():
    """Load previously saved benchmark results from ``BENCH_FILE``.

    Returns:
        The dict stored in ``BENCH_FILE``. An empty dict is returned when the
        file is missing or unreadable, does not contain valid JSON, or its
        top-level value is not a JSON object.
    """
    try:
        loaded = json.loads(BENCH_FILE.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError covers a missing or unreadable file; ValueError covers
        # invalid JSON and undecodable bytes. A damaged results file should
        # not abort a benchmark run, so start from an empty mapping.
        return {}
    # Callers index the result by model name and call .values() on it, so
    # any other top-level JSON type is treated as "no results".
    return loaded if isinstance(loaded, dict) else {}
|
|
67
|
+
|
|
68
|
+
|
|
69
|
+
def _save_results(results):
    """Write ``results`` to ``BENCH_FILE`` as indented JSON.

    The parent directory is created first when it does not exist yet.
    """
    os.makedirs(BENCH_FILE.parent, exist_ok=True)
    # Serialise before opening so a failing dump never truncates the file.
    serialized = json.dumps(results, indent=2)
    with BENCH_FILE.open("w") as handle:
        handle.write(serialized)
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
def _find_gguf(name_pattern):
    """Find a GGUF file in the Hugging Face cache whose name matches a pattern.

    Matching is case-insensitive and ignores hyphens on both sides. Files
    with "mmproj" in their name are skipped.

    Args:
        name_pattern: Substring to look for in the file name.

    Returns:
        The resolved path of the first matching file as a string, or ``None``
        when nothing matches.
    """
    # Normalise the pattern once instead of on every iteration.
    wanted = name_pattern.lower().replace("-", "")
    cache = Path.home() / ".cache/huggingface/hub"
    for candidate in cache.rglob("*.gguf"):
        file_name = candidate.name.lower()
        if "mmproj" in file_name:
            continue
        if wanted in file_name.replace("-", ""):
            # Resolve only the file that is actually returned.
            return str(candidate.resolve())
    return None
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
def _find_mmproj(model_dir_pattern):
    """Find an mmproj file inside a matching Hugging Face cache directory.

    Top-level cache entries are matched against ``model_dir_pattern``
    case-insensitively and ignoring hyphens. Each matching entry is searched
    recursively for a path containing "mmproj".

    Args:
        model_dir_pattern: Substring to look for in the cache entry name.

    Returns:
        The resolved path of the first mmproj file found as a string, or
        ``None`` when the cache directory does not exist or nothing matches.
    """
    cache = Path.home() / ".cache/huggingface/hub"
    if not cache.is_dir():
        # iterdir() raises on a missing directory; report "not found" instead.
        return None

    wanted = model_dir_pattern.lower().replace("-", "")
    for entry in cache.iterdir():
        if wanted not in entry.name.lower().replace("-", ""):
            continue
        projector = next(entry.rglob("*mmproj*"), None)
        if projector is not None:
            return str(projector.resolve())
    return None
|
|
95
|
+
|
|
96
|
+
|
|
97
|
+
def _start_server(gguf_path, extra_flags=""):
    """Start llama-server with a model and wait until ``/health`` answers.

    The binary at ``LLAMA_SERVER`` is used when it exists; otherwise
    ``llama-server`` is looked up on PATH.

    Args:
        gguf_path: Path of the GGUF model file, passed to ``-m``.
        extra_flags: Additional command-line flags as one shell-style string.

    Returns:
        The ``subprocess.Popen`` object once the health endpoint responds, or
        ``None`` when no binary is found, the process cannot be launched, it
        exits before becoming healthy, or it is not healthy after 60 polls.
    """
    import shlex
    import shutil

    binary = str(LLAMA_SERVER)
    if not os.path.exists(binary):
        binary = shutil.which("llama-server")
        if not binary:
            return None

    # Build an argument list rather than splitting one formatted string, so
    # a binary or model path containing spaces stays a single argument.
    cmd = [
        binary,
        "-m", str(gguf_path),
        "--port", str(PORT),
        "-ngl", "99",
        "-c", "8192",
        "-fa", "on",
        "-ctk", "q4_0",
        "-ctv", "q4_0",
        "--jinja",
        *shlex.split(extra_flags or ""),
    ]
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError:
        # The binary exists but could not be executed.
        return None

    health_url = f"http://127.0.0.1:{PORT}/health"
    for _ in range(60):
        if proc.poll() is not None:
            # The server already exited (bad flags, port in use, ...), so
            # waiting out the remaining polls would be pointless.
            return None
        try:
            with urllib.request.urlopen(health_url, timeout=1):
                pass
        except Exception:
            # Not reachable or not ready yet: best-effort polling, retry.
            time.sleep(1)
            continue
        if proc.poll() is None:
            return proc
        # Something answered on the port, but our process is gone.
        return None

    proc.kill()
    proc.wait()  # reap the child so it does not linger as a zombie
    return None
|
|
119
|
+
|
|
120
|
+
|
|
121
|
+
def _run_test(test, max_tokens=512):
    """Run a single test against the bench server and score the reply.

    Args:
        test: One entry of ``TESTS``; its "prompt", "check" and "max_score"
            keys are used.
        max_tokens: Completion token limit sent with the request.

    Returns:
        A dict with "score", "max_score", "tps", "tokens", "time" and "ttft".
        If the request or the scoring fails, the numeric fields are 0 and an
        "error" key holds the exception text.
    """
    payload = json.dumps({
        "model": "bench",
        "messages": [{"role": "user", "content": test["prompt"]}],
        "max_tokens": max_tokens,
        "temperature": 0.1,
    }).encode()

    try:
        req = urllib.request.Request(
            f"http://127.0.0.1:{PORT}/v1/chat/completions",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        # perf_counter is monotonic, so a clock adjustment cannot skew timing.
        started = time.perf_counter()
        with urllib.request.urlopen(req, timeout=120) as resp:
            data = json.loads(resp.read())
        elapsed = time.perf_counter() - started

        # "content" can be present but null; treat that as an empty reply so
        # the test is scored instead of failing inside the check.
        content = data["choices"][0]["message"].get("content") or ""
        tokens = data.get("usage", {}).get("completion_tokens", len(content.split()))
        tps = tokens / elapsed if elapsed > 0 else 0
        score = int(test["check"](content))

        # The previous formula, elapsed - tokens / tps, is elapsed - elapsed
        # by construction and carried no information. Use the prompt
        # processing time the server reports, when it is present.
        # NOTE(review): assumes the response has a "timings" object with
        # "prompt_ms", as llama-server emits; confirm for other backends.
        timings = data.get("timings")
        prompt_ms = timings.get("prompt_ms") if isinstance(timings, dict) else None
        ttft = round(prompt_ms / 1000, 2) if isinstance(prompt_ms, (int, float)) else 0

        return {
            "score": score,
            "max_score": test["max_score"],
            "tps": round(tps, 1),
            "tokens": tokens,
            "time": round(elapsed, 1),
            "ttft": ttft,
        }
    except Exception as e:
        # One failing test must not abort the whole run; record the error.
        return {
            "score": 0,
            "max_score": test["max_score"],
            "tps": 0,
            "tokens": 0,
            "time": 0,
            "ttft": 0,
            "error": str(e),
        }
|
|
156
|
+
|
|
157
|
+
|
|
158
|
+
def _download_model(repo, filename):
    """Download a GGUF file from Hugging Face.

    Args:
        repo: Repository id passed to ``hf_hub_download`` as ``repo_id``.
        filename: Name of the file inside the repository.

    Returns:
        The path returned by ``hf_hub_download``, or ``None`` when
        ``huggingface_hub`` is not installed or the download fails. The
        reason is printed in both cases.
    """
    try:
        from huggingface_hub import hf_hub_download
    except ImportError:
        console.print(" [red]pip install huggingface_hub[/]")
        return None

    try:
        return hf_hub_download(repo_id=repo, filename=filename)
    except Exception as e:
        # Escape the message: text such as "[Errno 28]" would otherwise be
        # parsed as rich markup and dropped or rejected.
        console.print(f" [red]{escape(str(e))}[/]")
        return None
|
|
170
|
+
|
|
171
|
+
|
|
172
|
+
def bench_model(name, gguf_path, extra_flags="", skip_if_cached=True):
    """Benchmark a single model and store its results.

    Args:
        name: Key under which the results are stored and displayed.
        gguf_path: Path of the GGUF model file to load.
        extra_flags: Additional llama-server flags as one string.
        skip_if_cached: When true and results for ``name`` already exist,
            return them without running anything.

    Returns:
        The results dict for the model, or ``None`` when the server could not
        be started.
    """
    results = _load_results()
    # Model names come from file names; escape them so brackets are printed
    # literally instead of being parsed as rich markup.
    shown = escape(name)

    if skip_if_cached and name in results:
        console.print(f" [dim]{shown}: cached results (run --bench --force to re-test)[/]")
        return results[name]

    console.print(f"\n [bold cyan]{shown}[/]")
    console.print(" [dim]Loading model...[/]")

    proc = _start_server(gguf_path, extra_flags)
    if not proc:
        console.print(" [red]Failed to start server[/]")
        return None

    console.print(f" [green]Server ready[/] — running {len(TESTS)} tests")

    model_results = {"name": name, "path": gguf_path, "tests": {}, "timestamp": time.strftime("%Y-%m-%d %H:%M")}
    total_score = 0
    total_max = 0
    total_tps = []

    try:
        for test in TESTS:
            console.print(f" {test['name']:<22}", end="")
            r = _run_test(test)
            stars = r["score"]
            max_s = r["max_score"]
            total_score += stars
            total_max += max_s
            if r["tps"] > 0:
                # Failed tests report 0 tok/s and are left out of the average.
                total_tps.append(r["tps"])

            bar = "[green]" + "★" * stars + "[/][dim]" + "☆" * (max_s - stars) + "[/]"
            console.print(f" {bar} {r['tps']:>5.1f} tok/s {r['time']:>5.1f}s")

            model_results["tests"][test["id"]] = r
    finally:
        # Always stop the server, even on Ctrl-C, so no llama-server process
        # is left holding the bench port and the model's memory.
        proc.kill()
        proc.wait()

    avg_tps = sum(total_tps) / len(total_tps) if total_tps else 0
    model_results["total_score"] = total_score
    model_results["total_max"] = total_max
    model_results["avg_tps"] = round(avg_tps, 1)
    model_results["pct"] = round(total_score / total_max * 100) if total_max > 0 else 0

    console.print(f" {'─' * 50}")
    console.print(f" [bold]Total: {total_score}/{total_max} ({model_results['pct']}%) · {avg_tps:.1f} tok/s avg[/]")

    # Short pause after shutdown before the caller moves to the next model.
    time.sleep(2)

    # Save
    results[name] = model_results
    _save_results(results)

    return model_results
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def show_leaderboard():
    """Print the saved benchmark results as a ranked table.

    Models are ordered by percentage score, then by average tokens per
    second, both descending. A hint is printed instead when there are no
    results.
    """
    import platform

    # Skip anything that is not a per-model dict so a hand-edited results
    # file cannot crash the sort below.
    entries = [entry for entry in _load_results().values() if isinstance(entry, dict)]
    if not entries:
        console.print("\n [dim]No benchmarks yet. Run: localcoder --bench[/]\n")
        return

    console.print()

    table = Table(
        title="[bold]localcoder Arena — Model Leaderboard[/]",
        title_style="bold #e07a5f",
        show_header=True, header_style="bold",
        border_style="dim", padding=(0, 1),
    )
    table.add_column("#", style="bold", width=3)
    table.add_column("Model", width=28)
    table.add_column("Score", justify="center", width=8)
    table.add_column("tok/s", justify="right", width=7)
    table.add_column("Code", justify="center", width=6)
    table.add_column("Reason", justify="center", width=6)
    table.add_column("Tools", justify="center", width=6)
    # Eight wide so the "Instruct" header fits without truncation.
    table.add_column("Instruct", justify="center", width=8)
    table.add_column("", width=16)

    # One item per category column: the test ids it sums, and the max score
    # assumed for a test that is missing from the stored results.
    categories = (
        (("code_function", "code_debug"), 3),
        (("reasoning",), 4),
        (("tool_json",), 4),
        (("follow_instructions",), 3),
    )
    medals = {1: "[bold yellow]🥇[/]", 2: "[white]🥈[/]", 3: "[#cd7f32]🥉[/]"}
    bar_w = 14

    # Sort by score, then speed
    ranked = sorted(entries, key=lambda x: (x.get("pct", 0), x.get("avg_tps", 0)), reverse=True)

    for rank, r in enumerate(ranked, 1):
        pct = r.get("pct", 0)
        tps = r.get("avg_tps", 0)
        tests = r.get("tests", {})

        cells = []
        for test_ids, default_max in categories:
            got = sum(tests.get(t, {}).get("score", 0) for t in test_ids)
            out_of = sum(tests.get(t, {}).get("max_score", default_max) for t in test_ids)
            cells.append(f"{got}/{out_of}")

        # Visual bar, clamped so an out-of-range pct cannot overflow it.
        filled = max(0, min(bar_w, int(pct / 100 * bar_w)))
        bc = "green" if pct >= 80 else "yellow" if pct >= 50 else "red"
        bar = f"[{bc}]{'█' * filled}[/{bc}][dim]{'░' * (bar_w - filled)}[/]"

        table.add_row(
            medals.get(rank, " "),
            escape(r.get("name", "?")),
            f"[bold]{pct}%[/]",
            f"{tps:.0f}",
            *cells,
            bar,
        )

    console.print(table)
    # Report the machine this is running on instead of a fixed hardware name.
    hardware = escape(f"{platform.system()} {platform.machine()}".strip() or "?")
    console.print(f"\n [dim]Tested on {ranked[0].get('timestamp', '?')} · {hardware} · llama.cpp[/]")
    console.print(" [dim]Run [bold]localcoder --bench[/bold] to test more models · Results in ~/.localcoder/benchmarks.json[/]\n")
|
|
300
|
+
|
|
301
|
+
|
|
302
|
+
def run_full_bench(force=False):
    """Benchmark every installed model, then print the leaderboard.

    Args:
        force: When true, models are re-tested even if results are cached.
    """
    from localcoder.backends import get_disk_info

    console.print("\n [bold #e07a5f]localcoder Arena[/] — benchmarking all installed models\n")

    disk_info = get_disk_info()
    queue = []

    for model in disk_info.get("models", []):
        label = model["name"].replace(".gguf", "")
        size_gb = model["size_gb"]
        location = model["path"]

        # Models above 16384 MB are skipped.
        if size_gb * 1024 > 16384:
            console.print(f" [dim]Skip {label} ({size_gb}GB) — won't fit GPU[/]")
            continue

        # Pick server flags from the model family found in the name.
        lowered = label.lower()
        if "gemma" in lowered:
            flags = "--reasoning off --no-mmproj"
        elif "qwen" in lowered:
            flags = "--reasoning-budget 0"
        else:
            flags = ""

        queue.append((label, location, flags))

    console.print(f" Testing {len(queue)} models...\n")

    use_cache = not force
    for label, location, flags in queue:
        bench_model(label, location, flags, skip_if_cached=use_cache)

    console.print()
    show_leaderboard()
|