localcoder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
localcoder/bench.py ADDED
@@ -0,0 +1,335 @@
1
+ """localcoder bench — local model arena. Download, test, rank models automatically."""
2
+ import json, os, time, subprocess, urllib.request, urllib.parse
3
+ from pathlib import Path
4
+ from rich.console import Console
5
+ from rich.table import Table
6
+ from rich.panel import Panel
7
+ from rich.markup import escape
8
+
9
# Shared rich console used for every message this module prints.
console = Console()

# JSON file where benchmark results are persisted between runs.
BENCH_FILE = Path.home().joinpath(".localcoder", "benchmarks.json")

# Preferred llama-server binary location; _start_server falls back to PATH
# when this file does not exist.
LLAMA_SERVER = Path.home().joinpath(".unsloth", "llama.cpp", "llama-server")

PORT = 8099  # dedicated bench port
13
+
14
# ── Test Suite ──
# Each scorer receives the raw model response text and returns an integer:
# one point per heuristic that the response satisfies.


def _check_code_function(r):
    """Score: defines a function, has type hints, has a docstring."""
    defines_function = "def " in r
    has_type_hints = "->" in r or ": str" in r
    has_docstring = '"""' in r or "'''" in r
    return defines_function + has_type_hints + has_docstring


def _check_code_debug(r):
    """Score: contains the corrected update, explains the bug, shows a fix."""
    text = r.lower()
    has_corrected_line = "left = mid + 1" in r or "left = mid+1" in r
    explains_bug = any(word in text for word in ("infinite", "loop", "bug"))
    shows_fix = "def " in r or "fix" in text
    return has_corrected_line + explains_bug + shows_fix


def _check_reasoning(r):
    """Score: mentions each item and describes at least two trips."""
    text = r.lower()
    items_mentioned = sum(item in text for item in ("chicken", "fox", "grain"))
    describes_trips = any(text.count(verb) >= 2 for verb in ("cross", "take", "bring"))
    return items_mentioned + describes_trips


def _check_tool_json(r):
    """Score: names the tool, names the parameter, mentions Paris, has a brace."""
    names_tool = '"search"' in r or "'search'" in r
    names_param = '"query"' in r or "'query'" in r
    mentions_city = "paris" in r.lower()
    has_brace = "{" in r
    return names_tool + names_param + mentions_city + has_brace


def _check_follow_instructions(r):
    """Score: mentions Python, stays within 7 non-blank lines, names another P language."""
    text = r.lower()
    non_blank_lines = [line for line in r.strip().splitlines() if line.strip()]
    mentions_python = text.count("python") >= 1
    is_short = len(non_blank_lines) <= 7
    names_other = any(lang in text for lang in ("perl", "php", "prolog"))
    return mentions_python + is_short + names_other


TESTS = [
    {
        "id": "code_function",
        "name": "Write Function",
        "category": "coding",
        "prompt": "Write a Python function that finds the longest palindromic substring. Include type hints and docstring.",
        "check": _check_code_function,
        "max_score": 3,
    },
    {
        "id": "code_debug",
        "name": "Debug Code",
        "category": "coding",
        "prompt": "This Python code has a bug. Fix it and explain:\n\ndef binary_search(arr, target):\n left, right = 0, len(arr)\n while left < right:\n mid = (left + right) // 2\n if arr[mid] == target:\n return mid\n elif arr[mid] < target:\n left = mid\n else:\n right = mid\n return -1",
        "check": _check_code_debug,
        "max_score": 3,
    },
    {
        "id": "reasoning",
        "name": "Logic Puzzle",
        "category": "reasoning",
        "prompt": "A farmer has a fox, a chicken, and a sack of grain. He must cross a river in a boat that can only carry him and one item. If left alone, the fox will eat the chicken and the chicken will eat the grain. How does he get everything across?",
        "check": _check_reasoning,
        "max_score": 4,
    },
    {
        "id": "tool_json",
        "name": "Tool Call JSON",
        "category": "tool_use",
        "prompt": 'You have a tool called "search" with parameter "query" (string). The user says: "find the weather in Paris". Respond with a JSON tool call.',
        "check": _check_tool_json,
        "max_score": 4,
    },
    {
        "id": "follow_instructions",
        "name": "Follow Instructions",
        "category": "instruction",
        "prompt": "List exactly 5 programming languages that start with the letter P. Output only the list, nothing else.",
        "check": _check_follow_instructions,
        "max_score": 3,
    },
]
57
+
58
+
59
def _load_results():
    """Load previously saved benchmark results.

    Returns:
        dict: the JSON object stored in BENCH_FILE (model name -> result
        record). An empty dict is returned when the file is missing,
        unreadable, not valid JSON, or does not hold a JSON object.
    """
    try:
        loaded = json.loads(BENCH_FILE.read_text(encoding="utf-8"))
    except (OSError, ValueError):
        # OSError: missing or unreadable file. ValueError: covers both
        # json.JSONDecodeError and UnicodeDecodeError. Loading stays
        # best-effort, but unrelated programming errors are no longer hidden.
        return {}

    # Callers index the result by model name and call .values() on it, so
    # anything other than a dict (e.g. a hand-edited list) is treated as empty.
    return loaded if isinstance(loaded, dict) else {}
67
+
68
+
69
def _save_results(results):
    """Persist benchmark results to BENCH_FILE as indented JSON.

    The data is written to a sibling temporary file and then moved into
    place with os.replace, so an interrupted write cannot leave a truncated
    results file behind.

    Args:
        results: JSON-serialisable mapping of model name -> result record.
    """
    BENCH_FILE.parent.mkdir(parents=True, exist_ok=True)
    serialized = json.dumps(results, indent=2)

    tmp_file = BENCH_FILE.with_name(BENCH_FILE.name + ".tmp")
    tmp_file.write_text(serialized, encoding="utf-8")
    os.replace(tmp_file, BENCH_FILE)
73
+
74
+
75
def _find_gguf(name_pattern):
    """Find a GGUF model file in the Hugging Face cache by name.

    Matching is case-insensitive and ignores hyphens. Files whose name
    contains "mmproj" are skipped.

    Args:
        name_pattern: substring to look for in the file name.

    Returns:
        The resolved path of the first matching file as a string, or None
        when nothing matches.
    """
    # Normalise the pattern once instead of once per candidate file.
    needle = name_pattern.lower().replace("-", "")
    cache = Path.home() / ".cache/huggingface/hub"

    for candidate in cache.rglob("*.gguf"):
        filename = candidate.name.lower()
        if "mmproj" in filename:
            continue
        if needle in filename.replace("-", ""):
            # Resolve only the file that is actually returned.
            return str(candidate.resolve())
    return None
85
+
86
+
87
def _find_mmproj(model_dir_pattern):
    """Find the mmproj file that belongs to a cached model directory.

    Top-level directories of the Hugging Face cache are matched against the
    pattern (case-insensitive, hyphens ignored); the first entry below a
    matching directory whose name contains "mmproj" is returned.

    Args:
        model_dir_pattern: substring to look for in the cache directory name.

    Returns:
        The resolved path as a string, or None when the cache directory is
        missing or nothing matches.
    """
    cache = Path.home() / ".cache/huggingface/hub"
    # iterdir() raises FileNotFoundError on a machine that has never
    # downloaded anything; treat that the same as "no match".
    if not cache.is_dir():
        return None

    needle = model_dir_pattern.lower().replace("-", "")
    for entry in cache.iterdir():
        if not entry.is_dir():
            continue
        if needle not in entry.name.lower().replace("-", ""):
            continue
        match = next(entry.rglob("*mmproj*"), None)
        if match is not None:
            return str(match.resolve())
    return None
95
+
96
+
97
def _start_server(gguf_path, extra_flags=""):
    """Start llama-server for one model and wait until it reports healthy.

    The binary at LLAMA_SERVER is preferred; otherwise "llama-server" is
    looked up on PATH. The health endpoint is polled once per second for up
    to 60 attempts.

    Args:
        gguf_path: path of the model file passed to ``-m``.
        extra_flags: additional command-line flags as a single shell-style
            string (split with shlex, so quoted values are supported).

    Returns:
        The running subprocess.Popen object, or None when no binary was
        found, the process could not be launched, it exited during startup,
        or it never became healthy.
    """
    import shlex
    import shutil

    binary = str(LLAMA_SERVER)
    if not os.path.exists(binary):
        binary = shutil.which("llama-server")
        if not binary:
            return None

    # Build the argument vector directly: splitting a formatted string on
    # whitespace would break any binary or model path that contains a space.
    cmd = [
        binary,
        "-m", str(gguf_path),
        "--port", str(PORT),
        "-ngl", "99",
        "-c", "8192",
        "-fa", "on",
        "-ctk", "q4_0",
        "-ctv", "q4_0",
        "--jinja",
        *shlex.split(extra_flags or ""),
    ]
    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError:
        # Binary exists but cannot be executed (permissions, wrong arch, ...).
        return None

    health_url = f"http://127.0.0.1:{PORT}/health"
    for _ in range(60):
        # If the server already exited (bad flag, port in use, model load
        # failure) there is nothing to wait for. This also stops a health
        # response from some other process on the port being taken as ours.
        if proc.poll() is not None:
            return None
        try:
            with urllib.request.urlopen(health_url, timeout=1):
                return proc
        except Exception:
            # Any failure here only means "not ready yet"; retry.
            time.sleep(1)

    proc.kill()
    proc.wait()  # reap the child so it does not linger as a zombie
    return None
119
+
120
+
121
def _run_test(test, max_tokens=512):
    """Run a single test prompt against the bench server and score the reply.

    Args:
        test: a TESTS entry; its "prompt", "check" and "max_score" keys are used.
        max_tokens: completion token limit sent with the request.

    Returns:
        dict with keys "score", "max_score", "tps" (completion tokens per
        second of total request time), "tokens", "time" (seconds) and
        "ttft" (seconds). When the request or scoring fails, the numeric
        fields are 0 and an "error" key holds the exception text.
    """
    payload = json.dumps({
        "model": "bench",
        "messages": [{"role": "user", "content": test["prompt"]}],
        "max_tokens": max_tokens,
        "temperature": 0.1,
    }).encode()

    try:
        # perf_counter is monotonic, so a system clock adjustment during
        # the request cannot distort the measured duration.
        t0 = time.perf_counter()
        req = urllib.request.Request(
            f"http://127.0.0.1:{PORT}/v1/chat/completions",
            data=payload,
            headers={"Content-Type": "application/json"},
        )
        with urllib.request.urlopen(req, timeout=120) as resp:
            data = json.loads(resp.read())
        elapsed = time.perf_counter() - t0

        # "content" may be present but null; normalise to an empty string
        # so the scoring helpers always receive text.
        content = data["choices"][0]["message"].get("content") or ""

        usage = data.get("usage") or {}
        tokens = usage.get("completion_tokens")
        if tokens is None:
            # Rough fallback when the server reports no usage: word count.
            tokens = len(content.split())
        tps = tokens / elapsed if elapsed > 0 else 0

        # The previous formula (elapsed - tokens / tps) cancels to ~0 because
        # tps is itself tokens / elapsed. Use the server-reported prompt
        # processing time when it is available; otherwise report 0 (unknown).
        timings = data.get("timings") or {}
        prompt_ms = timings.get("prompt_ms")
        ttft = round(prompt_ms / 1000, 2) if isinstance(prompt_ms, (int, float)) else 0

        score = test["check"](content)

        return {
            "score": score,
            "max_score": test["max_score"],
            "tps": round(tps, 1),
            "tokens": tokens,
            "time": round(elapsed, 1),
            "ttft": ttft,
        }
    except Exception as e:
        # Deliberate boundary: one failing test must not abort the whole run.
        return {
            "score": 0,
            "max_score": test["max_score"],
            "tps": 0,
            "tokens": 0,
            "time": 0,
            "ttft": 0,
            "error": str(e),
        }
156
+
157
+
158
def _download_model(repo, filename):
    """Download a GGUF file from Hugging Face.

    Args:
        repo: repository id passed to hf_hub_download as ``repo_id``.
        filename: name of the file inside the repository.

    Returns:
        The local path returned by hf_hub_download, or None after printing
        a message when huggingface_hub is not installed or the download fails.
    """
    try:
        from huggingface_hub import hf_hub_download
        return hf_hub_download(repo_id=repo, filename=filename)
    except ImportError:
        console.print(" [red]pip install huggingface_hub[/]")
    except Exception as e:
        # Escape the message: text such as "[Errno 28] ..." would otherwise
        # be interpreted as console markup and dropped from the output.
        console.print(f" [red]{escape(str(e))}[/]")
    return None
170
+
171
+
172
def bench_model(name, gguf_path, extra_flags="", skip_if_cached=True):
    """Benchmark a single model against every entry in TESTS.

    Starts a dedicated llama-server for the model, runs each test, prints
    per-test and total results, stops the server and stores the record in
    the results file under ``name``.

    Args:
        name: display name and results key for the model.
        gguf_path: path of the model file to load.
        extra_flags: extra llama-server flags, forwarded to _start_server.
        skip_if_cached: when true and ``name`` already has a saved record,
            return that record without running anything.

    Returns:
        The result record (dict) for the model, or None when the server
        could not be started.
    """
    results = _load_results()
    # Model names come from file names; escape them so brackets are not
    # interpreted as console markup.
    shown_name = escape(name)

    if skip_if_cached and name in results:
        console.print(f" [dim]{shown_name}: cached results (run --bench --force to re-test)[/]")
        return results[name]

    console.print(f"\n [bold cyan]{shown_name}[/]")
    console.print(" [dim]Loading model...[/]")

    proc = _start_server(gguf_path, extra_flags)
    if not proc:
        console.print(" [red]Failed to start server[/]")
        return None

    try:
        console.print(f" [green]Server ready[/] — running {len(TESTS)} tests")

        model_results = {"name": name, "path": gguf_path, "tests": {}, "timestamp": time.strftime("%Y-%m-%d %H:%M")}
        total_score = 0
        total_max = 0
        total_tps = []

        for test in TESTS:
            console.print(f" {test['name']:<22}", end="")
            r = _run_test(test)
            stars = r["score"]
            max_s = r["max_score"]
            total_score += stars
            total_max += max_s
            # Failed tests report tps 0 and are left out of the average.
            if r["tps"] > 0:
                total_tps.append(r["tps"])

            bar = "[green]" + "★" * stars + "[/][dim]" + "☆" * (max_s - stars) + "[/]"
            console.print(f" {bar} {r['tps']:>5.1f} tok/s {r['time']:>5.1f}s")

            model_results["tests"][test["id"]] = r

        avg_tps = sum(total_tps) / len(total_tps) if total_tps else 0
        model_results["total_score"] = total_score
        model_results["total_max"] = total_max
        model_results["avg_tps"] = round(avg_tps, 1)
        model_results["pct"] = round(total_score / total_max * 100) if total_max > 0 else 0

        console.print(f" {'─' * 50}")
        console.print(f" [bold]Total: {total_score}/{total_max} ({model_results['pct']}%) · {avg_tps:.1f} tok/s avg[/]")
    finally:
        # Always stop the server, including on Ctrl-C or an unexpected
        # error, so a stray process is not left running on the bench port.
        proc.kill()
        proc.wait()

    # Short pause after shutdown before the caller starts the next model.
    time.sleep(2)

    # Save
    results[name] = model_results
    _save_results(results)

    return model_results
228
+
229
+
230
def show_leaderboard():
    """Print a ranked table of all saved benchmark results.

    Models are ordered by total score percentage, then by average tokens
    per second, both descending. Each row shows per-category scores and a
    coloured progress bar. Prints a hint instead when nothing is saved.
    """
    import platform

    results = _load_results()
    # Drop malformed (non-dict) records so a hand-edited results file cannot
    # crash the rendering below.
    records = [r for r in results.values() if isinstance(r, dict)] if isinstance(results, dict) else []
    if not records:
        console.print("\n [dim]No benchmarks yet. Run: localcoder --bench[/]\n")
        return

    console.print()

    table = Table(
        title="[bold]localcoder Arena — Model Leaderboard[/]",
        title_style="bold #e07a5f",
        show_header=True, header_style="bold",
        border_style="dim", padding=(0, 1),
    )
    table.add_column("#", style="bold", width=3)
    table.add_column("Model", width=28)
    table.add_column("Score", justify="center", width=8)
    table.add_column("tok/s", justify="right", width=7)
    table.add_column("Code", justify="center", width=6)
    table.add_column("Reason", justify="center", width=6)
    table.add_column("Tools", justify="center", width=6)
    # Width 8 so the 8-character header fits on one line.
    table.add_column("Instruct", justify="center", width=8)
    table.add_column("", width=16)

    def category(tests, test_ids, default_max):
        """Format "score/max" summed over the given test ids."""
        score = sum(tests.get(t, {}).get("score", 0) for t in test_ids)
        out_of = sum(tests.get(t, {}).get("max_score", default_max) for t in test_ids)
        return f"{score}/{out_of}"

    # Sort by score, then speed
    ranked = sorted(records, key=lambda x: (x.get("pct", 0), x.get("avg_tps", 0)), reverse=True)

    medals = {1: "[bold yellow]🥇[/]", 2: "[white]🥈[/]", 3: "[#cd7f32]🥉[/]"}
    bar_w = 14

    for i, r in enumerate(ranked, 1):
        pct = r.get("pct", 0)
        tps = r.get("avg_tps", 0)
        tests = r.get("tests", {})

        # Visual bar; clamp so an out-of-range percentage cannot overflow
        # the fixed-width column.
        filled = max(0, min(bar_w, int(pct / 100 * bar_w)))
        bc = "green" if pct >= 80 else "yellow" if pct >= 50 else "red"
        bar = f"[{bc}]{'█' * filled}[/{bc}][dim]{'░' * (bar_w - filled)}[/]"

        table.add_row(
            medals.get(i, " "),
            escape(r.get("name", "?")),
            f"[bold]{pct}%[/]",
            f"{tps:.0f}",
            category(tests, ["code_function", "code_debug"], 3),
            category(tests, ["reasoning"], 4),
            category(tests, ["tool_json"], 4),
            category(tests, ["follow_instructions"], 3),
            bar,
        )

    console.print(table)

    # Report the machine this is running on instead of a fixed hardware
    # string, which was only correct on the original author's computer.
    host = escape(f"{platform.system()} {platform.machine()}".strip() or "unknown hardware")
    console.print(f"\n [dim]Tested on {ranked[0].get('timestamp', '?')} · {host} · llama.cpp[/]")
    console.print(f" [dim]Run [bold]localcoder --bench[/bold] to test more models · Results in ~/.localcoder/benchmarks.json[/]\n")
300
+
301
+
302
def run_full_bench(force=False, max_size_gb=16):
    """Benchmark every installed model, then show the leaderboard.

    Models are taken from the "models" list returned by
    localcoder.backends.get_disk_info; each entry is read for its "name",
    "size_gb" and "path" keys.

    Args:
        force: re-run models that already have saved results.
        max_size_gb: models larger than this many gigabytes are skipped as
            too big for the GPU. Defaults to the previous fixed limit of 16.
    """
    from localcoder.backends import get_disk_info

    console.print("\n [bold #e07a5f]localcoder Arena[/] — benchmarking all installed models\n")

    di = get_disk_info()
    models_to_test = []

    for m in di.get("models", []):
        name = m["name"].replace(".gguf", "")
        size = m["size_gb"]
        path = m["path"]
        lowered = name.lower()

        # Projector files are not standalone models; skip them the same way
        # _find_gguf does. NOTE(review): get_disk_info may already filter
        # these out — confirm.
        if "mmproj" in lowered:
            continue

        if size > max_size_gb:
            console.print(f" [dim]Skip {escape(name)} ({size}GB) — won't fit GPU[/]")
            continue

        # Determine extra flags based on model name
        extra = ""
        if "gemma" in lowered:
            extra = "--reasoning off --no-mmproj"
        elif "qwen" in lowered:
            extra = "--reasoning-budget 0"

        models_to_test.append((name, path, extra))

    console.print(f" Testing {len(models_to_test)} models...\n")

    for name, path, extra in models_to_test:
        bench_model(name, path, extra, skip_if_cached=not force)

    console.print()
    show_leaderboard()