localcoder 0.1.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
localcoder/backends.py ADDED
@@ -0,0 +1,2470 @@
1
+ """Backend discovery, installation, and model management."""
2
+ import json, os, shutil, subprocess, sys, time, urllib.request, urllib.parse
3
+ from pathlib import Path
4
+
5
+ from rich.console import Console
6
+ from rich.panel import Panel
7
+ from rich.table import Table
8
+ from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
9
+
10
# Module-wide Rich console; the print helpers in this module write through it.
console = Console()
11
+
12
# ── Platform detection ──
# Well-known directories, all rooted at the current user's home.
HOME = Path.home()
CONFIG_DIR = HOME / ".localcoder"
MODELS_DIR = HOME / "models"

# Operating-system flags derived from sys.platform.
IS_MAC = sys.platform == "darwin"
IS_LINUX = sys.platform == "linux"

# WSL reports itself as Linux; it is recognised by "microsoft" appearing in
# /proc/version. The file is only read when running on Linux and it exists.
IS_WSL = bool(
    IS_LINUX
    and Path("/proc/version").exists()
    and "microsoft" in Path("/proc/version").read_text().lower()
)
19
+
20
def _find_binary(name, extra_paths=None):
    """Locate an executable by name.

    PATH is consulted first; if the name is not found there, the first entry
    of ``extra_paths`` that exists on disk is used.

    Args:
        name: Executable name to look up.
        extra_paths: Optional iterable of fallback locations (str or Path).

    Returns:
        A ``Path`` to the executable. When nothing is found the bare ``name``
        is returned as a ``Path``, so a later existence check simply fails.
    """
    on_path = shutil.which(name)
    if on_path:
        return Path(on_path)

    candidates = (Path(entry) for entry in (extra_paths or []))
    existing = next((cand for cand in candidates if cand.exists()), None)
    if existing is not None:
        return existing

    # Nothing found: hand back the bare name and let the caller's check fail.
    return Path(name)
29
+
30
# ── Known backends ──
# Registry of supported inference backends, keyed by backend id.
# Each entry holds:
#   name          display name
#   default_port  local port the backend's HTTP server is probed on
#   binary        Path resolved once, at import time, via _find_binary()
#   install_cmd   shell command string for installing the backend
BACKENDS = {
    "llamacpp": {
        "name": "llama.cpp",
        "default_port": 8089,
        "binary": _find_binary("llama-server", [
            HOME / ".unsloth/llama.cpp/llama-server",
            Path("/usr/local/bin/llama-server"),
        ]),
        "install_cmd": "curl -fsSL https://unsloth.ai/install.sh | sh",
    },
    "ollama": {
        "name": "Ollama",
        "default_port": 11434,
        "binary": _find_binary("ollama", [
            Path("/opt/homebrew/bin/ollama"),
            Path("/usr/local/bin/ollama"),
            HOME / ".local/bin/ollama",
        ]),
        # Install script on Linux; Homebrew on every other platform.
        "install_cmd": "curl -fsSL https://ollama.com/install.sh | sh" if IS_LINUX else "brew install ollama",
    },
}
52
+
53
# ── Known models ──
# Catalog of known models, keyed by model id.
# Each entry holds:
#   name          display name (llama.cpp entries append the quant, e.g. "Q3_K_XL")
#   hf_repo       Hugging Face repo id, or None for Ollama-only models
#   hf_pattern    glob selecting the GGUF file(s) in hf_repo (only set when hf_repo is)
#   size_gb       model size — presumably GB on disk; confirm against the downloader
#   ram_required  minimum RAM — presumably GB, compared with get_system_ram_gb(); confirm
#   description   one-line blurb
#   ollama_tag    Ollama model tag, or None when not available through Ollama
#   backend       key into BACKENDS
#   server_flags  llama-server command-line flags (llama.cpp entries only)
MODELS = {
    "gemma4-26b": {
        "name": "Gemma 4 26B Q3_K_XL",
        "hf_repo": "unsloth/gemma-4-26B-A4B-it-GGUF",
        "hf_pattern": "*UD-Q3_K_XL*",
        "size_gb": 12,
        "ram_required": 16,
        "description": "Best quality on 24GB Mac. MoE, 49 tok/s, perfect tool calling.",
        "ollama_tag": "gemma4:26b",
        "backend": "llamacpp",
        "server_flags": "-ngl 99 -c 131072 -np 1 -fa on -ctk q4_0 -ctv q4_0 --no-warmup --jinja",
    },
    "qwen35b-a3b": {
        "name": "Qwen 3.5 35B-A3B Q2_K_XL",
        "hf_repo": "unsloth/Qwen3.5-35B-A3B-GGUF",
        "hf_pattern": "*UD-Q2_K_XL*",
        "size_gb": 11.3,
        "ram_required": 16,
        "description": "MoE coding beast. 49 tok/s, 256 experts, tool calling, vision.",
        "ollama_tag": None,
        "backend": "llamacpp",
        "server_flags": "-ngl 99 -c 32768 -np 1 -fa on -ctk q4_0 -ctv q4_0 --no-warmup --jinja --reasoning-budget 0",
    },
    "qwen35-4b": {
        "name": "Qwen 3.5 4B",
        "hf_repo": "unsloth/Qwen3.5-4B-GGUF",
        "hf_pattern": "*UD-Q4_K_XL*",
        "size_gb": 2.7,
        "ram_required": 8,
        "description": "Ultrafast at 50 tok/s. Great for quick tasks, only 2.7GB GPU.",
        "ollama_tag": None,
        "backend": "llamacpp",
        "server_flags": "-ngl 99 -c 32768 --jinja --reasoning-budget 0",
    },
    "gemma4-e4b": {
        "name": "Gemma 4 E4B",
        "hf_repo": None,
        "size_gb": 5.5,
        "ram_required": 8,
        "description": "Sweet spot for 16GB. Audio + image + code, 57 tok/s.",
        "ollama_tag": "gemma4:e4b",
        "backend": "ollama",
    },
    "gemma4-e2b": {
        "name": "Gemma 4 E2B",
        "hf_repo": None,
        "size_gb": 4,
        "ram_required": 8,
        "description": "Speed demon. 95 tok/s, basic tasks.",
        "ollama_tag": "gemma4:e2b",
        "backend": "ollama",
    },
    "qwen3.5-27b": {
        "name": "Qwen 3.5 27B",
        "hf_repo": None,
        "size_gb": 17,
        "ram_required": 24,
        "description": "Strong alternative. Dense 27B, good tool calling.",
        "ollama_tag": "qwen3.5:27b",
        "backend": "ollama",
    },
}
116
+
117
+
118
def _parse_footprint_mb(pid):
    """Return a process's memory footprint in MB via macOS ``/usr/bin/footprint``.

    Args:
        pid: Process id (anything ``str()`` can render).

    Returns:
        Footprint in whole MB taken from the first "Footprint:" line of the
        tool's output. Returns 0 on non-macOS hosts, when no such line is
        printed, or when running/parsing the tool fails.
    """
    if not IS_MAC:
        return 0

    try:
        result = subprocess.run(
            ["/usr/bin/footprint", str(pid)],
            capture_output=True, text=True, timeout=5,
        )
        for row in result.stdout.splitlines():
            if "Footprint:" not in row:
                continue
            tokens = row.split("Footprint:")[1].strip().split()
            amount = float(tokens[0])
            # A value with no unit token is treated as KB.
            unit = tokens[1] if len(tokens) > 1 else "KB"
            if "GB" in unit:
                return int(amount * 1024)
            if "MB" in unit:
                return int(amount)
            if "KB" in unit:
                # Never report a live process as 0 MB.
                return max(1, int(amount / 1024))
            return int(amount)
    except Exception:
        # Best-effort probe: any failure falls through to 0.
        pass
    return 0
142
+
143
+
144
def get_system_ram_gb():
    """Return total system RAM in whole GB (macOS, Linux, WSL).

    macOS reads ``sysctl hw.memsize`` (bytes); other platforms read the
    ``MemTotal`` line of ``/proc/meminfo`` (kB). Both are floor-divided to GB.

    Returns:
        Total RAM in GB as an int, or 0 when it cannot be determined.
    """
    try:
        if IS_MAC:
            out = subprocess.run(
                ["sysctl", "-n", "hw.memsize"],
                capture_output=True, text=True, timeout=3,
            )
            return int(out.stdout.strip()) // (1024**3)

        # Linux / WSL
        with open("/proc/meminfo") as f:
            for line in f:
                if line.startswith("MemTotal:"):
                    return int(line.split()[1]) // (1024 * 1024)
    except (OSError, ValueError, IndexError, subprocess.SubprocessError):
        # Missing tool/file, timeout, or unparsable output: report "unknown".
        # Deliberately not a bare except, so Ctrl-C and SystemExit propagate.
        pass
    return 0
159
+
160
+
161
def get_machine_specs():
    """Collect chip, core-count, RAM and GPU-memory information for this machine.

    Every probe is best-effort: a probe that fails leaves its field at the
    default below instead of raising.

    Returns:
        dict with keys ``chip``, ``cpu_cores``, ``gpu_cores``, ``ram_gb``,
        ``gpu_total_mb``, ``gpu_used_mb``, ``gpu_free_mb``, ``gpu_processes``
        (list of ``{"name", "pid", "rss_mb"}`` dicts) and ``mem_pressure``
        (``"normal"``, ``"warn"``, ``"critical"`` or ``"unknown"``).
    """
    specs = {
        "chip": "Unknown",
        "cpu_cores": 0,
        "gpu_cores": 0,
        "ram_gb": get_system_ram_gb(),
        "gpu_total_mb": 0,
        "gpu_used_mb": 0,
        "gpu_free_mb": 0,
        "gpu_processes": [],  # list of {name, pid, rss_mb}
        "mem_pressure": "unknown",
    }

    if IS_MAC:
        # Chip name
        try:
            out = subprocess.run(
                ["sysctl", "-n", "machdep.cpu.brand_string"],
                capture_output=True, text=True, timeout=3,
            )
            specs["chip"] = out.stdout.strip()
            if not specs["chip"] or "Apple" not in specs["chip"]:
                # Fallback for Apple Silicon
                out2 = subprocess.run(
                    ["system_profiler", "SPHardwareDataType"],
                    capture_output=True, text=True, timeout=5,
                )
                for line in out2.stdout.splitlines():
                    if "Chip" in line and ":" in line:
                        specs["chip"] = line.split(":", 1)[1].strip()
                        break
        except Exception:
            pass

        # CPU / GPU core counts
        try:
            out = subprocess.run(
                ["sysctl", "-n", "hw.ncpu"], capture_output=True, text=True, timeout=3,
            )
            specs["cpu_cores"] = int(out.stdout.strip())
        except Exception:
            pass
        try:
            out = subprocess.run(
                ["system_profiler", "SPDisplaysDataType"],
                capture_output=True, text=True, timeout=5,
            )
            for line in out.stdout.splitlines():
                if "Total Number of Cores" in line:
                    specs["gpu_cores"] = int(line.split(":")[-1].strip())
                    break
        except Exception:
            pass

        # Metal GPU budget — use real ioreg value, then check sysctl override
        # 1. Try ioreg for real Metal VRAM,totalMB
        try:
            import re as _re_ioreg
            _ioreg_out = subprocess.run(["ioreg", "-l"], capture_output=True, text=True, timeout=10)
            for _line in _ioreg_out.stdout.splitlines():
                if "VRAM,totalMB" in _line:
                    _m = _re_ioreg.search(r'"VRAM,totalMB"=(\d+)', _line)
                    if _m:
                        specs["gpu_total_mb"] = int(_m.group(1))
                        break
        except Exception:
            pass

        # 2. Check if user overrode with iogpu.wired_limit_mb
        if specs["gpu_total_mb"] == 0:
            try:
                out = subprocess.run(
                    ["sysctl", "-n", "iogpu.wired_limit_mb"],
                    capture_output=True, text=True, timeout=3,
                )
                custom_limit = int(out.stdout.strip())
                if custom_limit > 0:
                    specs["gpu_total_mb"] = custom_limit
            except Exception:
                pass

        # 3. Fallback to estimate
        if specs["gpu_total_mb"] == 0:
            specs["gpu_total_mb"] = int(specs["ram_gb"] * 1024 * 0.67)

        # Find GPU-heavy processes (llama-server, ollama, any ML inference)
        gpu_proc_names = ["llama-server", "ollama", "ollama_llama_server",
                          "mlx_lm", "whisper"]
        try:
            out = subprocess.run(
                ["ps", "axo", "pid,comm"],
                capture_output=True, text=True, timeout=3,
            )
            for line in out.stdout.splitlines()[1:]:  # [1:] skips the ps header row
                parts = line.split()
                if len(parts) < 2:
                    continue
                pid, comm = parts[0], parts[1]
                name = os.path.basename(comm)
                if not any(gp in name for gp in gpu_proc_names):
                    continue
                mem_mb = _parse_footprint_mb(pid)
                if mem_mb < 10:
                    # Fallback to RSS
                    try:
                        rss = subprocess.run(
                            ["ps", "-o", "rss=", "-p", pid],
                            capture_output=True, text=True, timeout=3,
                        )
                        if rss.stdout.strip():
                            mem_mb = int(rss.stdout.strip()) // 1024
                    except Exception:
                        pass

                # Ignore small helper processes.
                if mem_mb > 100:
                    specs["gpu_processes"].append({
                        "name": name, "pid": int(pid), "rss_mb": mem_mb,
                    })
        except Exception:
            pass

        specs["gpu_used_mb"] = sum(p["rss_mb"] for p in specs["gpu_processes"])
        specs["gpu_free_mb"] = max(0, specs["gpu_total_mb"] - specs["gpu_used_mb"])

        # Memory pressure
        try:
            out = subprocess.run(
                ["sysctl", "-n", "kern.memorystatus_vm_pressure_level"],
                capture_output=True, text=True, timeout=3,
            )
            level = int(out.stdout.strip())
            specs["mem_pressure"] = {0: "normal", 1: "warn", 2: "critical", 4: "critical"}.get(level, "unknown")
        except Exception:
            pass

    elif IS_LINUX:
        # Linux / WSL
        try:
            with open("/proc/cpuinfo") as f:
                specs["cpu_cores"] = sum(1 for line in f if line.startswith("processor"))
            with open("/proc/meminfo") as f:
                for line in f:
                    if line.startswith("MemAvailable:"):
                        avail_kb = int(line.split()[1])
                        specs["gpu_free_mb"] = avail_kb // 1024
        except Exception:
            pass

        # Check for NVIDIA GPU
        try:
            out = subprocess.run(
                ["nvidia-smi", "--query-gpu=name,memory.total,memory.used,memory.free",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5,
            )
            rows = out.stdout.strip().splitlines()
            if out.returncode == 0 and rows:
                # nvidia-smi prints one CSV row per GPU; report the first GPU.
                # rsplit keeps a GPU name that itself contains a comma intact.
                name, total, used, free = (
                    field.strip() for field in rows[0].rsplit(",", 3)
                )
                # Parse everything before assigning so a bad field cannot
                # leave specs half-updated.
                total_mb, used_mb, free_mb = int(total), int(used), int(free)
                specs["chip"] = name
                specs["gpu_total_mb"] = total_mb
                specs["gpu_used_mb"] = used_mb
                specs["gpu_free_mb"] = free_mb
        except FileNotFoundError:
            # No nvidia-smi: treat system RAM as the memory budget.
            specs["gpu_total_mb"] = specs["ram_gb"] * 1024
            specs["gpu_free_mb"] = specs["gpu_total_mb"]
        except (OSError, ValueError, subprocess.SubprocessError):
            # Timeout, unparsable fields such as "[N/A]", or exec failure:
            # keep whatever was gathered so far instead of crashing.
            pass

    return specs
328
+
329
+
330
def cleanup_gpu_memory(force=False):
    """Free GPU memory by unloading idle models and killing stale processes.

    Args:
        force: When true, also send ``kill`` to every process whose command
            line matches "llama-server" (the calling process is never killed).

    Returns:
        dict with ``ollama_unloaded`` (model ids successfully unloaded),
        ``processes_killed`` (list of ``{"pid", "freed_mb"}`` for processes
        whose ``kill`` succeeded) and ``freed_mb`` (sum of their RSS in MB).
    """
    cleaned = {"ollama_unloaded": [], "processes_killed": [], "freed_mb": 0}

    # 1. Unload Ollama models (set keep_alive=0)
    if check_backend_running("ollama"):
        port = BACKENDS["ollama"]["default_port"]
        try:
            models = get_running_models("ollama") or []
        except Exception:
            models = []
        for m in models:
            req = urllib.request.Request(
                f"http://127.0.0.1:{port}/api/generate",
                data=json.dumps({"model": m, "keep_alive": 0}).encode(),
                headers={"Content-Type": "application/json"},
            )
            try:
                # The context manager closes the HTTP response.
                with urllib.request.urlopen(req, timeout=5):
                    pass
            except Exception:
                # One model failing to unload must not skip the rest.
                continue
            cleaned["ollama_unloaded"].append(m)

    # 2. Kill stale llama-server processes (only when forced)
    if force:
        own_pid = str(os.getpid())
        try:
            out = subprocess.run(
                ["pgrep", "-f", "llama-server"],
                capture_output=True, text=True, timeout=5,
            )
            pids = [p.strip() for p in out.stdout.splitlines() if p.strip()]
        except Exception:
            pids = []

        for pid in pids:
            # pgrep -f matches full command lines, which can include us.
            if pid == own_pid:
                continue
            try:
                rss = subprocess.run(
                    ["ps", "-o", "rss=", "-p", pid],
                    capture_output=True, text=True, timeout=3,
                )
                mb = int(rss.stdout.strip()) // 1024 if rss.stdout.strip() else 0
                killed = subprocess.run(["kill", pid], timeout=3)
                pid_num = int(pid)
            except Exception:
                continue
            # Only report memory as freed when the signal was delivered.
            if killed.returncode != 0:
                continue
            cleaned["processes_killed"].append({"pid": pid_num, "freed_mb": mb})
            cleaned["freed_mb"] += mb

    # Give time for memory to be released
    if cleaned["ollama_unloaded"] or cleaned["processes_killed"]:
        time.sleep(2)

    return cleaned
378
+
379
+
380
def get_top_memory_processes(min_mb=80, limit=12):
    """Return the top memory-consuming processes, grouped by name.

    Memory is the RSS reported by ``ps``. Each process is categorized as:
    - 'ml': ML inference servers (llama-server, ollama)
    - 'system': System processes (WindowServer, kernel_task)
    - 'bloat': restartable system daemons listed in SYSTEM_RESTARTABLE
    - 'app': everything else (Chrome, Slack, etc.)

    Args:
        min_mb: Processes below this RSS (MB) are dropped.
        limit: Maximum number of grouped entries returned.

    Returns:
        List of dicts sorted by ``mb`` descending, each with ``pid``, ``name``,
        ``mb``, ``category``, ``killable``, ``count`` and ``pids``. All Chrome
        processes are merged into a single "Chrome" entry. Returns an empty
        list when ``ps`` cannot be run.
    """
    SYSTEM_PROCS = {
        "WindowServer", "WindowManager", "kernel_task", "launchd",
        "mds", "mds_stores", "opendirectoryd", "fseventsd",
        "corebrightnessd", "bluetoothd", "nearbyd", "systemstats",
        "loginwindow", "Dock", "Finder", "SystemUIServer",
        "ControlCenter", "NotificationCenter", "Terminal", "iTerm2",
        "zsh", "bash", "sh",
    }
    # System procs safe to kill (macOS auto-restarts them lean, freeing bloated memory)
    # Maps name → description for the debloat wizard
    SYSTEM_RESTARTABLE = {
        "CoreLocationAgent": "Location services cache — often leaks to 8GB+",
        "CacheDeleteExtension": "Storage cleanup daemon — bloats during disk scans",
        "remindd": "Reminders sync daemon — known memory leak on macOS 15",
        "suggestd": "Siri suggestions indexer — heavy background ML",
        "photoanalysisd": "Photos face/scene ML analysis — runs after imports",
        "mediaanalysisd": "Media ML classifier — visual lookup, Live Text",
        "nsurlsessiond": "Background network downloads — iCloud sync cache",
        "cloudd": "iCloud Drive sync daemon — bloats with many files",
        "bird": "CloudKit/iCloud container daemon",
        "callservicesd": "FaceTime/phone call routing daemon",
        "SafariLaunchAgent": "Safari preload — keeps old pages in memory",
        "SoftwareUpdateNotificationManager": "macOS update checker — safe to kill",
        "com.apple.WebKit.Networking": "WebKit network process — cache bloat",
    }
    ML_PROCS = {
        "llama-server", "ollama", "ollama_llama_server",
        "mlx_lm", "whisper", "vllm", "tgi",
    }

    procs = []
    try:
        out = subprocess.run(
            ["ps", "-eo", "pid=,rss=,comm="], capture_output=True, text=True, timeout=5,
        )
        # Loose RSS pre-filter so only plausible candidates are kept
        candidates = []
        for line in out.stdout.strip().splitlines():
            parts = line.split()
            if len(parts) < 3:
                continue
            # comm may contain spaces, so re-join everything after pid and rss
            pid, rss_kb, comm = parts[0], parts[1], " ".join(parts[2:])
            try:
                rss_mb = int(rss_kb) // 1024
            except ValueError:
                continue
            if rss_mb < min_mb // 4:
                continue
            candidates.append((pid, rss_mb, comm))

        # Sort by RSS descending and keep 3x limit so grouping has headroom
        candidates.sort(key=lambda x: x[1], reverse=True)
        candidates = candidates[:limit * 3]

        for pid, rss_mb, comm in candidates:
            # basename of the WHOLE comm: splitting on whitespace first would
            # cut ".../Visual Studio Code.app/.../Electron" down to "Visual".
            name = os.path.basename(comm) or "?"

            # Use RSS directly (fast) — footprint is 0.3s per process
            fp_mb = rss_mb

            if fp_mb < min_mb:
                continue

            # Categorize (substring match also covers exact-name matches)
            if any(ml in name for ml in ML_PROCS):
                category = "ml"
            elif name in SYSTEM_PROCS:
                category = "system"
            elif any(sr in name for sr in SYSTEM_RESTARTABLE):
                category = "bloat"
            else:
                category = "app"

            procs.append({
                "pid": int(pid),
                "name": name,
                "mb": fp_mb,
                "category": category,
                "killable": category not in ("system",),
            })
    except Exception:
        # Best-effort: a failing ps yields whatever was collected so far.
        pass

    # Normalize names for grouping
    def _group_name(name):
        # Group all Chrome helpers under "Chrome"
        if "Google" in name or "Chrome" in name:
            return "Chrome"
        return name

    grouped = {}
    for p in procs:
        key = _group_name(p["name"])
        if key in grouped:
            grouped[key]["mb"] += p["mb"]
            grouped[key]["count"] += 1
            grouped[key]["pids"].append(p["pid"])
        else:
            # The first process of a group supplies pid/category/killable.
            grouped[key] = {**p, "name": key, "count": 1, "pids": [p["pid"]]}

    result = sorted(grouped.values(), key=lambda x: x["mb"], reverse=True)
    return result[:limit]
490
+
491
+
492
def print_machine_specs(specs=None):
    """Render a compact "Machine Specs" panel on the module console.

    Args:
        specs: Dict as returned by ``get_machine_specs()``; gathered on the
            spot when omitted.
    """
    if specs is None:
        specs = get_machine_specs()

    ram = specs["ram_gb"]
    gpu_total = specs["gpu_total_mb"]
    gpu_used = specs["gpu_used_mb"]
    gpu_free = specs["gpu_free_mb"]

    # Free GPU memory: green above 14000 MB, yellow above 8000 MB, red otherwise.
    free_color = "red"
    if gpu_free > 14000:
        free_color = "green"
    elif gpu_free > 8000:
        free_color = "yellow"

    pressure = specs["mem_pressure"]
    pressure_palette = {"normal": "green", "warn": "yellow", "critical": "red"}
    pressure_color = pressure_palette.get(pressure, "dim")

    # Line 1: chip and core counts (GPU cores only when known).
    chip_line = f" [bold]{specs['chip']}[/] · {specs['cpu_cores']} CPU"
    if specs['gpu_cores']:
        chip_line += f" · {specs['gpu_cores']} GPU cores"

    # Line 2: RAM and GPU budget (pressure only when known).
    ram_line = f" RAM: [bold]{ram}GB[/] total · Metal GPU budget: [bold]{gpu_total // 1024}GB[/]"
    if pressure != "unknown":
        ram_line += f" · pressure: [{pressure_color}]{pressure}[/{pressure_color}]"

    # Line 3: free / used / total GPU memory.
    vram_line = (
        f" GPU VRAM: [{free_color}]{gpu_free // 1024}GB free[/{free_color}]"
        f" · {gpu_used // 1024}GB used · {gpu_total // 1024}GB total"
    )

    lines = [chip_line, ram_line, vram_line]

    if specs["gpu_processes"]:
        rendered = [
            f"[cyan]{p['name']}[/] ({p['rss_mb']//1024}GB)"
            for p in specs["gpu_processes"]
        ]
        lines.append(" GPU processes: " + ", ".join(rendered))

    panel = Panel(
        "\n".join(lines),
        title="[bold]Machine Specs[/]",
        border_style="dim",
        padding=(0, 1),
    )
    console.print(panel)
536
+
537
+
538
def _detect_model_info(server_config, model_id=None):
    """Work out a model's display name, quantization level and size.

    The MODELS catalog entry for ``model_id`` (if any) is applied first; then
    anything that can be parsed from ``server_config["model_path"]`` overrides
    it, including the real file size when the path exists.

    Args:
        server_config: Mapping that may carry a ``model_path`` entry.
        model_id: Optional key into MODELS.

    Returns:
        dict with ``name``, ``quant`` and ``size_gb``; each is None when it
        could not be determined.
    """
    detected = {"name": None, "quant": None, "size_gb": None}

    # Catalog lookup.
    if model_id and model_id in MODELS:
        entry = MODELS[model_id]
        label = entry["name"]
        # Drop a trailing " Q..." quant suffix from the display name.
        detected["name"] = label.partition(" Q")[0]
        detected["size_gb"] = entry["size_gb"]
        # The quant is the first word that looks like "Q<something>_<something>".
        detected["quant"] = next(
            (word for word in label.split() if word.startswith("Q") and "_" in word),
            None,
        )

    model_path = server_config.get("model_path", "") or ""
    if not model_path:
        return detected

    import re
    filename = os.path.basename(model_path)

    # Quant from the filename (e.g., Q3_K_XL, Q4_K_M, Q8_0).
    quant_hit = re.search(
        r'(Q\d+_K(?:_[A-Z]+)?|Q\d+_\d+|IQ\d+_[A-Z]+)', filename, re.IGNORECASE
    )
    if quant_hit:
        detected["quant"] = quant_hit.group(1).upper()

    # Model family + size tag from the filename; first matching family wins.
    families = (
        (r'gemma[-_]?4[-_]?(\d+[bB])', 'Gemma 4'),
        (r'qwen[-_]?3\.?5[-_]?(\d+[bB])', 'Qwen 3.5'),
        (r'llama[-_]?3[-_.]?(\d+[bB])', 'Llama 3'),
        (r'mistral[-_]?(\d+[bB])', 'Mistral'),
        (r'phi[-_]?(\d+)', 'Phi'),
    )
    for regex, family in families:
        hit = re.search(regex, filename, re.IGNORECASE)
        if hit:
            detected["name"] = f"{family} {hit.group(1).upper()}"
            break

    # Real on-disk size beats the catalog figure.
    if os.path.exists(model_path):
        try:
            detected["size_gb"] = round(os.path.getsize(model_path) / (1024**3), 1)
        except OSError:
            pass

    return detected
587
+
588
+
589
def _build_dashboard_layout(model_id=None):
    """Build every piece of the health dashboard as Rich renderables.

    Gathers machine specs, GPU diagnostics, top processes and swap usage, then
    assembles the individual widgets. Nothing is printed here.

    Args:
        model_id: Optional key into MODELS, forwarded to the diagnostics and
            to model-info detection.

    Returns:
        Tuple ``(status_bar, header, model_line, cards, vram_bar, vram_legend,
        table, fixes_panel, glossary, diag)``. ``model_line`` and
        ``fixes_panel`` are None when there is nothing to show; ``diag`` is the
        dict returned by ``diagnose_gpu_health``.
    """
    from rich.columns import Columns
    from rich.text import Text
    from rich.rule import Rule

    specs = get_machine_specs()
    diag = diagnose_gpu_health(model_id)
    top_procs = get_top_memory_processes(min_mb=80, limit=8)
    swap_mb = get_swap_usage_mb()

    # ── Status Bar (full-width colored line) ──
    status_map = {
        "healthy": ("green", "HEALTHY"),
        "degraded": ("yellow", "DEGRADED"),
        "critical": ("red", "CRITICAL"),
        "unknown": ("dim", "UNKNOWN"),
    }
    sc, sl = status_map.get(diag["status"], ("dim", "?"))
    status_bar = Rule(title=f"[bold {sc}] {sl} [/bold {sc}]", style=sc)

    # ── Header ──
    header = Text()
    header.append(f" {specs['chip']} · {specs['ram_gb']}GB RAM · ", style="bold")
    header.append(f"{specs.get('gpu_cores', '?')} GPU cores", style="bold")

    # ── Model Info Line ──
    model_info_obj = _detect_model_info(diag["server_config"], model_id)
    model_line = None
    if model_info_obj["name"]:
        parts = []
        parts.append(f"[bold cyan]{model_info_obj['name']}[/bold cyan]")
        if model_info_obj["quant"]:
            parts.append(f"[yellow]{model_info_obj['quant']}[/yellow]")
        if model_info_obj["size_gb"]:
            parts.append(f"[dim]{model_info_obj['size_gb']}GB[/dim]")
        model_line = Text.from_markup(" " + " · ".join(parts))

    # ── Status Cards (equal height, horizontal row) ──
    CARD_HEIGHT = 6  # content lines per card (excluding border)

    gpu_on = diag["on_gpu"]
    compute_lines = []
    if diag["server_config"].get("running"):
        icon = "[green]●[/]" if gpu_on else "[red]●[/]"
        compute_lines.append(f"{icon} {'GPU (Metal)' if gpu_on else 'CPU — SLOW!'}")
        compute_lines.append(f" Layers: {diag['gpu_layers']}/99")
        compute_lines.append(f" Util: {diag['gpu_util_pct']}%")
        compute_lines.append(f" Model: {diag['server_config'].get('footprint_mb', 0)} MB")
        if not gpu_on:
            compute_lines.append("[dim]GPU = 20x faster[/]")
            compute_lines.append("[dim]Use -ngl 99[/]")
    else:
        compute_lines.append("[dim]Server not running[/]")

    kv_lines = []
    kv_ok = diag["kv_quantized"]
    kv_icon = "[green]●[/]" if kv_ok else "[red]●[/]"
    kv_lines.append(f"{kv_icon} {'Quantized' if kv_ok else 'Full (2x mem!)'}")
    if diag["kv_type"]:
        kv_lines.append(f" Type: {diag['kv_type']}")
    kv_lines.append(f" Size: ~{diag['kv_cache_est_mb']} MB")
    kv_lines.append(f" Ctx: {diag['context_size'] // 1024}K")
    fa_icon = "[green]●[/]" if diag["flash_attn"] else "[yellow]●[/]"
    kv_lines.append(f"{fa_icon} FlashAttn: {'on' if diag['flash_attn'] else 'off'}")

    pressure_color = {"normal": "green", "warn": "yellow", "critical": "red"}.get(diag["mem_pressure"], "dim")
    swap_color = "red" if diag["swap_thrashing"] else "green"
    gpu_headroom = diag["gpu_total_mb"] - diag["gpu_alloc_mb"]
    hr_color = "green" if gpu_headroom > 2048 else "yellow" if gpu_headroom > 0 else "red"
    mem_lines = [
        f" Pressure: [{pressure_color}]{diag['mem_pressure']}[/{pressure_color}]",
        f" Swap: [{swap_color}]{swap_mb // 1024}GB[/{swap_color}]",
        f" GPU: {diag['gpu_alloc_mb'] // 1024}/{diag['gpu_total_mb'] // 1024}GB",
        f" Free: [{hr_color}]{gpu_headroom // 1024}GB[/{hr_color}]",
    ]
    if diag["swap_thrashing"]:
        mem_lines.append("[dim]Swap = 100x slower[/]")

    # Pad all cards to the same height
    for card_lines in (compute_lines, kv_lines, mem_lines):
        while len(card_lines) < CARD_HEIGHT:
            card_lines.append("")

    cards = Columns([
        Panel("\n".join(compute_lines), title="[bold]Compute[/]", border_style="cyan", width=26, padding=(0, 1)),
        Panel("\n".join(kv_lines), title="[bold]KV Cache[/]", border_style="cyan", width=26, padding=(0, 1)),
        Panel("\n".join(mem_lines), title="[bold]Memory[/]", border_style="cyan", width=26, padding=(0, 1)),
    ], padding=1)

    # ── VRAM Usage Bar ──
    # If diagnostics report no GPU budget, assume 75% of system RAM.
    gpu_budget_mb = diag["gpu_total_mb"] if diag["gpu_total_mb"] > 0 else (specs["ram_gb"] * 1024 * 75 // 100)
    model_mb = diag.get("model_size_mb", 0) or (diag["server_config"].get("footprint_mb", 0))
    kv_mb = diag["kv_cache_est_mb"]
    apps_mb = max(0, diag["gpu_alloc_mb"] - model_mb - kv_mb)
    free_mb = max(0, gpu_budget_mb - model_mb - kv_mb - apps_mb)

    BAR_WIDTH = 50
    total_for_bar = max(1, gpu_budget_mb)  # guard against division by zero
    seg_model = max(0, int(BAR_WIDTH * model_mb / total_for_bar))
    seg_kv = max(0, int(BAR_WIDTH * kv_mb / total_for_bar))
    seg_apps = max(0, int(BAR_WIDTH * apps_mb / total_for_bar))
    seg_free = max(0, BAR_WIDTH - seg_model - seg_kv - seg_apps)

    vram_bar = Text()
    vram_bar.append(" VRAM ", style="bold")
    vram_bar.append("\u2588" * seg_model, style="cyan")
    vram_bar.append("\u2588" * seg_kv, style="magenta")
    vram_bar.append("\u2588" * seg_apps, style="yellow")
    vram_bar.append("\u2591" * seg_free, style="dim")
    vram_bar.append(f" {gpu_budget_mb // 1024}GB", style="dim")

    vram_legend = Text.from_markup(
        " [cyan]\u2588[/] Model"
        f" ({model_mb // 1024}G)"
        " [magenta]\u2588[/] KV Cache"
        f" ({kv_mb // 1024}G)"
        " [yellow]\u2588[/] Apps"
        f" ({apps_mb // 1024}G)"
        " [dim]\u2591[/] Free"
        f" ({free_mb // 1024}G)"
    )

    # ── Process Table ──
    table = Table(
        show_header=True, header_style="bold",
        border_style="dim", padding=(0, 1), expand=False, width=82,
    )
    table.add_column("#", style="dim", width=3)
    table.add_column("Process", min_width=18)
    table.add_column("Memory", justify="right", width=8)
    table.add_column("Type", width=6)
    table.add_column("", min_width=14)

    # Loop invariants. The block glyph lives in a variable because a backslash
    # escape inside an f-string replacement field is a SyntaxError before
    # Python 3.12.
    cat_style = {"ml": "[cyan]ML[/]", "app": "[yellow]app[/]", "system": "[dim]sys[/]", "bloat": "[red]bloat[/]"}
    full_block = "\u2588"

    total_reclaimable = 0
    for i, p in enumerate(top_procs, 1):
        mb = p["mb"]
        name = p["name"]
        count = p.get("count", 1)
        label = f"{name}" + (f" \u00d7{count}" if count > 1 else "")

        cat = cat_style.get(p["category"], "[dim]?[/]")

        # One bar cell per 300 MB, clamped to 1..14 cells.
        bar_width = min(14, max(1, mb // 300))
        bar_color = "red" if mb > 2000 else "yellow" if mb > 500 else "green"
        bar = f"[{bar_color}]{full_block * bar_width}[/{bar_color}]"

        size_str = f"{mb / 1024:.1f}G" if mb >= 1024 else f"{mb}M"
        table.add_row(str(i), label, size_str, cat, bar)

        if p["category"] in ("app", "bloat") and p["killable"]:
            total_reclaimable += mb

    # ── Fixes ──
    fix_lines = []
    if diag["issues"]:
        for issue in diag["issues"]:
            fix_lines.append(f" [red]\u25cf[/] {issue}")
        fix_lines.append("")

    # Bloat fixes
    for p in top_procs:
        if p["category"] == "bloat" and p["mb"] > 500:
            freed = p["mb"] // 1024
            fix_lines.append(f" [green]\u2192[/] Kill {p['name']} [dim](~{freed}GB — auto-restarts lean)[/]")
    # App fixes
    for p in top_procs:
        if p["category"] == "app" and p["mb"] > 500:
            count = p.get("count", 1)
            freed = p["mb"] // 1024
            name = p["name"]
            if name == "Chrome":
                fix_lines.append(f" [green]\u2192[/] Close Chrome tabs [dim]({count} procs = ~{freed}GB)[/]")
            elif "claude" in name.lower():
                fix_lines.append(f" [green]\u2192[/] Close Claude windows [dim]({count} = ~{freed}GB)[/]")
            elif freed >= 1:
                fix_lines.append(f" [green]\u2192[/] Quit {name} [dim](~{freed}GB)[/]")

    if total_reclaimable > 2000:
        fix_lines.append("")
        fix_lines.append(f" [bold]Reclaimable: ~{total_reclaimable // 1024}GB[/] \u00b7 [dim]localcoder --cleanup[/]")

    fixes_panel = None
    if fix_lines:
        border = "red" if diag["status"] == "critical" else "yellow" if diag["status"] == "degraded" else "dim"
        fixes_panel = Panel("\n".join(fix_lines), title="[bold]Fixes[/]", border_style=border, padding=(0, 1))

    # ── Glossary (noob-friendly, using a borderless Rich Table for alignment) ──
    glossary_table = Table(show_header=False, show_edge=False, show_lines=False,
                           box=None, padding=(0, 1), expand=False)
    glossary_table.add_column("Term", style="dim", width=14, no_wrap=True)
    glossary_table.add_column("Description", style="dim")

    glossary_entries = [
        ("KV Cache", "Stores conversation history in GPU. Grows with context length.\n"
                     "128K ctx = 630MB (q4_0) or 1.2GB (f16). Use -ctk q4_0 to halve it."),
        ("Quantization", "Compresses model weights: Q3=small Q4=sweet spot Q8=best quality.\n"
                         "Rule: ~0.7GB per 1B params at Q4. 26B Q3 = 12GB, Q4 = 18GB."),
        ("GPU Layers", "-ngl 99 = all on GPU (fast). Partial offload = 5-10x slower."),
        ("Flash Attn", "-fa on = memory-efficient attention. Always enable it."),
        ("Swap", "RAM overflow to disk. 100x slower. Keep under 2GB."),
        ("MoE", "Mixture of Experts -- only 4B of 26B active per token."),
        ("Metal Limit", "macOS reserves ~25% RAM. Override: sudo sysctl iogpu.wired_limit_mb=N"),
    ]
    for term, desc in glossary_entries:
        glossary_table.add_row(term, desc)

    glossary = Panel(glossary_table, title="[bold dim]What do these mean?[/]", border_style="dim", padding=(0, 1))

    return status_bar, header, model_line, cards, vram_bar, vram_legend, table, fixes_panel, glossary, diag
800
+
801
+
802
def _build_status_bar(diag, specs):
    """Build the one-line status bar: GPU, swap and memory pressure, then key hints.

    Args:
        diag: Diagnostics mapping; ``gpu_alloc_mb``, ``gpu_total_mb`` and
            ``mem_pressure`` are read with defaults of 0, 0 and "?".
        specs: Machine specs; accepted but not read by this function.

    Returns:
        A ``rich.text.Text`` holding the styled bar.
    """
    from rich.text import Text

    swap_mb = get_swap_usage_mb()
    gpu_alloc = diag.get("gpu_alloc_mb", 0)
    gpu_total = diag.get("gpu_total_mb", 0)
    pressure = diag.get("mem_pressure", "?")

    # Traffic-light styles for the three readings.
    pressure_style = {"normal": "green", "warn": "yellow", "critical": "red"}.get(pressure, "dim")

    if swap_mb > 4000:
        swap_style = "red"
    elif swap_mb > 1000:
        swap_style = "yellow"
    else:
        swap_style = "green"

    if gpu_alloc > gpu_total:
        gpu_style = "red"
    elif gpu_alloc > gpu_total * 0.8:
        gpu_style = "yellow"
    else:
        gpu_style = "green"

    label_style = "bold white on blue"
    key_style = "bold black on white"

    # (text, style) pairs in display order; None means an unstyled spacer.
    segments = [
        (" GPU ", label_style),
        (" ", None),
        (f"{gpu_alloc // 1024}/{gpu_total // 1024}GB", gpu_style),
        (" ", None),
        (" SWAP ", label_style),
        (" ", None),
        (f"{swap_mb // 1024}GB", swap_style),
        (" ", None),
        (" MEM ", label_style),
        (" ", None),
        (f"{pressure}", pressure_style),
        (" ", None),
        # Shortcuts
        (" h ", key_style),
        (" health ", "dim"),
        (" c ", key_style),
        (" cleanup ", "dim"),
        (" d ", key_style),
        (" debloat ", "dim"),
        (" s ", key_style),
        (" simulate ", "dim"),
    ]

    bar = Text()
    for text, style in segments:
        bar.append(text, style=style)
    return bar
841
+
842
+
843
def print_health_dashboard(model_id=None):
    """Render the GPU health dashboard: clear screen, fixed width, status bar last.

    Args:
        model_id: Optional key into MODELS, forwarded to the layout builder.

    Returns:
        The diagnostics dict produced while building the layout.
    """
    from rich.console import Console as _Console

    term_w, term_h = shutil.get_terminal_size()

    # Use a fixed-width console to prevent stretching
    out = _Console(width=min(90, term_w), highlight=False)

    # Phase 1 + 2: scan and build the layout under a spinner. The context
    # manager stops the spinner even if a probe raises. The layout builder
    # already runs the diagnostics, so its result is reused instead of
    # scanning a second time.
    out.clear()
    with out.status("[bold cyan] Scanning GPU, processes, server...[/]", spinner="dots"):
        specs = get_machine_specs()
        result = _build_dashboard_layout(model_id)
    status_bar_top, header, model_line, cards, vram_bar, vram_legend, table, fixes_panel, glossary, diag = result

    # Phase 3: Clear and render all at once
    out.clear()

    out.print(status_bar_top)
    out.print(header)
    if model_line:
        out.print(model_line)
    out.print()
    out.print(cards)
    out.print(vram_bar)
    out.print(vram_legend)
    out.print()
    out.print(table)
    if fixes_panel:
        out.print(fixes_panel)

    # Glossary only if space
    if term_h > 42:
        out.print(glossary)

    # Status bar at bottom (no ANSI cursor tricks — just print it)
    status_bar_widget = _build_status_bar(diag, specs)
    out.print()
    out.print(status_bar_widget)
    out.print()

    return diag
894
+
895
+
896
def check_backend_installed(backend_id):
    """Report whether the executable for *backend_id* can be located.

    The path stored in ``BACKENDS[backend_id]["binary"]`` is tried first; if
    it does not exist, the same file name is looked up on ``PATH``.
    """
    candidate = BACKENDS[backend_id]["binary"]
    return candidate.exists() or bool(shutil.which(candidate.name))
906
+
907
+
908
def check_backend_running(backend_id):
    """Return True if the backend answers ``GET /v1/models`` on its default port.

    Any failure to connect or to get a successful response within two seconds
    counts as "not running".
    """
    port = BACKENDS[backend_id]["default_port"]
    try:
        url = f"http://127.0.0.1:{port}/v1/models"
        req = urllib.request.Request(url, headers={"Content-Type": "application/json"})
        with urllib.request.urlopen(req, timeout=2):
            return True
    except Exception:
        # Best-effort probe: refused connections, timeouts and HTTP errors all
        # mean "not running". ``Exception`` (rather than a bare ``except``)
        # lets Ctrl-C and SystemExit propagate.
        return False
919
+
920
+
921
def get_running_models(backend_id):
    """Return the model ids reported by a running backend.

    Queries ``GET /v1/models`` on the backend's default port and collects the
    ``id`` of every entry under the top-level ``data`` key.

    Returns:
        A list of id strings; empty if the backend cannot be reached or the
        response does not have the expected shape.
    """
    port = BACKENDS[backend_id]["default_port"]
    try:
        url = f"http://127.0.0.1:{port}/v1/models"
        req = urllib.request.Request(url, headers={"Content-Type": "application/json"})
        with urllib.request.urlopen(req, timeout=2) as resp:
            data = json.loads(resp.read())
    except Exception:
        # Best-effort probe; ``Exception`` keeps Ctrl-C / SystemExit working.
        return []

    entries = data.get("data") if isinstance(data, dict) else None
    if not isinstance(entries, list):
        return []
    return [m.get("id", "") for m in entries if isinstance(m, dict)]
933
+
934
+
935
def discover_all():
    """Probe every entry in ``BACKENDS`` and summarise what was found.

    Returns:
        One dict per backend with the keys ``id``, ``name``, ``installed``,
        ``running``, ``models`` and ``port``. The running check is skipped for
        backends that are not installed, and models are only queried from
        backends that are running.
    """
    summary = []
    for backend_id, spec in BACKENDS.items():
        is_installed = check_backend_installed(backend_id)
        is_running = check_backend_running(backend_id) if is_installed else False
        model_ids = get_running_models(backend_id) if is_running else []
        entry = {
            "id": backend_id,
            "name": spec["name"],
            "installed": is_installed,
            "running": is_running,
            "models": model_ids,
            "port": spec["default_port"],
        }
        summary.append(entry)
    return summary
951
+
952
+
953
def install_backend(backend_id):
    """Install a backend by running its ``install_cmd`` through bash.

    On success the backend's ``binary`` entry in ``BACKENDS`` is refreshed so
    that a freshly installed executable is picked up.

    Args:
        backend_id: Key into ``BACKENDS``.

    Returns:
        True if the install command exited with status 0, False otherwise
        (including when the installer times out or bash cannot be launched).
    """
    b = BACKENDS[backend_id]
    console.print(f"\n [bold]Installing {b['name']}...[/]")
    console.print(f" [dim]{b['install_cmd']}[/]\n")

    try:
        r = subprocess.run(["bash", "-c", b["install_cmd"]], timeout=600)
    except subprocess.TimeoutExpired:
        console.print(" [red]Installer timed out after 10 minutes[/]")
        return False
    except OSError:
        console.print(" [red]Could not launch bash to run the installer[/]")
        return False

    if r.returncode != 0:
        return False

    # Re-discover the binary after install. The executable name is taken from
    # the existing entry instead of being hard-coded per backend id, and the
    # previous path is kept as a fallback location.
    previous = Path(b["binary"])
    b["binary"] = _find_binary(previous.name, [previous])
    return True
967
+
968
+
969
def download_model_hf(model_id):
    """Download a model from HuggingFace into ``MODELS_DIR / model_id``.

    Uses ``huggingface_hub.snapshot_download`` when the package is importable
    and falls back to the ``huggingface-cli`` executable otherwise. The
    model's optional ``hf_pattern`` is a comma-separated list of glob patterns
    restricting which files are fetched.

    Args:
        model_id: Key into ``MODELS``.

    Returns:
        The local directory as a string on success, or None when the model has
        no ``hf_repo`` or the CLI fallback fails.
    """
    m = MODELS[model_id]
    if not m.get("hf_repo"):
        console.print(f" [red]No HuggingFace repo for {model_id}[/]")
        return None

    local_dir = MODELS_DIR / model_id
    console.print(f"\n [bold]Downloading {m['name']}...[/]")
    console.print(f" [dim]From: {m['hf_repo']}[/]")
    console.print(f" [dim]To: {local_dir}[/]")
    console.print(f" [dim]Size: ~{m['size_gb']} GB[/]\n")

    # Split once so the library path and the CLI path filter identically.
    patterns = [p.strip() for p in (m.get("hf_pattern") or "").split(",") if p.strip()]

    try:
        from huggingface_hub import snapshot_download
        snapshot_download(
            repo_id=m["hf_repo"],
            local_dir=str(local_dir),
            allow_patterns=patterns or None,
        )
        return str(local_dir)
    except ImportError:
        pass

    # Fallback to the CLI. ``--include`` takes each pattern as its own
    # argument; a single comma-joined string would be treated as one glob.
    cmd = ["huggingface-cli", "download", m["hf_repo"], "--local-dir", str(local_dir)]
    if patterns:
        cmd += ["--include", *patterns]
    try:
        r = subprocess.run(cmd, timeout=1800)
    except FileNotFoundError:
        console.print(" [red]Neither huggingface_hub nor huggingface-cli is available[/]")
        console.print(" [dim]Install with: pip install huggingface_hub[/]")
        return None
    except subprocess.TimeoutExpired:
        console.print(" [red]Download timed out after 30 minutes[/]")
        return None
    return str(local_dir) if r.returncode == 0 else None
997
+
998
+
999
def download_model_ollama(model_id):
    """Pull a model via ``ollama pull``.

    Args:
        model_id: Key into ``MODELS``; its ``ollama_tag`` names what to pull.

    Returns:
        True if the pull exited with status 0. False when the model has no
        ``ollama_tag``, the ollama executable cannot be found, the pull times
        out, or it exits non-zero.
    """
    m = MODELS[model_id]
    tag = m.get("ollama_tag")
    if not tag:
        return False
    console.print(f"\n [bold]Pulling {tag} via Ollama...[/]")

    # Prefer the discovered binary (it may live outside PATH), then PATH.
    binary = str(BACKENDS["ollama"]["binary"])
    if not os.path.exists(binary):
        binary = shutil.which("ollama") or "ollama"

    try:
        r = subprocess.run([binary, "pull", tag], timeout=1800)
    except FileNotFoundError:
        console.print(" [red]ollama not found[/]")
        return False
    except subprocess.TimeoutExpired:
        console.print(" [red]ollama pull timed out after 30 minutes[/]")
        return False
    return r.returncode == 0
1008
+
1009
+
1010
def find_model_file(model_id):
    """Find the GGUF weights file for a model.

    Looks in ``MODELS_DIR / model_id`` first and then, when the model has an
    ``hf_repo``, in that repo's folder inside the HuggingFace hub cache. Files
    whose name contains "mmproj" (vision projectors) are skipped.

    Returns:
        The path of the first matching file (in sorted order) as a string, or
        None if nothing was found.
    """
    search_dirs = [MODELS_DIR / model_id]

    m = MODELS.get(model_id, {})
    if m.get("hf_repo"):
        repo_dir = f"models--{m['hf_repo'].replace('/', '--')}"
        search_dirs.append(HOME / ".cache/huggingface/hub" / repo_dir)

    # The cache is consulted even when the local directory exists but holds
    # no GGUF file (e.g. a partial or mmproj-only download).
    for d in search_dirs:
        if not d.exists():
            continue
        # Sorted so the choice is deterministic across runs and filesystems.
        for f in sorted(d.rglob("*.gguf")):
            if "mmproj" not in f.name.lower():
                return str(f)
    return None
1030
+
1031
+
1032
def find_mmproj_file(model_id):
    """Locate a vision-projector ("mmproj") file belonging to a model.

    The model's own folder under ``MODELS_DIR`` is searched first, followed by
    the matching repo folder in the HuggingFace hub cache when the model
    declares an ``hf_repo``.

    Returns:
        The first path whose name contains "mmproj", as a string, or None.
    """
    roots = [MODELS_DIR / model_id]

    spec = MODELS.get(model_id, {})
    if spec.get("hf_repo"):
        repo_folder = f"models--{spec['hf_repo'].replace('/', '--')}"
        roots.append(HOME / ".cache/huggingface/hub" / repo_folder)

    for root in roots:
        if not root.exists():
            continue
        hit = next(iter(root.rglob("*mmproj*")), None)
        if hit is not None:
            return str(hit)
    return None
1048
+
1049
+
1050
def start_llama_server(model_id, port=8089):
    """Start llama-server for a model and wait until it reports healthy.

    Args:
        model_id: Key into ``MODELS``; its ``server_flags`` (if any) are added
            to the command line.
        port: TCP port to serve on.

    Returns:
        The ``subprocess.Popen`` handle once ``/health`` answers, or None when
        the model file or binary is missing, the process cannot be launched,
        it exits during startup, or it is not healthy within about 60 seconds.
    """
    m = MODELS.get(model_id, {})
    model_file = find_model_file(model_id)
    if not model_file:
        console.print(f" [red]Model file not found for {model_id}[/]")
        return None

    binary = str(BACKENDS["llamacpp"]["binary"])
    if not os.path.exists(binary):
        binary = shutil.which("llama-server")
    if not binary:
        console.print(f" [red]llama-server not found[/]")
        return None

    flags = m.get("server_flags", "-ngl 99 -c 32768 --jinja").split()
    cmd = [binary, "-m", model_file, "--port", str(port)] + flags

    # Add mmproj if available
    mmproj = find_mmproj_file(model_id)
    if mmproj:
        cmd += ["--mmproj", mmproj]
    else:
        cmd += ["--no-mmproj"]

    console.print(f" [dim]Starting: {' '.join(os.path.basename(c) if '/' in c else c for c in cmd[:6])}...[/]")

    try:
        proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError:
        console.print(f" [red]Server failed to start[/]")
        return None

    # Poll the health endpoint once per second for up to a minute.
    health_url = f"http://127.0.0.1:{port}/health"
    for _ in range(60):
        # Stop waiting as soon as the child has died (bad flags, OOM, ...);
        # this also avoids mistaking another server on the same port for ours.
        if proc.poll() is not None:
            console.print(f" [red]Server failed to start[/] [dim](exit code {proc.returncode})[/]")
            return None
        try:
            req = urllib.request.Request(health_url)
            with urllib.request.urlopen(req, timeout=1):
                console.print(f" [green]✓ Server ready on port {port}[/]")
                return proc
        except Exception:
            # Not listening yet, or still loading the model.
            time.sleep(1)

    console.print(f" [red]Server failed to start[/]")
    proc.kill()
    try:
        # Reap the child so it does not linger as a zombie.
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        pass
    return None
1092
+
1093
+
1094
def get_gpu_memory_info():
    """Estimate GPU memory budget and how much llama-server is using.

    On macOS the budget is taken as 67% of system RAM and the resident set
    size of the first ``llama-server`` process is subtracted from it. On other
    platforms the whole system RAM is reported as both total and free.

    Returns:
        Dict with integer ``total_mb``, ``free_mb`` and ``used_by_llama_mb``.
    """
    info = {"total_mb": 0, "free_mb": 0, "used_by_llama_mb": 0}
    ram = get_system_ram_gb()

    if not IS_MAC:
        # int() keeps the value type consistent with the macOS branch.
        info["total_mb"] = int(ram * 1024)
        info["free_mb"] = info["total_mb"]
        return info

    # Metal GPU limit is ~67% of unified memory
    info["total_mb"] = int(ram * 1024 * 0.67)
    info["free_mb"] = info["total_mb"]

    # Check if llama-server is using GPU
    try:
        out = subprocess.run(
            ["pgrep", "-f", "llama-server"], capture_output=True, text=True, timeout=3,
        )
        if out.stdout.strip():
            pid = out.stdout.strip().split()[0]
            rss = subprocess.run(
                ["ps", "-o", "rss=", "-p", pid], capture_output=True, text=True, timeout=3,
            )
            if rss.stdout.strip():
                # ps reports RSS in KiB.
                info["used_by_llama_mb"] = int(rss.stdout.strip()) // 1024
                info["free_mb"] = max(0, info["total_mb"] - info["used_by_llama_mb"])
    except Exception:
        # Best-effort probe: keep the optimistic defaults on any failure.
        pass
    return info
1118
+
1119
+
1120
def _parse_llama_server_flags(args, config):
    """Copy the settings recognised in a llama-server argv list into *config*."""
    for i, arg in enumerate(args):
        value = args[i + 1] if i + 1 < len(args) else None
        if arg in ("-ngl", "--gpu-layers", "--n-gpu-layers"):
            if value is not None:
                try:
                    config["ngl"] = int(value)
                except ValueError:
                    pass  # non-numeric value: leave the default
        elif arg in ("-c", "--ctx-size"):
            if value is not None:
                try:
                    config["n_ctx"] = int(value)
                except ValueError:
                    pass
        elif arg in ("-ctk", "--cache-type-k"):
            if value is not None:
                config["kv_quant"] = value
        elif arg in ("-fa", "--flash-attn"):
            # The flag is either bare or followed by on/off/auto; only an
            # explicit "off" (or equivalent) disables it.
            config["flash_attn"] = value is None or value.lower() not in ("off", "0", "false")
        elif arg in ("-m", "--model"):
            if value is not None:
                config["model_path"] = value


def get_llama_server_config():
    """Inspect a running llama-server process and report its settings.

    The first PID returned by ``pgrep -f llama-server`` is examined: its
    command line is read with ``ps`` and parsed for GPU layers, context size,
    KV cache type, flash attention and model path (short and long flag
    spellings), and its memory footprint is measured.

    Returns:
        Dict with ``running``, ``pid``, ``model_path``, ``ngl``, ``n_ctx``,
        ``kv_quant``, ``flash_attn``, ``footprint_mb`` and ``flags`` (the raw
        whitespace-split command line). Defaults are returned for anything
        that could not be determined.
    """
    config = {
        "running": False,
        "pid": None,
        "model_path": None,
        "ngl": 0,             # GPU layers (-ngl)
        "n_ctx": 0,           # Context size (-c)
        "kv_quant": None,     # KV cache quantization type (-ctk/-ctv)
        "flash_attn": False,  # Flash attention (-fa)
        "footprint_mb": 0,    # Process memory footprint
        "flags": [],
    }

    try:
        out = subprocess.run(
            ["pgrep", "-f", "llama-server"], capture_output=True, text=True,
        )
        pids = out.stdout.strip().splitlines()
        if not pids:
            return config
        config["running"] = True
        config["pid"] = int(pids[0].strip())

        # Get full command line
        cmd_out = subprocess.run(
            ["ps", "-o", "args=", "-p", str(config["pid"])],
            capture_output=True, text=True,
        )
        args = cmd_out.stdout.strip().split()
        config["flags"] = args

        # A malformed value for one flag no longer aborts the remaining
        # parsing or the footprint measurement below.
        _parse_llama_server_flags(args, config)

        # Get process memory footprint
        if IS_MAC:
            config["footprint_mb"] = _parse_footprint_mb(config["pid"])
        else:
            try:
                rss = subprocess.run(
                    ["ps", "-o", "rss=", "-p", str(config["pid"])],
                    capture_output=True, text=True,
                )
                if rss.stdout.strip():
                    # ps reports RSS in KiB.
                    config["footprint_mb"] = int(rss.stdout.strip()) // 1024
            except Exception:
                pass

    except Exception:
        # Best-effort inspection: return whatever was gathered so far.
        pass

    return config
1183
+
1184
+
1185
def _smi_number(text):
    """Parse one numeric CSV field; return None for values like "[N/A]"."""
    try:
        return float(text.replace("%", "").strip())
    except ValueError:
        return None


def _read_macos_gpu_stats(stats):
    """Fill *stats* from ``ioreg -l`` and ``pmset -g therm`` (macOS)."""
    import re

    try:
        out = subprocess.run(
            ["ioreg", "-l"], capture_output=True, text=True, timeout=10,
        )
        for line in out.stdout.splitlines():
            if "VRAM,totalMB" in line:
                m = re.search(r'"VRAM,totalMB"=(\d+)', line)
                if m:
                    stats["total_mb"] = int(m.group(1))
            if "PerformanceStatistics" in line and "Alloc system memory" in line:
                m = re.search(r'"Alloc system memory"=(\d+)', line)
                if m:
                    stats["alloc_mb"] = int(m.group(1)) // (1024 * 1024)
                m2 = re.search(r'"In use system memory"=(\d+)', line)
                if m2:
                    stats["in_use_mb"] = int(m2.group(1)) // (1024 * 1024)
                m3 = re.search(r'"Device Utilization %"=(\d+)', line)
                if m3:
                    stats["utilization_pct"] = int(m3.group(1))

        # Thermal state on macOS (approximate — no direct GPU temp on Apple Silicon)
        try:
            therm = subprocess.run(
                ["pmset", "-g", "therm"], capture_output=True, text=True, timeout=3,
            )
            limit = re.search(r"CPU_Scheduler_Limit\s*=\s*(\d+)", therm.stdout)
            # A limit of 100 means no throttling; only a lower value
            # indicates that thermal throttling is active.
            if limit and int(limit.group(1)) < 100:
                stats["temperature_c"] = 95  # approximate
        except Exception:
            pass
    except Exception:
        pass


def _read_nvidia_gpu_stats(stats):
    """Fill *stats* from nvidia-smi; return False if nvidia-smi is not installed."""
    try:
        out = subprocess.run(
            ["nvidia-smi",
             "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu,temperature.gpu,fan.speed,power.draw",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5,
        )
    except FileNotFoundError:
        return False
    except (OSError, subprocess.SubprocessError):
        return True

    if out.returncode != 0:
        return True

    # One line per GPU; report the first one.
    rows = out.stdout.strip().splitlines()
    parts = [p.strip() for p in rows[0].split(",")] if rows else []
    if len(parts) < 8:
        return True

    # Field order follows --query-gpu:
    # 0 name, 1 memory.total, 2 memory.used, 3 memory.free,
    # 4 utilization.gpu, 5 temperature.gpu, 6 fan.speed, 7 power.draw
    stats["gpu_name"] = parts[0]
    total, used, util = _smi_number(parts[1]), _smi_number(parts[2]), _smi_number(parts[4])
    if total is not None:
        stats["total_mb"] = int(total)
    if used is not None:
        stats["alloc_mb"] = int(used)
        stats["in_use_mb"] = int(used)
    if util is not None:
        stats["utilization_pct"] = int(util)

    temp, fan, power = _smi_number(parts[5]), _smi_number(parts[6]), _smi_number(parts[7])
    if temp is not None:
        stats["temperature_c"] = int(temp)
    if fan is not None:
        stats["fan_pct"] = int(fan)
    if power is not None:
        stats["power_w"] = power
    return True


def _read_rocm_gpu_stats(stats):
    """Fill VRAM totals in *stats* from ``rocm-smi --showmeminfo vram --csv``."""
    try:
        out = subprocess.run(
            ["rocm-smi", "--showmeminfo", "vram", "--csv"],
            capture_output=True, text=True, timeout=5,
        )
    except (OSError, subprocess.SubprocessError):
        return
    if out.returncode != 0:
        return

    # NOTE(review): rows are expected as "device,total_bytes,used_bytes" —
    # confirm against the installed rocm-smi version.
    for line in out.stdout.splitlines()[1:]:
        parts = [p.strip() for p in line.split(",")]
        if len(parts) < 3:
            continue
        try:
            total_b, used_b = int(parts[1]), int(parts[2])
        except ValueError:
            continue
        stats["total_mb"] = total_b // (1024 * 1024)
        stats["alloc_mb"] = used_b // (1024 * 1024)
        break  # first GPU only


def get_metal_gpu_stats():
    """Get real GPU stats — Metal on macOS, nvidia-smi / rocm-smi on Linux.

    Returns:
        Dict with ``total_mb``, ``alloc_mb``, ``in_use_mb``,
        ``free_vram_bytes``, ``utilization_pct``, ``temperature_c``,
        ``fan_pct``, ``power_w`` and ``gpu_name``. Values that cannot be read
        keep their defaults (0 or None); the function never raises because a
        tool is missing or prints an unparsable field.
    """
    stats = {
        "total_mb": 0,
        "alloc_mb": 0,
        "in_use_mb": 0,
        "free_vram_bytes": 0,
        "utilization_pct": 0,
        "temperature_c": None,
        "fan_pct": None,
        "power_w": None,
        "gpu_name": None,
    }

    if IS_MAC:
        _read_macos_gpu_stats(stats)
    elif IS_LINUX:
        # nvidia-smi for NVIDIA GPUs; fall back to AMD's rocm-smi only when
        # nvidia-smi is not installed at all.
        if not _read_nvidia_gpu_stats(stats):
            _read_rocm_gpu_stats(stats)

    return stats
1281
+
1282
+
1283
def get_disk_info():
    """Get disk space and model storage info.

    Gathers, independently of each other:
      * total/free space of the filesystem holding ``HOME``,
      * the size of the HuggingFace hub cache and the GGUF models inside it
        (files with "mmproj" in the name are not listed as models),
      * the space reported by ``docker system df``.

    Returns:
        Dict with ``disk_total_gb``, ``disk_free_gb``, ``hf_cache_gb``,
        ``docker_gb`` (whole GiB/GB, rounded) and ``models``, a list of
        ``{"name", "size_gb", "path"}`` dicts sorted largest first. A section
        that cannot be read keeps its default without affecting the others.
    """
    import re

    info = {
        "disk_total_gb": 0,
        "disk_free_gb": 0,
        "hf_cache_gb": 0,
        "models": [],  # list of {name, size_gb, path}
        "docker_gb": 0,
    }
    gib = 1024 ** 3

    # Disk space
    try:
        usage = shutil.disk_usage(HOME)
        info["disk_total_gb"] = round(usage.total / gib)
        info["disk_free_gb"] = round(usage.free / gib)
    except OSError:
        pass

    # HuggingFace cache: one walk collects both the total and the GGUF list.
    hf_cache = HOME / ".cache/huggingface/hub"
    try:
        if hf_cache.exists():
            total = 0
            for entry in hf_cache.rglob("*"):
                try:
                    # Count real files only; snapshot entries are symlinks
                    # to blobs and would otherwise be counted twice.
                    if entry.is_file() and not entry.is_symlink():
                        total += entry.stat().st_size
                    name = entry.name
                    if name.endswith(".gguf") and "mmproj" not in name.lower():
                        real = entry.resolve()
                        sz = real.stat().st_size / gib
                        info["models"].append(
                            {"name": name, "size_gb": round(sz, 1), "path": str(real)}
                        )
                except OSError:
                    continue  # unreadable or dangling entry
            info["hf_cache_gb"] = round(total / gib)
            info["models"].sort(key=lambda x: x["size_gb"], reverse=True)
    except OSError:
        pass

    # Docker (if running)
    try:
        out = subprocess.run(["docker", "system", "df", "--format", "{{.Size}}"],
                             capture_output=True, text=True, timeout=3)
    except (OSError, subprocess.SubprocessError):
        out = None
    if out is not None and out.returncode == 0:
        # Docker prints decimal units (kB, MB, GB, TB).
        to_gb = {"TB": 1000.0, "GB": 1.0, "MB": 1 / 1000, "KB": 1 / 1_000_000}
        docker_gb = 0.0
        for line in out.stdout.strip().splitlines():
            m = re.match(r"^\s*([\d.]+)\s*([KMGT]B)\s*$", line.upper())
            if not m:
                continue
            try:
                docker_gb += float(m.group(1)) * to_gb[m.group(2)]
            except ValueError:
                continue
        info["docker_gb"] = round(docker_gb)

    return info
1339
+
1340
+
1341
def get_swap_usage_mb():
    """Get swap usage in MB.

    macOS: parsed from ``sysctl -n vm.swapusage``.
    Elsewhere: ``SwapTotal - SwapFree`` from ``/proc/meminfo``.

    Returns:
        Used swap in whole megabytes, or 0 if it cannot be determined.
    """
    try:
        if IS_MAC:
            import re
            out = subprocess.run(
                ["sysctl", "-n", "vm.swapusage"], capture_output=True, text=True, timeout=3,
            )
            # "total = 10240.00M used = 8538.06M free = 1701.94M"
            m = re.search(r'used\s*=\s*([\d.]+)M', out.stdout)
            if m:
                return int(float(m.group(1)))
        else:
            total = free = None
            with open("/proc/meminfo") as f:
                for line in f:
                    if line.startswith("SwapTotal:"):
                        total = int(line.split()[1]) // 1024  # kB -> MB
                    elif line.startswith("SwapFree:"):
                        free = int(line.split()[1]) // 1024
            # Only report a value when both fields were actually present.
            if total is not None and free is not None:
                return total - free
    except (OSError, ValueError, IndexError, subprocess.SubprocessError):
        pass
    return 0
1367
+
1368
+
1369
def diagnose_gpu_health(model_id=None):
    """Full GPU health diagnostic. Returns dict with status and recommendations.

    Checks:
    1. Is model running on GPU or CPU?
    2. Is KV cache optimized?
    3. Is context size appropriate?
    4. Is swap thrashing happening?
    5. Are flags optimal?

    Args:
        model_id: Optional key into ``MODELS``; used only to fill in
            ``model_size_mb``.

    Returns:
        Dict whose ``status`` is "healthy", "degraded", "critical" or
        "unknown" (server not running), with human-readable ``issues`` and
        ``fixes`` lists plus the raw measurements they were derived from.
    """
    diag = {
        "status": "unknown",  # "healthy", "degraded", "critical"
        "on_gpu": False,
        "gpu_layers": 0,
        "total_layers": 99,
        "kv_quantized": False,
        "kv_type": None,
        "flash_attn": False,
        "context_size": 0,
        "kv_cache_est_mb": 0,
        "model_size_mb": 0,
        "gpu_total_mb": 0,
        "gpu_alloc_mb": 0,
        "gpu_util_pct": 0,
        "swap_used_mb": 0,
        "swap_thrashing": False,
        "mem_pressure": "unknown",
        "issues": [],
        "fixes": [],
        "server_config": {},
    }

    # Get server config
    srv = get_llama_server_config()
    diag["server_config"] = srv

    if not srv["running"]:
        diag["status"] = "unknown"
        diag["issues"].append("llama-server not running")
        return diag

    # GPU layer offload
    diag["gpu_layers"] = srv["ngl"]
    diag["on_gpu"] = srv["ngl"] >= 90  # -ngl 99 means all on GPU
    diag["context_size"] = srv["n_ctx"]
    diag["flash_attn"] = srv["flash_attn"]

    if not diag["on_gpu"]:
        diag["issues"].append(f"Only {srv['ngl']} layers on GPU — model partially on CPU")
        diag["fixes"].append("Restart with -ngl 99 to offload all layers to GPU")

    # KV cache. f16/bf16/f32 are full-precision cache types, so only the
    # q*/iq* types count as quantized.
    diag["kv_type"] = srv["kv_quant"]
    diag["kv_quantized"] = srv["kv_quant"] in (
        "q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl",
    )
    if not diag["kv_quantized"]:
        diag["issues"].append("KV cache not quantized — using full precision (2x memory)")
        diag["fixes"].append("Add -ctk q4_0 -ctv q4_0 to quantize KV cache (saves ~50% KV memory)")

    if not diag["flash_attn"]:
        diag["issues"].append("Flash attention disabled — slower and more memory")
        diag["fixes"].append("Add -fa on to enable flash attention")

    # Estimate KV cache memory
    # For Gemma 4 26B: 5 global layers × 128K context × 2 (K+V) × hidden_dim
    # With q4_0: ~630MB. Without quantization: ~1.2GB
    if diag["context_size"] > 0:
        # Rough estimate: 128K ctx with q4_0 KV ≈ 630MB, without ≈ 1200MB
        ctx_ratio = diag["context_size"] / 131072
        if diag["kv_quantized"]:
            diag["kv_cache_est_mb"] = int(630 * ctx_ratio)
        else:
            diag["kv_cache_est_mb"] = int(1200 * ctx_ratio)

    # Model size
    if model_id and model_id in MODELS:
        diag["model_size_mb"] = int(MODELS[model_id]["size_gb"] * 1024)

    # Metal GPU stats
    metal = get_metal_gpu_stats()
    diag["gpu_total_mb"] = metal["total_mb"]
    diag["gpu_alloc_mb"] = metal["alloc_mb"]
    diag["gpu_util_pct"] = metal["utilization_pct"]

    # Swap check
    diag["swap_used_mb"] = get_swap_usage_mb()
    diag["swap_thrashing"] = diag["swap_used_mb"] > 4000  # >4GB swap = bad

    if diag["swap_thrashing"]:
        diag["issues"].append(f"Swap thrashing: {diag['swap_used_mb'] // 1024}GB in swap — major slowdown")
        diag["fixes"].append("Reduce context size (-c 32768) or use smaller quant to free GPU memory")

    # Memory pressure (macOS sysctl; stays "unknown" where it is unavailable)
    try:
        out = subprocess.run(
            ["sysctl", "-n", "kern.memorystatus_vm_pressure_level"],
            capture_output=True, text=True, timeout=3,
        )
        level = int(out.stdout.strip())
        # The sysctl reports the dispatch-style levels 1 (normal), 2 (warn)
        # and 4 (critical); 0 is tolerated as normal.
        diag["mem_pressure"] = {0: "normal", 1: "normal", 2: "warn", 4: "critical"}.get(level, "unknown")
    except Exception:
        pass

    if diag["mem_pressure"] == "critical":
        diag["issues"].append("Critical memory pressure — system may kill processes")
        diag["fixes"].append("Run: localcoder --cleanup")

    # Context size warnings
    if diag["context_size"] > 65536 and not diag["kv_quantized"]:
        diag["issues"].append(f"Large context ({diag['context_size']//1024}K) without KV quantization")
        diag["fixes"].append("Either reduce context or add -ctk q4_0 -ctv q4_0")

    # Check if Metal limit could be raised
    if IS_MAC and diag["gpu_total_mb"] > 0:
        ram_mb = get_system_ram_gb() * 1024
        current_limit = diag["gpu_total_mb"]
        max_safe = int(ram_mb * 0.90)  # leave 10% for system
        if current_limit < max_safe and diag["swap_thrashing"]:
            new_limit = max_safe
            diag["fixes"].append(
                f"Raise Metal GPU limit: sudo sysctl iogpu.wired_limit_mb={new_limit}"
                f" (current: {current_limit}MB, max safe: {new_limit}MB)"
            )

    # Overall status
    if not diag["issues"]:
        diag["status"] = "healthy"
    elif diag["swap_thrashing"] or not diag["on_gpu"] or diag["mem_pressure"] == "critical":
        diag["status"] = "critical"
    else:
        diag["status"] = "degraded"

    return diag
1501
+
1502
+
1503
def print_gpu_health(diag=None, model_id=None):
    """Print the GPU health diagnostic as a bordered panel.

    Args:
        diag: A result from ``diagnose_gpu_health``; computed here when None.
        model_id: Passed to ``diagnose_gpu_health`` when *diag* is None.

    Returns:
        The diagnostic dict that was rendered.
    """
    if diag is None:
        diag = diagnose_gpu_health(model_id)

    badge_by_status = {
        "healthy": ("green", "✓ Healthy"),
        "degraded": ("yellow", "⚠ Degraded"),
        "critical": ("red", "✗ Critical"),
        "unknown": ("dim", "? Unknown"),
    }
    color, label = badge_by_status.get(diag["status"], ("dim", "?"))

    server = diag["server_config"]
    rows = []

    # Server-specific rows only make sense while llama-server is running.
    if server.get("running"):
        # Compute placement
        if diag["on_gpu"]:
            gpu_icon = "[green]●[/] GPU"
        else:
            gpu_icon = "[red]●[/] CPU (SLOW!)"
        rows.append(f" Compute: {gpu_icon} · {diag['gpu_layers']} layers offloaded · GPU util: {diag['gpu_util_pct']}%")

        # KV cache
        if diag["kv_quantized"]:
            kv_icon = "[green]●[/]"
            kv_info = f"quantized ({diag['kv_type']})"
        else:
            kv_icon = "[red]●[/]"
            kv_info = "full precision (2x memory!)"
        rows.append(
            f" KV cache: {kv_icon} {kv_info} · ~{diag['kv_cache_est_mb']}MB"
            f" · context: {diag['context_size'] // 1024}K tokens"
        )

        # Flash attention
        if diag["flash_attn"]:
            fa_icon, fa_state = "[green]●[/]", "on"
        else:
            fa_icon, fa_state = "[yellow]●[/]", "off"
        rows.append(f" Flash attn: {fa_icon} {fa_state}"
                    f" · footprint: {server.get('footprint_mb', 0)}MB")

    # Memory
    pressure_palette = {"normal": "green", "warn": "yellow", "critical": "red"}
    pressure_color = pressure_palette.get(diag["mem_pressure"], "dim")
    swap_color = "red" if diag["swap_thrashing"] else "green"
    rows.append(
        f" Memory: [{pressure_color}]{diag['mem_pressure']}[/{pressure_color}]"
        f" · swap: [{swap_color}]{diag['swap_used_mb'] // 1024}GB[/{swap_color}]"
        f" · GPU alloc: {diag['gpu_alloc_mb'] // 1024}GB / {diag['gpu_total_mb'] // 1024}GB"
    )

    # Issues, then suggested fixes, each preceded by a blank separator row.
    if diag["issues"]:
        rows.append("")
        rows.extend(f" [red]✗[/] {issue}" for issue in diag["issues"])
    if diag["fixes"]:
        rows.append("")
        rows.extend(f" [green]→[/] {fix}" for fix in diag["fixes"])

    panel = Panel(
        "\n".join(rows),
        title=f"[bold]GPU Health [{color}]{label}[/{color}][/]",
        border_style=color,
        padding=(0, 1),
    )
    console.print(panel)

    return diag
1565
+
1566
+
1567
def auto_optimize_server(model_id=None):
    """Check if server needs optimization and apply fixes.

    A restart is triggered only when a problem coincides with the relevant
    flag being absent from the server's command line, so explicitly chosen
    settings are left alone. If the only problem is swap thrashing, a GPU
    memory cleanup is attempted instead.

    Args:
        model_id: Model to restart the server with; without it no restart is
            attempted.

    Returns True if server was restarted with better flags.
    """
    diag = diagnose_gpu_health(model_id)

    if diag["status"] == "healthy":
        return False

    srv = diag["server_config"]

    # Compare whole arguments. A substring test on the joined command line
    # would mis-detect e.g. "-fa" inside a model path like "x-fast.gguf".
    present = set(srv.get("flags", []))

    def flag_missing(*spellings):
        return present.isdisjoint(spellings)

    # Check if current flags are suboptimal
    needs_restart = (
        (not diag["on_gpu"] and flag_missing("-ngl", "--gpu-layers", "--n-gpu-layers"))
        or (not diag["kv_quantized"] and flag_missing("-ctk", "--cache-type-k"))
        or (not diag["flash_attn"] and flag_missing("-fa", "--flash-attn"))
    )

    if needs_restart and model_id:
        console.print("\n [yellow]Server running with suboptimal flags — restarting with optimizations...[/]")
        # Kill current server
        if srv.get("pid"):
            try:
                subprocess.run(["kill", str(srv["pid"])], timeout=5)
                time.sleep(2)  # give it a moment to release the port
            except Exception:
                pass
        # Start with optimal flags
        proc = start_llama_server(model_id)
        if proc:
            console.print(" [green]✓ Server restarted with optimal GPU flags[/]")
            return True
        else:
            console.print(" [red]Failed to restart server[/]")

    # If swap thrashing, try to free memory without restart
    if diag["swap_thrashing"] and not needs_restart:
        console.print("\n [yellow]Swap thrashing detected — cleaning up GPU memory...[/]")
        cleanup_gpu_memory(force=False)

    return False
1613
+
1614
+
1615
# ── macOS Debloat categories for ML workloads ──
# Maps a category key to:
#   "name"     – title shown in the debloat wizard menu
#   "desc"     – one-line description shown under the title
#   "safe"     – boolean flag (True for every category here); NOTE(review): not
#                read by the code visible in this section — confirm its consumers
#   "services" – launchd service label -> short description; the wizard passes
#                each label to `launchctl disable` / `launchctl bootout`
# The wizard numbers categories in insertion order, so reordering entries
# changes the menu numbering.
DEBLOAT_CATEGORIES = {
    "ml_hogs": {
        "name": "ML & Analysis Daemons",
        "desc": "Apple's background ML that competes with your model for GPU",
        "safe": True,
        "services": {
            "com.apple.photoanalysisd": "Photos face/scene ML — uses GPU + 2-8GB RAM",
            "com.apple.mediaanalysisd": "Visual Lookup, Live Text ML — GPU heavy",
            "com.apple.suggestd": "Siri suggestions indexer — background ML",
            "com.apple.intelligenced": "Apple Intelligence (Sequoia) — GPU heavy",
            "com.apple.mlruntime": "Core ML runtime — shared GPU compute",
        },
    },
    "location_bloat": {
        "name": "Location & Sync Bloat",
        "desc": "Known memory leakers on macOS 14/15",
        "safe": True,
        "services": {
            "com.apple.CoreLocationAgent": "Location cache — leaks to 8GB+ (notorious)",
            "com.apple.remindd": "Reminders sync — memory leak on macOS 15",
            "com.apple.cloudd": "iCloud Drive sync — bloats with many files",
            "com.apple.bird": "CloudKit container daemon",
        },
    },
    "telemetry": {
        "name": "Telemetry & Analytics",
        "desc": "Crash reports, analytics, diagnostics — zero impact to disable",
        "safe": True,
        "services": {
            "com.apple.analyticsd": "Analytics collection",
            "com.apple.ReportCrash": "Crash report generation",
            "com.apple.spindump": "CPU sampling diagnostics",
            "com.apple.DiagnosticReportCleanup": "Diagnostic cleanup",
            "com.apple.ap.adprivacyd": "Ad privacy daemon",
            "com.apple.ap.adservicesd": "Ad services",
            "com.apple.triald": "A/B testing framework",
        },
    },
    "siri_ai": {
        "name": "Siri & Apple AI",
        "desc": "Siri, assistant, Apple Intelligence",
        "safe": True,
        "services": {
            "com.apple.Siri.agent": "Siri main service",
            "com.apple.assistantd": "Assistant daemon",
            "com.apple.parsec.fbf": "Siri search suggestions",
            "com.apple.tipsd": "Tips and suggestions",
            "com.apple.ScreenTimeAgent": "Screen time tracking",
        },
    },
}
1667
+
1668
+
1669
def debloat_wizard():
    """Interactive debloat wizard for ML workloads.

    Shows categories of services that can be disabled to free GPU/memory.
    User picks categories, we disable via launchctl.
    Creates restore script.

    The menu also offers killing currently bloated processes (``k``),
    restoring previously disabled services (``r``) and quitting (``q``).
    Only services whose ``launchctl disable`` call succeeded are reported as
    disabled and written to the restore script.
    """
    import signal

    def fmt_size(mb):
        # Below 1GB show MB so small processes are not printed as "0GB".
        return f"{mb / 1024:.1f}GB" if mb >= 1024 else f"{mb}MB"

    console.clear()
    console.print()
    console.print(" [bold]localcoder debloat wizard[/]")
    console.print(" [dim]Disable macOS services that compete with your model for GPU & memory[/]")
    console.print(" [dim]All changes are reversible — a restore script is saved automatically[/]\n")

    # Show current bloated processes
    top_procs = get_top_memory_processes(min_mb=200, limit=5)
    bloat_procs = [p for p in top_procs if p["category"] == "bloat"]
    if bloat_procs:
        console.print(" [yellow]Currently bloated system processes:[/]")
        for p in bloat_procs:
            size = fmt_size(p["mb"])
            desc = SYSTEM_RESTARTABLE.get(p["name"], "")
            console.print(f" [red]●[/] {p['name']} [bold]{size}[/] [dim]{desc}[/]")
        console.print()

    # Show categories
    cats = list(DEBLOAT_CATEGORIES.items())
    for i, (key, cat) in enumerate(cats, 1):
        n_services = len(cat["services"])
        console.print(f" [bold]{i}.[/] {cat['name']} [dim]({n_services} services)[/]")
        console.print(f" [dim]{cat['desc']}[/]")
        for svc, desc in list(cat["services"].items())[:3]:
            console.print(f" [dim] · {svc.split('.')[-1]}: {desc}[/]")
        if n_services > 3:
            console.print(f" [dim] + {n_services - 3} more[/]")
        console.print()

    console.print(f" [bold]k.[/] Kill bloated processes now [dim](one-time, they may restart)[/]")
    console.print(f" [bold]a.[/] All categories [dim](maximum GPU headroom)[/]")
    console.print(f" [bold]r.[/] Restore all [dim](re-enable everything)[/]")
    console.print(f" [bold]q.[/] Quit\n")

    try:
        ans = input(" Choose (e.g. 1,2 or a): ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        return

    if ans == "q" or not ans:
        return

    if ans == "r":
        _debloat_restore()
        return

    if ans == "k":
        _kill_bloated_processes()
        return

    # Parse selection
    selected_cats = []
    if ans == "a":
        selected_cats = list(DEBLOAT_CATEGORIES.keys())
    else:
        for part in ans.replace(" ", "").split(","):
            try:
                idx = int(part) - 1
            except ValueError:
                continue
            if 0 <= idx < len(cats) and cats[idx][0] not in selected_cats:
                selected_cats.append(cats[idx][0])

    if not selected_cats:
        console.print(" [dim]No categories selected.[/]")
        return

    # Disabling services needs launchctl, which only exists on macOS.
    if shutil.which("launchctl") is None:
        console.print(" [red]launchctl not found — disabling services is only supported on macOS[/]")
        return

    # Confirm
    total_services = sum(len(DEBLOAT_CATEGORIES[c]["services"]) for c in selected_cats)
    cat_names = ", ".join(DEBLOAT_CATEGORIES[c]["name"] for c in selected_cats)
    console.print(f"\n [yellow]Will disable {total_services} services: {cat_names}[/]")
    try:
        confirm = input(" Proceed? (y/n): ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        return
    if confirm != "y":
        return

    # Disable services
    disabled = []
    restore_cmds = []
    for cat_key in selected_cats:
        cat = DEBLOAT_CATEGORIES[cat_key]
        for svc, desc in cat["services"].items():
            svc_disabled = False
            # Try both user and system domains
            for domain in [f"gui/{os.getuid()}", "system"]:
                target = f"{domain}/{svc}"
                r = subprocess.run(
                    ["launchctl", "disable", target], capture_output=True, text=True,
                )
                # Also bootout if currently loaded
                subprocess.run(
                    ["launchctl", "bootout", target], capture_output=True, text=True,
                )
                if r.returncode == 0:
                    svc_disabled = True
                    restore_cmds.append(f"launchctl enable {target}")
            if svc_disabled:
                disabled.append(svc)
                console.print(f" [green]✓[/] {svc.split('.')[-1]} [dim]{desc}[/]")
            else:
                console.print(f" [yellow]–[/] {svc.split('.')[-1]} [dim]could not be disabled (may need sudo)[/]")

    # Also kill currently bloated processes
    for p in bloat_procs:
        # Look up "pid" only when "pids" is absent, so a record without a
        # "pid" key does not raise.
        pids = p["pids"] if "pids" in p else [p["pid"]]
        signalled = False
        for pid in pids:
            try:
                os.kill(pid, signal.SIGTERM)
                signalled = True
            except (ProcessLookupError, PermissionError):
                pass
        if signalled:
            console.print(f" [green]✓[/] Killed {p['name']} (was {fmt_size(p['mb'])})")

    # Save restore script
    restore_path = CONFIG_DIR / "restore_debloat.sh"
    CONFIG_DIR.mkdir(parents=True, exist_ok=True)
    with open(restore_path, "w") as f:
        f.write("#!/bin/bash\n# localcoder debloat restore script\n")
        f.write(f"# Generated: {time.strftime('%Y-%m-%d %H:%M')}\n\n")
        f.writelines(f"{cmd}\n" for cmd in restore_cmds)
        f.write('\necho "All services restored. Reboot recommended."\n')
    os.chmod(restore_path, 0o755)

    console.print(f"\n [green]Disabled {len(disabled)} services.[/]")
    console.print(f" [dim]Restore script: {restore_path}[/]")
    console.print(f" [dim]Run: localcoder --debloat then choose 'r' to restore[/]\n")
1800
+
1801
+
1802
def _kill_bloated_processes():
    """Send SIGTERM to every process currently classified as bloat (one-time).

    Candidates come from ``get_top_memory_processes(min_mb=300)`` filtered to
    entries whose ``category`` is ``"bloat"``. An entry is reported as killed,
    and counted toward the freed total, only when at least one of its PIDs
    was actually signalled.
    """
    import signal

    procs = get_top_memory_processes(min_mb=300)
    bloat = [p for p in procs if p["category"] == "bloat"]
    if not bloat:
        console.print(" [dim]No bloated processes found.[/]")
        return

    freed = 0
    for p in bloat:
        # Only fall back to the single "pid" key when no "pids" list is given.
        pids = p.get("pids") or [p["pid"]]
        signalled = 0
        for pid in pids:
            try:
                os.kill(pid, signal.SIGTERM)
            except (ProcessLookupError, PermissionError):
                continue
            signalled += 1

        if not signalled:
            # Nothing was terminated, so do not claim the memory as freed.
            console.print(
                f" [yellow]![/] Could not kill {p['name']} "
                "[dim](already gone or permission denied)[/]"
            )
            continue

        mb = p["mb"]
        freed += mb
        # One decimal so sub-1GB processes do not show up as "0GB".
        console.print(f" [green]✓[/] Killed {p['name']} ({mb / 1024:.1f}GB)")

    console.print(f"\n [green]Freed ~{freed / 1024:.1f}GB[/] [dim](processes may restart smaller)[/]")
1823
+
1824
+
1825
def _debloat_restore():
    """Run the saved restore script to re-enable debloated services.

    The script is expected at ``CONFIG_DIR / "restore_debloat.sh"``. It is
    deleted after a run that exits with status 0. A missing script, a timeout,
    or a failure to launch ``bash`` is reported to the user instead of raising,
    and in those cases the script is kept so the restore can be retried.
    """
    restore_path = CONFIG_DIR / "restore_debloat.sh"
    if not restore_path.exists():
        console.print(" [dim]No restore script found — nothing to restore.[/]")
        return

    console.print(" [yellow]Restoring all disabled services...[/]")
    try:
        r = subprocess.run(
            ["bash", str(restore_path)],
            capture_output=True, text=True, timeout=30,
        )
    except subprocess.TimeoutExpired:
        console.print(f" [red]Restore timed out after 30s. Script kept at: {restore_path}[/]")
        return
    except OSError as e:
        # e.g. bash is not installed or the script is not readable
        console.print(f" [red]Could not run restore script: {e}[/]")
        return

    if r.returncode == 0:
        console.print(" [green]All services restored. Reboot recommended.[/]")
        restore_path.unlink()
    else:
        console.print(f" [red]Some services failed to restore: {r.stderr[:200]}[/]")
1839
+
1840
+
1841
# Coding models favoured by the r/LocalLLaMA community ("Best LLMs 2025"
# megathread). Curated from user recommendations rather than benchmarks.
# Each entry maps a short key to: display name, HuggingFace GGUF repo,
# rough VRAM bracket, and a one-line note.
COMMUNITY_CODING_MODELS = {
    # <=8GB VRAM
    "lfm2-8b-a1b": dict(name="LFM2 8B-A1B", hf="liquid/LFM2-8B-A1B-GGUF", vram="8GB", note="Crazy fast MoE, great general + tool calling"),
    "qwen3-4b": dict(name="Qwen 3 4B", hf="unsloth/Qwen3-4B-GGUF", vram="4GB", note="Best tool calling at 4B size"),
    # 12-24GB VRAM (most LocalLLaMA users)
    "qwen3-coder-30b": dict(name="Qwen 3 Coder 30B-A3B", hf="unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF", vram="12-24GB", note="Top agentic coder, MoE"),
    "nemotron-30b-a3b": dict(name="Nemotron 30B-A3B", hf="unsloth/Nemotron-3-Nano-30B-A3B-GGUF", vram="12-24GB", note="NVIDIA MoE, fastest generation"),
    "gemma4-26b": dict(name="Gemma 4 26B-A4B", hf="unsloth/gemma-4-26B-A4B-it-GGUF", vram="12-16GB", note="Best tool calling + vision, 49 tok/s"),
    "devstral-24b": dict(name="Devstral Small 24B", hf="lmstudio-community/Devstral-Small-2-24B-Instruct-2512-GGUF", vram="12-24GB", note="Reliable daily driver for coding"),
    "glm-4.6v-flash": dict(name="GLM 4.6V Flash", hf="THUDM/glm-4.6v-flash-9b-gguf", vram="8-12GB", note="Best small model of the year (r/LocalLLaMA)"),
    # 24-48GB VRAM
    "gpt-oss-20b": dict(name="GPT-OSS 20B", hf="unsloth/gpt-oss-20b-GGUF", vram="24GB", note="Best accuracy under 48GB"),
    "qwen3.5-35b-a3b": dict(name="Qwen 3.5 35B-A3B", hf="unsloth/Qwen3.5-35B-A3B-GGUF", vram="12-24GB", note="1.5M downloads, MoE coding beast"),
    # 48-96GB VRAM
    "glm-4.5-air": dict(name="GLM 4.5 Air", hf="THUDM/glm-4.5-9b-air-gguf", vram="48-96GB", note="Flat-out amazing for codegen (r/LocalLLaMA)"),
    # 96GB+
    "gpt-oss-120b": dict(name="GPT-OSS 120B", hf="unsloth/gpt-oss-120b-GGUF", vram="96GB+", note="Most recommended for agentic coding"),
    "devstral-123b": dict(name="Devstral 123B", hf="mistralai/Devstral-2-123B-GGUF", vram="96GB+", note="Compact 123B, fits 2x RTX Pro"),
    "minimax-m2": dict(name="MiniMax M2.1", hf="unsloth/MiniMax-M2.1-GGUF", vram="96GB+", note="Frontier performance, fantastic agentic coding"),
}
1863
+
1864
+
1865
# In-process cache for _fetch_all_hf_models: "data" is the last result list,
# "ts" the time.time() at which it was stored.
_hf_model_cache = {"data": None, "ts": 0}

# Lower rank wins when several providers publish the same base model.
_HF_PROVIDER_RANK = {"unsloth": 0, "bartowski": 1, "lmstudio-community": 2}


def _summarize_hf_model(m, author):
    """Turn one raw HuggingFace API model record into a summary dict.

    Args:
        m: a model record (dict) as returned by the HF ``/api/models`` endpoint.
        author: the provider the record was fetched for.

    Returns:
        A dict with ``repo_id``, ``label``, ``downloads``, ``likes``,
        ``author``, ``caps``, ``est_smallest_gb``, ``_rank`` and ``_base``;
        or ``None`` when the record has no id, is not tagged ``gguf``, or has
        fewer than 1000 downloads.
    """
    import re

    rid = m.get("id")
    tags = m.get("tags") or []
    dl = m.get("downloads") or 0
    if not rid or "gguf" not in tags or dl < 1000:
        return None

    name = rid.split("/")[-1].replace("-GGUF", "").replace("-Instruct", "").replace("-it", "")

    # Detect modalities from tags (and, for code/MoE, from the model name).
    caps = []
    if "image-text-to-text" in tags:
        caps.append("vision")
    if any("audio" in t for t in tags):
        caps.append("audio")
    if any("code" in t.lower() for t in tags) or "coder" in name.lower():
        caps.append("code")
    # "A<n>B" as its own name segment marks active params of a MoE model.
    if any("moe" in t.lower() for t in tags) or re.search(r'(?:^|[-_.])A\d+(?:\.\d+)?B', name):
        caps.append("MoE")

    # Estimate the smallest quant from the TOTAL parameter count in the name:
    # a Q2 quant is roughly 0.35 GB per 1B params. Decimals ("0.5B") are
    # honoured, and the active-params marker ("A3B") is never taken as total.
    est_smallest_gb = None
    param_match = re.search(r'(?<![\d.A])(\d+(?:\.\d+)?)[bB](?![A-Za-z])', name)
    if param_match:
        est_smallest_gb = round(float(param_match.group(1)) * 0.35, 1)

    return {
        "repo_id": rid,
        "label": name,
        "downloads": dl,
        "likes": m.get("likes", 0),
        "author": author,
        "caps": caps,
        "est_smallest_gb": est_smallest_gb,
        "_rank": _HF_PROVIDER_RANK.get(author, 9),
        "_base": name.lower(),
    }


def _fetch_all_hf_models():
    """Fetch GGUF models from all top providers in parallel. Cached for 10 minutes.

    One call returns everything; callers sort by downloads or likes themselves.

    Returns:
        A list of summary dicts (see ``_summarize_hf_model``), de-duplicated by
        base model name with preference unsloth > bartowski > lmstudio-community.
        If nothing could be fetched, the previous (stale) cache is returned
        when there is one, otherwise an empty list.
    """
    import concurrent.futures

    # Return cache if fresh
    cached = _hf_model_cache["data"]
    if cached and time.time() - _hf_model_cache["ts"] < 600:
        return cached

    providers = list(_HF_PROVIDER_RANK)

    def _fetch_one(author):
        """Fetch the 20 most-downloaded repos of one provider; [] on any error."""
        try:
            url = f"https://huggingface.co/api/models?author={author}&sort=downloads&direction=-1&limit=20"
            req = urllib.request.Request(url, headers={"User-Agent": "localcoder/1.0"})
            with urllib.request.urlopen(req, timeout=8) as resp:
                payload = json.loads(resp.read())
        except Exception:
            return []
        # An error body may be a JSON object rather than a list of models.
        return payload if isinstance(payload, list) else []

    # Parallel fetch: all providers at once (~1 API call time instead of 3)
    all_raw = []
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=len(providers)) as pool:
            futures = {pool.submit(_fetch_one, p): p for p in providers}
            try:
                for future in concurrent.futures.as_completed(futures, timeout=10):
                    author = futures[future]
                    all_raw.extend((author, m) for m in future.result() if isinstance(m, dict))
            except concurrent.futures.TimeoutError:
                pass  # keep whatever arrived in time instead of discarding it
    except Exception:
        all_raw = []

    # Deduplicate by base model name, keeping the best-ranked provider.
    seen = {}
    for author, m in all_raw:
        entry = _summarize_hf_model(m, author)
        if entry is None:
            continue
        previous = seen.get(entry["_base"])
        if previous is None or entry["_rank"] < previous["_rank"]:
            seen[entry["_base"]] = entry

    result = list(seen.values())
    if not result and cached:
        # Refresh failed (offline, rate-limited): stale data beats no data.
        return cached

    _hf_model_cache["data"] = result
    _hf_model_cache["ts"] = time.time()
    return result
1967
+
1968
+
1969
def fetch_unsloth_top_models(limit=12):
    """Return the ``limit`` most-downloaded GGUF model summaries.

    Data comes from the cached, parallel ``_fetch_all_hf_models`` call.
    """
    ranked = list(_fetch_all_hf_models())
    ranked.sort(key=lambda entry: entry["downloads"], reverse=True)
    return ranked[:limit]
1974
+
1975
+
1976
def fetch_hf_trending_models(limit=5, sort="downloads"):
    """Return the top ``limit`` GGUF model summaries.

    ``sort="likes"`` orders by like count; any other value orders by
    downloads. Data comes from the cached ``_fetch_all_hf_models`` call.
    """
    ranked = list(_fetch_all_hf_models())

    if sort == "likes":
        def sort_key(entry):
            return entry.get("likes", 0)
    else:
        def sort_key(entry):
            return entry["downloads"]

    ranked.sort(key=sort_key, reverse=True)
    return ranked[:limit]
1984
+
1985
+
1986
# Backwards-compatibility shim: older code called this name directly.
def _fetch_unsloth_top_compat(limit=12):
    """Delegate to ``fetch_unsloth_top_models`` with the same ``limit``."""
    top_models = fetch_unsloth_top_models(limit)
    return top_models
1989
+
1990
+
1991
+
1992
+
1993
def _hf_api_json(url, timeout=10):
    """GET ``url`` with the localcoder User-Agent and return the decoded JSON."""
    req = urllib.request.Request(url, headers={"User-Agent": "localcoder/1.0"})
    with urllib.request.urlopen(req, timeout=timeout) as resp:
        return json.loads(resp.read())


def _parse_hf_query(query):
    """Classify user input as a repo id or a search term.

    Returns:
        ``(repo_id, search_term)``. ``repo_id`` is ``None`` when the input has
        to be resolved through a HuggingFace search for ``search_term``.
    """
    import re

    query = query.strip()
    if "huggingface.co" in query:
        # https://huggingface.co/org/model or /org/model/...
        m = re.search(r'huggingface\.co/([^/]+/[^/\s?#]+)', query)
        return (m.group(1) if m else None), query
    if "ollama.com" in query:
        # Must be tested before the generic "org/model" case below, because an
        # Ollama URL also contains "/" and no spaces.
        m = re.search(r'ollama\.com/library/([^/\s?#]+)', query)
        return None, (m.group(1) if m else query.rstrip("/").split("/")[-1])
    if "/" in query and " " not in query:
        # Direct repo ID: unsloth/gemma-4-26B-A4B-it-GGUF
        return query, query
    return None, query


def _list_gguf_files(siblings):
    """Extract usable GGUF files (with size and quant) from HF ``siblings``.

    Skips files under 500 MB, vision projectors (``mmproj``) and every part of
    a split (multi-part) model. Entries with a missing size are treated as
    size 0 and therefore skipped. The result is sorted by size, ascending.
    """
    import re

    quant_re = re.compile(
        r'(BF16|F16|Q\d+_K(?:_[A-Z]+)?|Q\d+_\d+|IQ\d+_[A-Z]+|MXFP\d+)', re.IGNORECASE
    )
    files = []
    for s in siblings or []:
        name = s.get("rfilename") or ""
        size = s.get("size") or 0  # the API may report null
        if not name.endswith(".gguf") or size < 500_000_000:  # skip tiny files
            continue
        if "mmproj" in name.lower():
            continue  # skip vision projectors
        if "-0000" in name:
            continue  # skip all parts of split models (e.g. -00001-of-00003)

        qm = quant_re.search(name)
        files.append({
            "filename": name,
            "size_bytes": size,
            "size_gb": round(size / (1024**3), 1),
            "quant": qm.group(1).upper() if qm else "unknown",
        })

    files.sort(key=lambda f: f["size_bytes"])
    return files


def fetch_hf_model(query):
    """Fetch GGUF model info from HuggingFace.

    Accepts:
    - Full URL: https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF
    - Repo ID: unsloth/gemma-4-26B-A4B-it-GGUF
    - Ollama library URL: https://ollama.com/library/<model> (searched by name)
    - Search term: gemma 4 26b gguf

    Returns:
        A dict with ``repo_id``, ``name``, ``tags``, ``downloads`` and
        ``gguf_files`` (list of ``filename``/``size_bytes``/``size_gb``/
        ``quant`` dicts, smallest first), or ``None`` when the query is empty,
        nothing matches, or the API cannot be reached.
    """
    import re

    if not query or not query.strip():
        return None

    repo_id, search_term = _parse_hf_query(query)

    # If no repo_id, search HuggingFace
    if not repo_id:
        if "gguf" not in search_term.lower():
            search_term += " gguf"
        try:
            results = _hf_api_json(
                f"https://huggingface.co/api/models?search={urllib.parse.quote(search_term)}"
                "&sort=downloads&direction=-1&limit=5"
            )
            # Prefer the first repo tagged GGUF, else the top hit.
            for r in results:
                if any("gguf" in t.lower() for t in r.get("tags", [])):
                    repo_id = r["id"]
                    break
            if not repo_id and results:
                repo_id = results[0]["id"]
        except Exception:
            return None

    if not repo_id:
        return None

    def _details_url(rid):
        return f"https://huggingface.co/api/models/{urllib.parse.quote(rid, safe='/')}?blobs=true"

    # Fetch model metadata + file sizes (with fallback to search)
    data = None
    try:
        data = _hf_api_json(_details_url(repo_id))
    except Exception:
        # Direct lookup failed: search using the repo name as the query.
        try:
            fallback_term = repo_id.split("/")[-1].replace("-", " ").replace("_", " ")
            # Strip 4-digit version/date stamps for a better search
            fallback_term = re.sub(r'\b\d{4}\b', '', fallback_term).strip()
            results = _hf_api_json(
                f"https://huggingface.co/api/models?search={urllib.parse.quote(fallback_term)}"
                "&sort=downloads&direction=-1&limit=3"
            )
            if results:
                repo_id = results[0]["id"]
                data = _hf_api_json(_details_url(repo_id))
        except Exception:
            pass

    if not data:
        return None

    return {
        "repo_id": repo_id,
        "name": data.get("id", repo_id).split("/")[-1],
        "tags": data.get("tags", []),
        "downloads": data.get("downloads", 0),
        "gguf_files": _list_gguf_files(data.get("siblings", [])),
    }
2107
+
2108
+
2109
def simulate_hf_model(query):
    """Fetch a model from HuggingFace and show which quants fit this machine.

    Prints a table of every GGUF quant in the repo with a fit verdict against
    the GPU budget, recommends the best fit, then prompts for a selection:
    ``#`` shows a detailed analysis, ``d #`` additionally downloads the file,
    ``q`` (or empty input) quits.

    Args:
        query: HuggingFace URL, repo id, or free-text search term.
    """
    specs = get_machine_specs()
    metal = get_metal_gpu_stats()
    gpu_total = metal.get("total_mb") or specs["gpu_total_mb"]
    gpu_used = metal.get("alloc_mb", 0)

    console.clear()
    # The context manager stops the spinner even if the fetch raises.
    with console.status("[bold cyan] Fetching from HuggingFace...[/]", spinner="dots"):
        model = fetch_hf_model(query)

    if not model:
        console.print(f"\n [red]Model not found: {query}[/]")
        console.print(" [dim]Try a HuggingFace URL or search term like 'llama 3.1 70b gguf'[/]\n")
        return

    console.clear()
    console.print()
    console.print(f" [bold]{model['repo_id']}[/]")
    console.print(f" [dim]{specs['chip']} · {specs['ram_gb']}GB RAM · GPU budget: {gpu_total // 1024}GB · In use: {gpu_used // 1024}GB[/]\n")

    files = model["gguf_files"]
    if not files:
        console.print(" [yellow]No GGUF files found in this repo.[/]\n")
        return

    # Show all quants with fit status
    table = Table(
        title=f"Available Quants ({len(files)})",
        show_header=True, header_style="bold", border_style="dim", padding=(0, 1),
    )
    table.add_column("#", style="dim", width=3)
    table.add_column("Quant", width=14)
    table.add_column("Size", justify="right", width=8)
    table.add_column("Fits GPU?", width=18)
    table.add_column("Est. Speed", width=12)
    table.add_column("", width=18)

    best_fit_idx = None  # 1-based index into files
    for i, f in enumerate(files, 1):
        size_gb = f["size_gb"]
        size_mb = int(size_gb * 1024)
        fits = size_mb < gpu_total                    # fits the whole GPU budget
        fits_free = size_mb < (gpu_total - gpu_used)  # fits what is free right now

        if fits_free:
            status = "[green]✓ fits[/]"
            # Files are sorted smallest first, so this keeps the largest that fits.
            if best_fit_idx is None or size_gb > files[best_fit_idx - 1]["size_gb"]:
                best_fit_idx = i
        elif fits:
            status = "[yellow]⚠ tight[/]"
            if best_fit_idx is None:
                best_fit_idx = i
        else:
            status = "[red]✗ too big[/]"

        # Speed estimate: heuristic scaled by file size, much lower when swapping
        if fits:
            tps = min(120, max(1, int(49 * 12 / max(1, size_gb))))
            speed = f"~{tps} tok/s"
        else:
            tps = max(1, int(5 * 16 / max(1, size_gb)))
            speed = f"[red]~{tps} tok/s[/]"

        # Visual bar: share of the GPU budget this file would take
        bar_pct = min(1.0, size_mb / gpu_total) if gpu_total else 0
        bar_w = int(bar_pct * 16)
        bar_color = "green" if fits_free else "yellow" if fits else "red"
        bar = f"[{bar_color}]{'█' * bar_w}[/{bar_color}][dim]{'░' * (16 - bar_w)}[/]"

        table.add_row(str(i), f["quant"], f"{size_gb}GB", status, speed, bar)

    console.print(table)

    # Recommendation
    is_unsloth = "unsloth" in model["repo_id"].lower()
    if best_fit_idx:
        bf = files[best_fit_idx - 1]
        console.print(f"\n [green bold]→ Best fit: #{best_fit_idx} {bf['quant']} ({bf['size_gb']}GB)[/]")
        console.print(f" [dim]Highest quality that fits your {gpu_total // 1024}GB GPU[/]")
        if is_unsloth:
            console.print(" [dim]Unsloth quants use imatrix calibration — better quality than standard GGUF[/]")
    else:
        console.print(f"\n [red]No quant fits your {gpu_total // 1024}GB GPU budget.[/]")
        smallest = files[0]
        # State the budget as the budget, not as the amount the model needs.
        console.print(f" [dim]Smallest: {smallest['quant']} at {smallest['size_gb']}GB (GPU budget is {gpu_total // 1024}GB)[/]")
        if not is_unsloth:
            console.print(" [dim]Tip: check unsloth/ on HuggingFace — they often have smaller K_XL quants[/]")

    # Interactive: pick one to simulate in detail or download
    console.print("\n [dim]Enter # for detailed analysis, 'd #' to download, or 'q' to quit[/]\n")
    try:
        ans = input(" > ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        return

    if ans == "q" or not ans:
        return

    download = ans.startswith("d")
    if download:
        ans = ans[1:].strip()

    # Only the number parsing is guarded, so errors raised by the simulation
    # or the download are no longer silently swallowed.
    try:
        idx = int(ans) - 1
    except ValueError:
        console.print(" [dim]Not a valid selection.[/]")
        return
    if not 0 <= idx < len(files):
        console.print(f" [dim]Choose a number between 1 and {len(files)}.[/]")
        return

    chosen = files[idx]
    # Run detailed simulation with real size
    _simulate_with_real_size(chosen, model["repo_id"], specs, gpu_total, gpu_used)

    if download:
        console.print(f"\n [yellow]Downloading {chosen['filename']}...[/]")
        _download_gguf(model["repo_id"], chosen["filename"])
2226
+
2227
+
2228
def _simulate_with_real_size(gguf, repo_id, specs, gpu_total, gpu_used):
    """Print a detailed fit analysis for one GGUF file using its real size.

    Args:
        gguf: file dict with ``size_gb`` and ``quant`` (as built by ``fetch_hf_model``).
        repo_id: HuggingFace repo id, used for the heading.
        specs: machine specs dict; ``ram_gb`` is read for the sysctl hint.
        gpu_total: GPU memory budget in MB.
        gpu_used: GPU memory currently allocated, in MB.
    """
    size_gb = gguf["size_gb"]
    size_mb = int(size_gb * 1024)
    fits = size_mb < gpu_total

    kv_per_1k = max(2, int(size_gb * 0.4))  # heuristic: MB of KV cache per 1K context
    # Same speed heuristic as the quant table in simulate_hf_model, so the
    # detail view does not contradict the number shown in the table.
    if fits:
        tps = min(120, max(1, int(49 * 12 / max(1, size_gb))))
    else:
        tps = max(1, int(5 * 16 / max(1, size_gb)))

    console.print(f"\n [bold]{repo_id}[/] · [cyan]{gguf['quant']}[/] · [bold]{size_gb}GB[/]")

    # Memory bar: used | model | free, always exactly `bw` cells wide
    bw = 50
    ub = int(min(1.0, gpu_used / gpu_total) * bw) if gpu_total else 0
    mb = int(min(1.0, size_mb / gpu_total) * bw) if gpu_total else 0
    mb = min(mb, bw - ub)  # do not let used + model overflow the bar
    fb = max(0, bw - mb - ub)
    model_color = "green" if fits else "red"
    console.print(f"\n [cyan]{'█' * ub}[/][{model_color}]{'█' * mb}[/][dim]{'░' * fb}[/] {gpu_total // 1024}GB")
    console.print(f" [cyan]■[/] used:{gpu_used // 1024}G [{model_color}]■[/] model:{size_gb}G [dim]░[/] free:{max(0, gpu_total - gpu_used - size_mb) // 1024}G")

    # Context table: model + KV cache against the GPU budget
    console.print()
    for ctx in [8192, 32768, 65536, 131072]:
        kv = kv_per_1k * (ctx // 1024)
        tot = size_mb + kv
        h = gpu_total - tot  # headroom in MB
        icon = "[green]✓[/]" if h > 2000 else "[yellow]⚠[/]" if h > 0 else "[red]✗[/]"
        kv_s = f"{kv}M" if kv < 1024 else f"{kv / 1024:.1f}G"
        console.print(f" {icon} {ctx // 1024}K ctx → model {size_gb}G + KV {kv_s} = {tot / 1024:.1f}G")

    console.print(f"\n Est. speed: [bold]~{tps} tok/s[/]" + ("" if fits else " [red](CPU swap)[/]"))

    if not fits:
        console.print(f" [yellow]→ Try a smaller quant or: sudo sysctl iogpu.wired_limit_mb={int(specs['ram_gb'] * 1024 * 0.9)}[/]")
2262
+
2263
+
2264
def _download_gguf_with_curl(repo_id, filename, dest):
    """Download one file from a public HF repo with curl; return its path or None.

    The data is written to ``<dest>.part`` and moved to ``dest`` only after
    curl succeeds, so an interrupted or failed download is never mistaken for
    a finished one on the next run.
    """
    url = f"https://huggingface.co/{repo_id}/resolve/main/{urllib.parse.quote(filename)}"
    partial = dest.with_name(dest.name + ".part")
    console.print(f" [dim]Downloading {os.path.basename(filename)}...[/]")
    # --fail makes curl exit non-zero on HTTP errors (401/403/404) instead of
    # saving the error page as if it were the model file.
    cmd = ["curl", "--fail", "-L", "-o", str(partial), "--progress-bar", url]
    try:
        r = subprocess.run(cmd)
    except OSError as e:
        console.print(f" [red]Download failed: could not run curl ({e})[/]")
        return None

    if r.returncode == 0 and partial.exists():
        os.replace(partial, dest)
        console.print(f" [green]✓ Downloaded: {dest}[/]")
        return str(dest)

    if partial.exists():
        partial.unlink()
    console.print(" [red]Download failed. If gated model, run: huggingface-cli login[/]")
    return None


def _download_gguf(repo_id, filename):
    """Download a GGUF file from HuggingFace.

    Uses huggingface_hub if available (supports gated models with token).
    Falls back to curl. No token needed for public repos (Unsloth, bartowski, etc).
    Gated models (Meta Llama) need: huggingface-cli login

    Args:
        repo_id: HuggingFace repo id, e.g. ``org/model-GGUF``.
        filename: path of the file inside the repo.

    Returns:
        The local path of the file as a string, or ``None`` if the download failed.
    """
    local_dir = MODELS_DIR / repo_id.replace("/", "--")
    local_dir.mkdir(parents=True, exist_ok=True)
    dest = local_dir / os.path.basename(filename)

    # NOTE(review): hf_hub_download(local_dir=...) is expected to keep the
    # repo-relative path below local_dir, so also look there. Confirm against
    # the installed huggingface_hub version.
    for candidate in (dest, local_dir / filename):
        if candidate.exists():
            console.print(f" [green]Already downloaded: {candidate}[/]")
            return str(candidate)

    try:
        from huggingface_hub import hf_hub_download
    except ImportError:
        return _download_gguf_with_curl(repo_id, filename, dest)

    try:
        path = hf_hub_download(repo_id=repo_id, filename=filename, local_dir=str(local_dir))
    except Exception as e:
        if "401" in str(e) or "403" in str(e) or "gated" in str(e).lower():
            console.print(" [red]Gated model — run: huggingface-cli login[/]")
        else:
            console.print(f" [red]Download failed: {e}[/]")
        return None

    console.print(f" [green]✓ Downloaded: {path}[/]")
    return path
2300
+
2301
+
2302
def _estimate_model_from_query(model_query):
    """Guess a model's label and file size from free text such as ``"70b q4"``.

    The parameter count is read from the original (un-squashed) text so that
    a version number next to it is not glued on: ``"llama 3.1 70b"`` is 70B,
    not 170B. The quant defaults to Q4 when no ``q<digit>`` is present.

    Returns:
        ``{"name": "<N>B Q<q>", "size_gb": <float>}`` or ``None`` when no
        parameter count can be found.
    """
    import re

    text = model_query.lower()
    param_match = re.search(r'(?<![\d.])(\d+(?:\.\d+)?)\s*b(?![a-z])', text)
    if not param_match:
        return None
    quant_match = re.search(r'q(\d)', text)

    value = float(param_match.group(1))
    params_b = int(value) if value.is_integer() else value
    quant = int(quant_match.group(1)) if quant_match else 4
    # Approximate bits per weight for each quant level
    bpw = {2: 2.5, 3: 3.5, 4: 4.5, 5: 5.5, 6: 6.5, 8: 8.5}.get(quant, 4.5)
    return {"name": f"{params_b}B Q{quant}", "size_gb": round(params_b * bpw / 8, 1)}


def simulate_model_fit(model_query):
    """Predict if a model will fit BEFORE downloading.

    Looks the query up in ``MODELS`` (by id or name, ignoring ``-``, ``_`` and
    spaces). If it is not a known model, the size is estimated from a
    parameter count and optional quant in the text (e.g. ``"70b q4"``). Then
    prints a verdict, a memory bar, a performance table, a context-length
    table and a suggested next step.

    Args:
        model_query: known model id/name, or free text with a size like ``"70b q4"``.
    """
    specs = get_machine_specs()
    metal = get_metal_gpu_stats()

    gpu_total = metal.get("total_mb") or specs["gpu_total_mb"]
    gpu_used = metal.get("alloc_mb", 0)
    gpu_free = max(0, gpu_total - gpu_used)

    # Find in known models
    model_id = None
    model_info = None
    query = model_query.lower().replace("-", "").replace("_", "").replace(" ", "")
    if query:  # an empty query would otherwise match the first known model
        for mid, m in MODELS.items():
            mid_clean = mid.lower().replace("-", "").replace("_", "")
            name_clean = m["name"].lower().replace("-", "").replace("_", "").replace(" ", "")
            if query in mid_clean or query in name_clean:
                model_id = mid
                model_info = m
                break

    if not model_info:
        model_info = _estimate_model_from_query(model_query)
        if not model_info:
            console.print(f"\n [red]Unknown model: {model_query}[/]")
            console.print(f" [dim]Known: {', '.join(MODELS.keys())} or '70b q4'[/]\n")
            return

    name = model_info["name"]
    size_gb = model_info["size_gb"]
    size_mb = int(size_gb * 1024)
    kv_per_1k = max(2, int(size_gb * 0.4))  # MB per 1K ctx

    fits_gpu = size_mb < gpu_total
    fits_free = size_mb < gpu_free
    if fits_gpu:
        base_tps = min(120, max(1, int(49 * 12 / max(1, size_gb))))
    else:
        base_tps = max(1, int(10 * 16 / max(1, size_gb)))

    # Render
    console.clear()
    console.print()

    if fits_free:
        console.print(f" [green bold]✓ {name} WILL FIT[/] · {size_gb}GB model · {gpu_free // 1024}GB free")
    elif fits_gpu:
        console.print(f" [yellow bold]⚠ {name} TIGHT FIT[/] · {size_gb}GB · close apps first")
    else:
        console.print(f" [red bold]✗ {name} WON'T FIT[/] · {size_gb}GB model · {gpu_total // 1024}GB limit")

    console.print(f" [dim]{specs['chip']} · {specs['ram_gb']}GB RAM · GPU budget: {gpu_total // 1024}GB[/]\n")

    # Memory bar
    bw = 60
    mb = int(min(1.0, size_mb / gpu_total) * bw) if gpu_total else 0
    ub = int(min(1.0, gpu_used / gpu_total) * bw) if gpu_total else 0
    fb = max(0, bw - mb - ub)
    console.print(f" GPU Memory: [cyan]{'█' * ub}[/][{'green' if fits_gpu else 'red'}]{'█' * mb}[/][dim]{'░' * fb}[/]")
    console.print(f" [cyan]■[/] used:{gpu_used // 1024}G [{'green' if fits_gpu else 'red'}]■[/] model:{size_gb}G [dim]░[/] free:{max(0, gpu_total - gpu_used - size_mb) // 1024}G\n")

    # Performance
    perf = Table(show_header=True, header_style="bold", border_style="dim", padding=(0, 1))
    perf.add_column("", width=18)
    perf.add_column("Value", width=16)
    perf.add_column("", width=38)
    perf.add_row("Model", f"{size_gb} GB", "Fits GPU" if fits_gpu else "[red]Exceeds GPU → swap[/]")
    perf.add_row("Compute", "GPU" if fits_gpu else "[red]CPU[/]", "All layers on GPU" if fits_gpu else "[red]5-10x slower[/]")
    perf.add_row("Speed", f"~{base_tps} tok/s", "" if fits_gpu else "[red]swap thrashing[/]")
    perf.add_row("Download", f"~{max(1, int(size_gb * 12))}s", f"at 100MB/s ({size_gb}GB)")
    console.print(perf)
    console.print()

    # Context table
    ct = Table(title="Context Length vs Memory", show_header=True, header_style="bold", border_style="dim", padding=(0, 1))
    ct.add_column("Context", width=8)
    ct.add_column("KV Cache", width=8, justify="right")
    ct.add_column("Total", width=8, justify="right")
    ct.add_column("Verdict", width=25)
    for ctx in [4096, 8192, 32768, 65536, 131072]:
        kv = kv_per_1k * (ctx // 1024)
        tot = size_mb + kv
        h = gpu_total - tot  # headroom in MB; negative means over budget
        s = "[green]✓ fits[/]" if h > 2000 else "[yellow]⚠ tight[/]" if h > 0 else f"[red]✗ OOM ({-h // 1024}GB over)[/]"
        ct.add_row(f"{ctx // 1024}K", f"{kv}M" if kv < 1024 else f"{kv / 1024:.1f}G", f"{tot / 1024:.1f}G", s)
    console.print(ct)

    console.print()
    if not fits_gpu:
        # Suggest the largest known model that does fit the GPU budget.
        for mid, m in sorted(MODELS.items(), key=lambda x: x[1]["size_gb"], reverse=True):
            if m["size_gb"] * 1024 < gpu_total:
                console.print(f" [green]→ Try:[/] {m['name']} ({m['size_gb']}GB) — {m.get('description', '')}")
                break
        console.print(f" [green]→ Or:[/] sudo sysctl iogpu.wired_limit_mb={int(specs['ram_gb'] * 1024 * 0.9)}")
    elif not fits_free:
        console.print(f" [yellow]→[/] localcoder --cleanup [dim](free {gpu_used // 1024}GB)[/]")
    else:
        console.print(f" [green]→[/] localcoder{' -m ' + model_id if model_id else ''} [dim](ready to run)[/]")
    console.print()
2407
+
2408
+
2409
def recommend_model(ram_gb):
    """Pick the suggested model for a machine with ``ram_gb`` GB of RAM.

    Returns:
        ``(model_id, reason)``: the id of the recommended model and a short
        human-readable justification.
    """
    # (minimum RAM in GB, model id, reason), checked from largest to smallest
    tiers = (
        (48, "gemma4-26b", "26B Q4_K_M (best quality) + vision + 128K context. Plenty of headroom."),
        (36, "qwen35b-a3b", "Qwen 3.5 35B-A3B Q3_K_XL — best coding quality at 36GB+."),
        (24, "gemma4-26b", "Gemma 4 26B Q3_K_XL — 49 tok/s, best overall for 24GB. Also try Qwen 35B Q2."),
        (16, "gemma4-e4b", "E4B is the sweet spot for 16GB. Audio + image + code, 57 tok/s."),
        (8, "qwen35-4b", "Qwen 3.5 4B — ultrafast at 50 tok/s, only 2.7GB GPU."),
    )
    for min_ram, model_id, reason in tiers:
        if ram_gb >= min_ram:
            return model_id, reason
    return "gemma4-e2b", "E2B is the only option under 8GB."
2423
+
2424
+
2425
def can_run_simultaneously(ram_gb, model1_gb, model2_gb):
    """Return True when two models together stay under the GPU memory limit.

    The limit is taken as 67% of ``ram_gb`` (approximate Metal share of
    unified memory); the combined size must be strictly below it.
    """
    combined_gb = model1_gb + model2_gb
    metal_budget_gb = ram_gb * 0.67
    return combined_gb < metal_budget_gb
2429
+
2430
+
2431
def stop_conflicting_backends(target_backend):
    """Stop other backends to free GPU memory (best effort, never raises on failure).

    Args:
        target_backend: ``"ollama"`` stops a running llama-server;
            ``"llamacpp"`` asks a running Ollama to unload its models.
            Any other value does nothing.
    """
    if target_backend == "ollama":
        # Kill llama-server if running (frees GPU for Ollama)
        if check_backend_running("llamacpp"):
            console.print(" [yellow]Stopping llama-server to free GPU for Ollama...[/]")
            try:
                subprocess.run(["pkill", "-f", "llama-server"], timeout=5)
                time.sleep(2)
            except (OSError, subprocess.SubprocessError):
                pass  # pkill missing or timed out; nothing more we can do
    elif target_backend == "llamacpp":
        # Unload Ollama models to free GPU
        if check_backend_running("ollama"):
            console.print(" [yellow]Unloading Ollama models to free GPU...[/]")
            try:
                models = get_running_models("ollama")
            except Exception:
                models = []
            for m in models:
                # keep_alive=0 asks Ollama to unload the model immediately
                req = urllib.request.Request(
                    "http://127.0.0.1:11434/api/generate",
                    data=json.dumps({"model": m, "keep_alive": 0}).encode(),
                    headers={"Content-Type": "application/json"},
                )
                try:
                    with urllib.request.urlopen(req, timeout=5):
                        pass  # response body is not needed; just close it
                except Exception:
                    # One failed unload must not prevent unloading the rest.
                    continue
            if models:
                time.sleep(2)
2459
+
2460
+
2461
def start_ollama_serve():
    """Ensure Ollama is serving.

    Returns True if Ollama is already running or comes up within about 10
    seconds of launching ``ollama serve``; False if it cannot be launched or
    does not respond in time.
    """
    if check_backend_running("ollama"):
        return True

    # Prefer the binary located during backend discovery (it may live outside
    # PATH, e.g. under /opt/homebrew/bin); fall back to a plain PATH lookup.
    binary = str(BACKENDS.get("ollama", {}).get("binary") or "ollama")
    try:
        subprocess.Popen([binary, "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
    except OSError:
        return False

    # Poll rather than sleeping once: return as soon as the server answers,
    # and give slow cold starts more than 2 seconds.
    deadline = time.monotonic() + 10
    while time.monotonic() < deadline:
        time.sleep(0.5)
        try:
            if check_backend_running("ollama"):
                return True
        except Exception:
            return False
    return False