localcoder 0.1.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- localcoder/__init__.py +2 -0
- localcoder/__main__.py +2 -0
- localcoder/agent.py +35 -0
- localcoder/backends.py +2470 -0
- localcoder/bench.py +335 -0
- localcoder/cli.py +827 -0
- localcoder/gemma4coder_display.py +583 -0
- localcoder/setup.py +321 -0
- localcoder/tui.py +276 -0
- localcoder/voice.py +187 -0
- localcoder-0.1.0.dist-info/METADATA +187 -0
- localcoder-0.1.0.dist-info/RECORD +15 -0
- localcoder-0.1.0.dist-info/WHEEL +4 -0
- localcoder-0.1.0.dist-info/entry_points.txt +2 -0
- localcoder-0.1.0.dist-info/licenses/LICENSE +4 -0
localcoder/backends.py
ADDED
|
@@ -0,0 +1,2470 @@
|
|
|
1
|
+
"""Backend discovery, installation, and model management."""
|
|
2
|
+
import json, os, shutil, subprocess, sys, time, urllib.request, urllib.parse
|
|
3
|
+
from pathlib import Path
|
|
4
|
+
|
|
5
|
+
from rich.console import Console
|
|
6
|
+
from rich.panel import Panel
|
|
7
|
+
from rich.table import Table
|
|
8
|
+
from rich.progress import Progress, SpinnerColumn, TextColumn, BarColumn
|
|
9
|
+
|
|
10
|
+
# Module-wide Rich console used by the printing helpers below.
console = Console()

# ── Platform detection ──
# Home directory of the current user; the two directories below hang off it.
HOME = Path.home()
# ~/.localcoder
CONFIG_DIR = HOME / ".localcoder"
# ~/models
MODELS_DIR = HOME / "models"
# Platform flags derived from sys.platform.
IS_MAC = sys.platform == "darwin"
IS_LINUX = sys.platform == "linux"
# WSL is recognised by the substring "microsoft" in /proc/version (case-insensitive);
# a missing /proc/version is treated as empty text, i.e. "not WSL".
IS_WSL = IS_LINUX and "microsoft" in (Path("/proc/version").read_text().lower() if Path("/proc/version").exists() else "")
|
|
19
|
+
|
|
20
|
+
def _find_binary(name, extra_paths=None):
    """Locate an executable, preferring PATH over a list of fallback locations.

    Args:
        name: Executable name to look up with ``shutil.which``.
        extra_paths: Optional iterable of candidate paths, tried in order
            when the name is not on PATH.

    Returns:
        A ``Path``: the PATH hit, else the first existing candidate, else
        ``Path(name)`` as a placeholder that callers can test for existence.
    """
    on_path = shutil.which(name)
    if on_path:
        return Path(on_path)

    first_existing = next(
        (Path(candidate) for candidate in (extra_paths or ()) if Path(candidate).exists()),
        None,
    )
    if first_existing is not None:
        return first_existing

    # Nothing found anywhere: return the bare name so a later existence check fails.
    return Path(name)
|
|
29
|
+
|
|
30
|
+
# ── Known backends ──
|
|
31
|
+
# Registry of inference backends, keyed by backend id. Each entry carries:
#   name         - display name
#   default_port - port number associated with the backend
#   binary       - Path resolved once, at import time, by _find_binary()
#                  (PATH first, then the listed fallback locations)
#   install_cmd  - shell command string for installing the backend
BACKENDS = {
    "llamacpp": {
        "name": "llama.cpp",
        "default_port": 8089,
        "binary": _find_binary("llama-server", [
            HOME / ".unsloth/llama.cpp/llama-server",
            Path("/usr/local/bin/llama-server"),
        ]),
        "install_cmd": "curl -fsSL https://unsloth.ai/install.sh | sh",
    },
    "ollama": {
        "name": "Ollama",
        "default_port": 11434,
        "binary": _find_binary("ollama", [
            Path("/opt/homebrew/bin/ollama"),
            Path("/usr/local/bin/ollama"),
            HOME / ".local/bin/ollama",
        ]),
        # Install script on Linux; every other platform gets the Homebrew command.
        "install_cmd": "curl -fsSL https://ollama.com/install.sh | sh" if IS_LINUX else "brew install ollama",
    },
}
|
|
52
|
+
|
|
53
|
+
# ── Known models ──
|
|
54
|
+
# Catalogue of known models, keyed by model id. Fields used by the entries:
#   name         - display name (may end with a quantization tag such as "Q3_K_XL")
#   hf_repo      - Hugging Face repository id, or None when not set for this entry
#   hf_pattern   - glob pattern for the file(s) in hf_repo (only on entries with a repo)
#   size_gb      - size figure in GB
#   ram_required - RAM figure in GB (presumably the minimum system RAM - confirm against callers)
#   description  - one-line blurb
#   ollama_tag   - Ollama model tag, or None
#   backend      - key into BACKENDS
#   server_flags - command-line flag string (only on "llamacpp" entries)
MODELS = {
    "gemma4-26b": {
        "name": "Gemma 4 26B Q3_K_XL",
        "hf_repo": "unsloth/gemma-4-26B-A4B-it-GGUF",
        "hf_pattern": "*UD-Q3_K_XL*",
        "size_gb": 12,
        "ram_required": 16,
        "description": "Best quality on 24GB Mac. MoE, 49 tok/s, perfect tool calling.",
        "ollama_tag": "gemma4:26b",
        "backend": "llamacpp",
        "server_flags": "-ngl 99 -c 131072 -np 1 -fa on -ctk q4_0 -ctv q4_0 --no-warmup --jinja",
    },
    "qwen35b-a3b": {
        "name": "Qwen 3.5 35B-A3B Q2_K_XL",
        "hf_repo": "unsloth/Qwen3.5-35B-A3B-GGUF",
        "hf_pattern": "*UD-Q2_K_XL*",
        "size_gb": 11.3,
        "ram_required": 16,
        "description": "MoE coding beast. 49 tok/s, 256 experts, tool calling, vision.",
        "ollama_tag": None,
        "backend": "llamacpp",
        "server_flags": "-ngl 99 -c 32768 -np 1 -fa on -ctk q4_0 -ctv q4_0 --no-warmup --jinja --reasoning-budget 0",
    },
    "qwen35-4b": {
        "name": "Qwen 3.5 4B",
        "hf_repo": "unsloth/Qwen3.5-4B-GGUF",
        "hf_pattern": "*UD-Q4_K_XL*",
        "size_gb": 2.7,
        "ram_required": 8,
        "description": "Ultrafast at 50 tok/s. Great for quick tasks, only 2.7GB GPU.",
        "ollama_tag": None,
        "backend": "llamacpp",
        "server_flags": "-ngl 99 -c 32768 --jinja --reasoning-budget 0",
    },
    "gemma4-e4b": {
        "name": "Gemma 4 E4B",
        "hf_repo": None,
        "size_gb": 5.5,
        "ram_required": 8,
        "description": "Sweet spot for 16GB. Audio + image + code, 57 tok/s.",
        "ollama_tag": "gemma4:e4b",
        "backend": "ollama",
    },
    "gemma4-e2b": {
        "name": "Gemma 4 E2B",
        "hf_repo": None,
        "size_gb": 4,
        "ram_required": 8,
        "description": "Speed demon. 95 tok/s, basic tasks.",
        "ollama_tag": "gemma4:e2b",
        "backend": "ollama",
    },
    "qwen3.5-27b": {
        "name": "Qwen 3.5 27B",
        "hf_repo": None,
        "size_gb": 17,
        "ram_required": 24,
        "description": "Strong alternative. Dense 27B, good tool calling.",
        "ollama_tag": "qwen3.5:27b",
        "backend": "ollama",
    },
}
|
|
116
|
+
|
|
117
|
+
|
|
118
|
+
def _parse_footprint_mb(pid):
    """Return a process's memory footprint in MB via ``/usr/bin/footprint``.

    Only the first output line containing ``Footprint:`` is considered. The
    number after the marker is scaled by its unit (GB, MB or KB; a missing
    unit is treated as KB, an unrecognised one leaves the number unscaled).

    Args:
        pid: Process id (anything ``str()`` can render).

    Returns:
        Footprint in whole MB, or 0 when not on macOS, when no footprint line
        is present, or when the command or the parsing fails.
    """
    if not IS_MAC:
        return 0

    marker = "Footprint:"
    try:
        result = subprocess.run(
            ["/usr/bin/footprint", str(pid)],
            capture_output=True, text=True, timeout=5,
        )
        hit = next((row for row in result.stdout.splitlines() if marker in row), None)
        if hit is None:
            return 0

        fields = hit.split(marker)[1].strip().split()
        amount = float(fields[0])
        unit = fields[1] if len(fields) > 1 else "KB"

        if "GB" in unit:
            return int(amount * 1024)
        if "MB" in unit:
            return int(amount)
        if "KB" in unit:
            # Never report a live process as 0 MB.
            return max(1, int(amount / 1024))
        return int(amount)
    except Exception:
        # Best-effort probe: any failure is reported as "unknown" (0).
        return 0
|
|
142
|
+
|
|
143
|
+
|
|
144
|
+
def get_system_ram_gb():
    """Get total system RAM in GB (macOS, Linux, WSL).

    macOS reads ``sysctl -n hw.memsize`` (bytes). Every other platform reads
    the ``MemTotal:`` line of ``/proc/meminfo`` (kB).

    Returns:
        Total RAM as an int number of GB, or 0 when it cannot be determined.
    """
    try:
        if IS_MAC:
            out = subprocess.run(["sysctl", "-n", "hw.memsize"], capture_output=True, text=True, timeout=3)
            return int(out.stdout.strip()) // (1024**3)

        # Linux / WSL
        with open("/proc/meminfo") as f:
            for line in f:
                if line.startswith("MemTotal:"):
                    # MemTotal excludes memory reserved by firmware/kernel, so it
                    # is always a little below the installed size; round to the
                    # nearest GB instead of flooring (16 GB must not read as 15).
                    return round(int(line.split()[1]) / (1024 * 1024))
    except (OSError, ValueError, IndexError, subprocess.SubprocessError):
        # Probe is best-effort; fall through to "unknown". Deliberately not a
        # bare except so KeyboardInterrupt/SystemExit still propagate.
        pass
    return 0
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def get_machine_specs():
    """Get full machine specs: chip, cores, RAM, GPU memory breakdown.

    Every probe is best-effort: a failing command leaves the corresponding
    field at its default instead of raising.

    Returns:
        Dict with keys ``chip``, ``cpu_cores``, ``gpu_cores``, ``ram_gb``,
        ``gpu_total_mb``, ``gpu_used_mb``, ``gpu_free_mb``, ``gpu_processes``
        (list of ``{name, pid, rss_mb}``) and ``mem_pressure``.
    """
    # Failures expected from an external probe: missing/failed command, timeout,
    # or output that does not parse.
    probe_errors = (OSError, ValueError, IndexError, subprocess.SubprocessError)

    specs = {
        "chip": "Unknown",
        "cpu_cores": 0,
        "gpu_cores": 0,
        "ram_gb": get_system_ram_gb(),
        "gpu_total_mb": 0,
        "gpu_used_mb": 0,
        "gpu_free_mb": 0,
        "gpu_processes": [],  # list of {name, pid, rss_mb}
        "mem_pressure": "unknown",
    }

    if IS_MAC:
        # Chip name
        try:
            out = subprocess.run(
                ["sysctl", "-n", "machdep.cpu.brand_string"],
                capture_output=True, text=True, timeout=3,
            )
            brand = out.stdout.strip()
            if brand:
                # Keep the "Unknown" default rather than storing an empty string.
                specs["chip"] = brand
            if not brand or "Apple" not in brand:
                # Fallback for Apple Silicon
                out2 = subprocess.run(
                    ["system_profiler", "SPHardwareDataType"],
                    capture_output=True, text=True, timeout=5,
                )
                for line in out2.stdout.splitlines():
                    if "Chip" in line and ":" in line:
                        specs["chip"] = line.split(":", 1)[1].strip()
                        break
        except probe_errors:
            pass

        # CPU / GPU core counts
        try:
            out = subprocess.run(
                ["sysctl", "-n", "hw.ncpu"], capture_output=True, text=True, timeout=3,
            )
            specs["cpu_cores"] = int(out.stdout.strip())
        except probe_errors:
            pass
        try:
            out = subprocess.run(
                ["system_profiler", "SPDisplaysDataType"],
                capture_output=True, text=True, timeout=5,
            )
            for line in out.stdout.splitlines():
                if "Total Number of Cores" in line:
                    specs["gpu_cores"] = int(line.split(":")[-1].strip())
                    break
        except probe_errors:
            pass

        # Metal GPU budget — use real ioreg value, then check sysctl override
        # 1. Try ioreg for real Metal VRAM,totalMB
        try:
            import re as _re_ioreg
            _ioreg_out = subprocess.run(["ioreg", "-l"], capture_output=True, text=True, timeout=10)
            for _line in _ioreg_out.stdout.splitlines():
                if "VRAM,totalMB" in _line:
                    _m = _re_ioreg.search(r'"VRAM,totalMB"=(\d+)', _line)
                    if _m:
                        specs["gpu_total_mb"] = int(_m.group(1))
                        break
        except probe_errors:
            pass

        # 2. Check if user overrode with iogpu.wired_limit_mb
        if specs["gpu_total_mb"] == 0:
            try:
                out = subprocess.run(
                    ["sysctl", "-n", "iogpu.wired_limit_mb"],
                    capture_output=True, text=True, timeout=3,
                )
                custom_limit = int(out.stdout.strip())
                if custom_limit > 0:
                    specs["gpu_total_mb"] = custom_limit
            except probe_errors:
                pass

        # 3. Fallback to estimate
        if specs["gpu_total_mb"] == 0:
            specs["gpu_total_mb"] = int(specs["ram_gb"] * 1024 * 0.67)

        # Find GPU-heavy processes (llama-server, ollama, any ML inference)
        gpu_proc_names = ["llama-server", "ollama", "ollama_llama_server",
                          "mlx_lm", "whisper"]
        try:
            out = subprocess.run(
                ["ps", "axo", "pid,comm"],
                capture_output=True, text=True, timeout=3,
            )
            for line in out.stdout.splitlines()[1:]:  # [1:] skips the ps header row
                parts = line.split()
                if len(parts) < 2:
                    continue
                pid, comm = parts[0], parts[1]
                name = os.path.basename(comm)
                if not any(gp in name for gp in gpu_proc_names):
                    continue
                mem_mb = _parse_footprint_mb(pid)
                if mem_mb < 10:
                    # Fallback to RSS (ps reports kB)
                    try:
                        rss = subprocess.run(
                            ["ps", "-o", "rss=", "-p", pid],
                            capture_output=True, text=True, timeout=3,
                        )
                        if rss.stdout.strip():
                            mem_mb = int(rss.stdout.strip()) // 1024
                    except probe_errors:
                        pass

                # Only processes above 100 MB are reported.
                if mem_mb > 100:
                    specs["gpu_processes"].append({
                        "name": name, "pid": int(pid), "rss_mb": mem_mb,
                    })
        except probe_errors:
            pass

        specs["gpu_used_mb"] = sum(p["rss_mb"] for p in specs["gpu_processes"])
        specs["gpu_free_mb"] = max(0, specs["gpu_total_mb"] - specs["gpu_used_mb"])

        # Memory pressure
        try:
            out = subprocess.run(
                ["sysctl", "-n", "kern.memorystatus_vm_pressure_level"],
                capture_output=True, text=True, timeout=3,
            )
            level = int(out.stdout.strip())
            specs["mem_pressure"] = {0: "normal", 1: "warn", 2: "critical", 4: "critical"}.get(level, "unknown")
        except probe_errors:
            pass

    elif IS_LINUX:
        # Linux / WSL
        try:
            with open("/proc/cpuinfo") as f:
                specs["cpu_cores"] = sum(1 for line in f if line.startswith("processor"))
            with open("/proc/meminfo") as f:
                for line in f:
                    if line.startswith("MemAvailable:"):
                        avail_kb = int(line.split()[1])
                        specs["gpu_free_mb"] = avail_kb // 1024
        except probe_errors:
            pass

        # Check for NVIDIA GPU
        try:
            out = subprocess.run(
                ["nvidia-smi", "--query-gpu=name,memory.total,memory.used,memory.free",
                 "--format=csv,noheader,nounits"],
                capture_output=True, text=True, timeout=5,
            )
            if out.returncode == 0:
                # nvidia-smi prints one CSV row per GPU; report the first one so
                # multi-GPU hosts parse cleanly instead of raising.
                rows = out.stdout.strip().splitlines()
                fields = [field.strip() for field in rows[0].split(",")] if rows else []
                if len(fields) >= 4:
                    # Parse everything before assigning so a bad field (e.g.
                    # "[N/A]") cannot leave specs half-updated.
                    total_mb, used_mb, free_mb = int(fields[1]), int(fields[2]), int(fields[3])
                    specs["chip"] = fields[0]
                    specs["gpu_total_mb"] = total_mb
                    specs["gpu_used_mb"] = used_mb
                    specs["gpu_free_mb"] = free_mb
        except FileNotFoundError:
            # No nvidia-smi: treat all system RAM as the budget.
            specs["gpu_total_mb"] = specs["ram_gb"] * 1024
            specs["gpu_free_mb"] = specs["gpu_total_mb"]
        except probe_errors:
            # nvidia-smi exists but timed out or printed something unparseable:
            # keep the values gathered so far.
            pass

    return specs
|
|
328
|
+
|
|
329
|
+
|
|
330
|
+
def cleanup_gpu_memory(force=False):
    """Free GPU memory by unloading idle models and killing stale processes.

    Each unload and each kill is attempted independently, so one failure does
    not stop the remaining cleanup.

    Args:
        force: When true, also send ``kill`` to every process matched by
            ``pgrep -f llama-server``.

    Returns:
        Dict with ``ollama_unloaded`` (model names), ``processes_killed``
        (list of ``{pid, freed_mb}``) and ``freed_mb`` (sum of the RSS of the
        killed processes, in MB).
    """
    cleaned = {"ollama_unloaded": [], "processes_killed": [], "freed_mb": 0}

    # 1. Unload Ollama models (set keep_alive=0)
    if check_backend_running("ollama"):
        try:
            models = get_running_models("ollama")
        except Exception:
            # Best-effort: an unreachable/odd backend just means nothing to unload.
            models = []
        for m in models:
            try:
                request = urllib.request.Request(
                    "http://127.0.0.1:11434/api/generate",
                    data=json.dumps({"model": m, "keep_alive": 0}).encode(),
                    headers={"Content-Type": "application/json"},
                )
                # Context manager closes the HTTP response (and its socket).
                with urllib.request.urlopen(request, timeout=5):
                    pass
            except Exception:
                # Deliberately broad: skip this model, keep unloading the others.
                continue
            cleaned["ollama_unloaded"].append(m)

    # 2. Kill stale llama-server processes (only when force is set)
    if force:
        try:
            out = subprocess.run(
                ["pgrep", "-f", "llama-server"], capture_output=True, text=True, timeout=5,
            )
            pids = [p.strip() for p in out.stdout.splitlines() if p.strip()]
        except (OSError, subprocess.SubprocessError):
            pids = []

        for pid in pids:
            try:
                # Sample RSS (kB) before killing so freed memory can be reported.
                rss = subprocess.run(
                    ["ps", "-o", "rss=", "-p", pid],
                    capture_output=True, text=True, timeout=3,
                )
                mb = int(rss.stdout.strip()) // 1024 if rss.stdout.strip() else 0
                killed = subprocess.run(["kill", pid], timeout=3)
                pid_num = int(pid)
            except (OSError, ValueError, subprocess.SubprocessError):
                continue
            if killed.returncode != 0:
                # kill failed (already gone / not permitted): nothing was freed.
                continue
            cleaned["processes_killed"].append({"pid": pid_num, "freed_mb": mb})
            cleaned["freed_mb"] += mb

    # Give time for memory to be released
    if cleaned["ollama_unloaded"] or cleaned["processes_killed"]:
        time.sleep(2)

    return cleaned
|
|
378
|
+
|
|
379
|
+
|
|
380
|
+
def get_top_memory_processes(min_mb=80, limit=12):
    """Get the top memory-consuming processes, grouped by name.

    Memory is the resident set size reported by ``ps`` (the macOS
    ``footprint`` tool is not used here because it is too slow per process).

    Categorizes processes as:
    - 'ml': ML inference servers (llama-server, ollama)
    - 'system': System processes (WindowServer, kernel_task)
    - 'bloat': restartable system daemons listed in SYSTEM_RESTARTABLE
    - 'app': everything else (Chrome, Slack, etc.)

    Args:
        min_mb: Minimum RSS in MB for a process to be reported.
        limit: Maximum number of (grouped) entries returned.

    Returns:
        List of dicts sorted by ``mb`` descending, each with ``pid``, ``name``,
        ``mb``, ``category``, ``killable``, ``count`` and ``pids``. Empty when
        ``ps`` cannot be run or parsed.
    """
    SYSTEM_PROCS = {
        "WindowServer", "WindowManager", "kernel_task", "launchd",
        "mds", "mds_stores", "opendirectoryd", "fseventsd",
        "corebrightnessd", "bluetoothd", "nearbyd", "systemstats",
        "loginwindow", "Dock", "Finder", "SystemUIServer",
        "ControlCenter", "NotificationCenter", "Terminal", "iTerm2",
        "zsh", "bash", "sh",
    }
    # System procs safe to kill (macOS auto-restarts them lean, freeing bloated memory)
    # Maps name → description for the debloat wizard
    SYSTEM_RESTARTABLE = {
        "CoreLocationAgent": "Location services cache — often leaks to 8GB+",
        "CacheDeleteExtension": "Storage cleanup daemon — bloats during disk scans",
        "remindd": "Reminders sync daemon — known memory leak on macOS 15",
        "suggestd": "Siri suggestions indexer — heavy background ML",
        "photoanalysisd": "Photos face/scene ML analysis — runs after imports",
        "mediaanalysisd": "Media ML classifier — visual lookup, Live Text",
        "nsurlsessiond": "Background network downloads — iCloud sync cache",
        "cloudd": "iCloud Drive sync daemon — bloats with many files",
        "bird": "CloudKit/iCloud container daemon",
        "callservicesd": "FaceTime/phone call routing daemon",
        "SafariLaunchAgent": "Safari preload — keeps old pages in memory",
        "SoftwareUpdateNotificationManager": "macOS update checker — safe to kill",
        "com.apple.WebKit.Networking": "WebKit network process — cache bloat",
    }
    ML_PROCS = {
        "llama-server", "ollama", "ollama_llama_server",
        "mlx_lm", "whisper", "vllm", "tgi",
    }

    procs = []
    try:
        out = subprocess.run(
            ["ps", "-eo", "pid=,rss=,comm="], capture_output=True, text=True, timeout=5,
        )
        # Pre-filter by RSS so only plausible candidates are examined further.
        candidates = []
        for line in out.stdout.strip().splitlines():
            parts = line.split()
            if len(parts) < 3:
                continue
            # comm is the last column and may itself contain spaces.
            pid, rss_kb, comm = parts[0], parts[1], " ".join(parts[2:])
            try:
                rss_mb = int(rss_kb) // 1024
            except ValueError:
                continue
            if rss_mb < min_mb // 4:  # loose pre-filter
                continue
            candidates.append((pid, rss_mb, comm))

        # Largest first; examine 3x limit so grouping still leaves enough rows.
        candidates.sort(key=lambda x: x[1], reverse=True)
        candidates = candidates[:limit * 3]

        for pid, rss_mb, comm in candidates:
            # comm holds the executable (path) only, so take the basename of the
            # whole string. Splitting on whitespace first would truncate paths
            # with spaces ("/Applications/Visual Studio Code.app/..." -> "Visual").
            name = os.path.basename(comm) if comm else "?"

            if rss_mb < min_mb:
                continue

            # Categorize (substring match also covers exact matches)
            if any(ml in name for ml in ML_PROCS):
                category = "ml"
            elif name in SYSTEM_PROCS:
                category = "system"
            elif any(sr in name for sr in SYSTEM_RESTARTABLE):
                category = "bloat"
            else:
                category = "app"

            procs.append({
                "pid": int(pid),
                "name": name,
                "mb": rss_mb,
                "category": category,
                "killable": category != "system",
            })
    except (OSError, ValueError, subprocess.SubprocessError):
        # ps missing, timed out, or printed something unparseable: report
        # whatever was collected so far.
        pass

    # Merge processes that share a display name, e.g. all Chrome helpers under "Chrome".
    grouped = {}
    for p in procs:
        key = "Chrome" if ("Google" in p["name"] or "Chrome" in p["name"]) else p["name"]
        if key in grouped:
            grouped[key]["mb"] += p["mb"]
            grouped[key]["count"] += 1
            grouped[key]["pids"].append(p["pid"])
        else:
            # The first process seen supplies pid/category/killable for the group.
            grouped[key] = {**p, "name": key, "count": 1, "pids": [p["pid"]]}

    result = sorted(grouped.values(), key=lambda x: x["mb"], reverse=True)
    return result[:limit]
|
|
490
|
+
|
|
491
|
+
|
|
492
|
+
def print_machine_specs(specs=None):
    """Render a short Rich panel summarising hardware and GPU memory.

    Args:
        specs: Dict shaped like the return value of ``get_machine_specs()``;
            collected on demand when omitted.
    """
    if specs is None:
        specs = get_machine_specs()

    ram_gb = specs["ram_gb"]
    total_mb = specs["gpu_total_mb"]
    used_mb = specs["gpu_used_mb"]
    free_mb = specs["gpu_free_mb"]
    pressure = specs["mem_pressure"]

    # Free-memory colour bands: >14000 MB green, >8000 MB yellow, otherwise red.
    free_style = "red"
    if free_mb > 14000:
        free_style = "green"
    elif free_mb > 8000:
        free_style = "yellow"

    pressure_style = {"normal": "green", "warn": "yellow", "critical": "red"}.get(pressure, "dim")

    chip_line = f" [bold]{specs['chip']}[/] · {specs['cpu_cores']} CPU"
    if specs["gpu_cores"]:
        chip_line += f" · {specs['gpu_cores']} GPU cores"

    ram_line = f" RAM: [bold]{ram_gb}GB[/] total · Metal GPU budget: [bold]{total_mb // 1024}GB[/]"
    if pressure != "unknown":
        ram_line += f" · pressure: [{pressure_style}]{pressure}[/{pressure_style}]"

    vram_line = (
        f" GPU VRAM: [{free_style}]{free_mb // 1024}GB free[/{free_style}]"
        f" · {used_mb // 1024}GB used · {total_mb // 1024}GB total"
    )

    body = [chip_line, ram_line, vram_line]

    gpu_procs = specs["gpu_processes"]
    if gpu_procs:
        rendered = [f"[cyan]{proc['name']}[/] ({proc['rss_mb']//1024}GB)" for proc in gpu_procs]
        body.append(" GPU processes: " + ", ".join(rendered))

    panel = Panel(
        "\n".join(body),
        title="[bold]Machine Specs[/]",
        border_style="dim",
        padding=(0, 1),
    )
    console.print(panel)
|
|
536
|
+
|
|
537
|
+
|
|
538
|
+
def _detect_model_info(server_config, model_id=None):
    """Work out a model's display name, quantization tag and size.

    Catalogue data (``MODELS[model_id]``) is applied first; anything that can
    be derived from ``server_config["model_path"]`` then overrides it.

    Args:
        server_config: Mapping that may hold a ``model_path`` entry.
        model_id: Optional key into ``MODELS``.

    Returns:
        Dict with ``name``, ``quant`` and ``size_gb``; each is None when unknown.
    """
    info = {"name": None, "quant": None, "size_gb": None}

    # Catalogue lookup
    if model_id and model_id in MODELS:
        entry = MODELS[model_id]
        label = entry["name"]
        # Drop a trailing " Q…" quant suffix; labels without one come back whole.
        info["name"] = label.split(" Q")[0]
        info["size_gb"] = entry["size_gb"]
        # The first word shaped like "Q…_…" is the quant tag.
        info["quant"] = next(
            (word for word in label.split() if word.startswith("Q") and "_" in word),
            None,
        )

    model_path = server_config.get("model_path", "") or ""
    if not model_path:
        return info

    import re
    filename = os.path.basename(model_path)

    # Quant tag embedded in the file name (e.g., Q3_K_XL, Q4_K_M, Q8_0)
    quant_hit = re.search(r'(Q\d+_K(?:_[A-Z]+)?|Q\d+_\d+|IQ\d+_[A-Z]+)', filename, re.IGNORECASE)
    if quant_hit:
        info["quant"] = quant_hit.group(1).upper()

    # Model family + size from the file name; first matching family wins.
    families = (
        (r'gemma[-_]?4[-_]?(\d+[bB])', 'Gemma 4'),
        (r'qwen[-_]?3\.?5[-_]?(\d+[bB])', 'Qwen 3.5'),
        (r'llama[-_]?3[-_.]?(\d+[bB])', 'Llama 3'),
        (r'mistral[-_]?(\d+[bB])', 'Mistral'),
        (r'phi[-_]?(\d+)', 'Phi'),
    )
    for regex, family in families:
        hit = re.search(regex, filename, re.IGNORECASE)
        if hit:
            info["name"] = f"{family} {hit.group(1).upper()}"
            break

    # Size on disk, in GB with one decimal, when the file is present.
    if os.path.exists(model_path):
        try:
            info["size_gb"] = round(os.path.getsize(model_path) / (1024**3), 1)
        except OSError:
            pass

    return info
|
|
587
|
+
|
|
588
|
+
|
|
589
|
+
def _build_dashboard_layout(model_id=None):
    """Build every piece of the GPU health dashboard as Rich renderables.

    Gathers machine specs, the GPU health diagnosis, the top memory processes
    and swap usage, then lays them out as a status rule, header, model line,
    three status cards, a VRAM bar with legend, a process table, a fixes panel
    and a glossary.

    Args:
        model_id: Optional model key, forwarded to ``diagnose_gpu_health`` and
            ``_detect_model_info``.

    Returns:
        Tuple ``(status_bar, header, model_line, cards, vram_bar, vram_legend,
        table, fixes_panel, glossary, diag)``. ``model_line`` is None when no
        model name was detected; ``fixes_panel`` is None when there is nothing
        to suggest.
    """
    from rich.columns import Columns
    from rich.text import Text
    from rich.rule import Rule

    specs = get_machine_specs()
    diag = diagnose_gpu_health(model_id)
    top_procs = get_top_memory_processes(min_mb=80, limit=8)
    swap_mb = get_swap_usage_mb()

    # ── Status Bar (full-width colored line) ──
    status_map = {
        "healthy": ("green", "HEALTHY"),
        "degraded": ("yellow", "DEGRADED"),
        "critical": ("red", "CRITICAL"),
        "unknown": ("dim", "UNKNOWN"),
    }
    sc, sl = status_map.get(diag["status"], ("dim", "?"))
    status_bar = Rule(title=f"[bold {sc}] {sl} [/bold {sc}]", style=sc)

    # ── Header ──
    header = Text()
    header.append(f" {specs['chip']} · {specs['ram_gb']}GB RAM · ", style="bold")
    header.append(f"{specs.get('gpu_cores', '?')} GPU cores", style="bold")

    # ── Model Info Line ──
    model_info_obj = _detect_model_info(diag["server_config"], model_id)
    model_line = None
    if model_info_obj["name"]:
        parts = []
        parts.append(f"[bold cyan]{model_info_obj['name']}[/bold cyan]")
        if model_info_obj["quant"]:
            parts.append(f"[yellow]{model_info_obj['quant']}[/yellow]")
        if model_info_obj["size_gb"]:
            parts.append(f"[dim]{model_info_obj['size_gb']}GB[/dim]")
        model_line = Text.from_markup(" " + " · ".join(parts))

    # ── Status Cards (equal height, horizontal row) ──
    CARD_HEIGHT = 6  # content lines per card (excluding border)

    gpu_on = diag["on_gpu"]
    compute_lines = []
    if diag["server_config"].get("running"):
        icon = "[green]●[/]" if gpu_on else "[red]●[/]"
        compute_lines.append(f"{icon} {'GPU (Metal)' if gpu_on else 'CPU — SLOW!'}")
        compute_lines.append(f" Layers: {diag['gpu_layers']}/99")
        compute_lines.append(f" Util: {diag['gpu_util_pct']}%")
        compute_lines.append(f" Model: {diag['server_config'].get('footprint_mb', 0)} MB")
        if not gpu_on:
            compute_lines.append("[dim]GPU = 20x faster[/]")
            compute_lines.append("[dim]Use -ngl 99[/]")
    else:
        compute_lines.append("[dim]Server not running[/]")

    kv_lines = []
    kv_ok = diag["kv_quantized"]
    kv_icon = "[green]●[/]" if kv_ok else "[red]●[/]"
    kv_lines.append(f"{kv_icon} {'Quantized' if kv_ok else 'Full (2x mem!)'}")
    if diag["kv_type"]:
        kv_lines.append(f" Type: {diag['kv_type']}")
    kv_lines.append(f" Size: ~{diag['kv_cache_est_mb']} MB")
    kv_lines.append(f" Ctx: {diag['context_size'] // 1024}K")
    fa_icon = "[green]●[/]" if diag["flash_attn"] else "[yellow]●[/]"
    kv_lines.append(f"{fa_icon} FlashAttn: {'on' if diag['flash_attn'] else 'off'}")

    pressure_color = {"normal": "green", "warn": "yellow", "critical": "red"}.get(diag["mem_pressure"], "dim")
    swap_color = "red" if diag["swap_thrashing"] else "green"
    gpu_headroom = diag["gpu_total_mb"] - diag["gpu_alloc_mb"]
    hr_color = "green" if gpu_headroom > 2048 else "yellow" if gpu_headroom > 0 else "red"
    mem_lines = [
        f" Pressure: [{pressure_color}]{diag['mem_pressure']}[/{pressure_color}]",
        f" Swap: [{swap_color}]{swap_mb // 1024}GB[/{swap_color}]",
        f" GPU: {diag['gpu_alloc_mb'] // 1024}/{diag['gpu_total_mb'] // 1024}GB",
        f" Free: [{hr_color}]{gpu_headroom // 1024}GB[/{hr_color}]",
    ]
    if diag["swap_thrashing"]:
        mem_lines.append("[dim]Swap = 100x slower[/]")

    # Pad all cards to the same height
    for card_lines in (compute_lines, kv_lines, mem_lines):
        while len(card_lines) < CARD_HEIGHT:
            card_lines.append("")

    cards = Columns([
        Panel("\n".join(compute_lines), title="[bold]Compute[/]", border_style="cyan", width=26, padding=(0, 1)),
        Panel("\n".join(kv_lines), title="[bold]KV Cache[/]", border_style="cyan", width=26, padding=(0, 1)),
        Panel("\n".join(mem_lines), title="[bold]Memory[/]", border_style="cyan", width=26, padding=(0, 1)),
    ], padding=1)

    # ── VRAM Usage Bar ──
    # Budget falls back to 75% of system RAM when the diagnosis has no GPU total.
    gpu_budget_mb = diag["gpu_total_mb"] if diag["gpu_total_mb"] > 0 else (specs["ram_gb"] * 1024 * 75 // 100)
    model_mb = diag.get("model_size_mb", 0) or (diag["server_config"].get("footprint_mb", 0))
    kv_mb = diag["kv_cache_est_mb"]
    # "Apps" is whatever allocation is not explained by the model or its KV cache.
    apps_mb = max(0, diag["gpu_alloc_mb"] - model_mb - kv_mb)
    free_mb = max(0, gpu_budget_mb - model_mb - kv_mb - apps_mb)

    BAR_WIDTH = 50
    total_for_bar = max(1, gpu_budget_mb)  # guards the divisions below against 0
    seg_model = max(0, int(BAR_WIDTH * model_mb / total_for_bar))
    seg_kv = max(0, int(BAR_WIDTH * kv_mb / total_for_bar))
    seg_apps = max(0, int(BAR_WIDTH * apps_mb / total_for_bar))
    seg_free = max(0, BAR_WIDTH - seg_model - seg_kv - seg_apps)

    vram_bar = Text()
    vram_bar.append(" VRAM ", style="bold")
    vram_bar.append("\u2588" * seg_model, style="cyan")
    vram_bar.append("\u2588" * seg_kv, style="magenta")
    vram_bar.append("\u2588" * seg_apps, style="yellow")
    vram_bar.append("\u2591" * seg_free, style="dim")
    vram_bar.append(f" {gpu_budget_mb // 1024}GB", style="dim")

    vram_legend = Text.from_markup(
        " [cyan]\u2588[/] Model"
        f" ({model_mb // 1024}G)"
        " [magenta]\u2588[/] KV Cache"
        f" ({kv_mb // 1024}G)"
        " [yellow]\u2588[/] Apps"
        f" ({apps_mb // 1024}G)"
        " [dim]\u2591[/] Free"
        f" ({free_mb // 1024}G)"
    )

    # ── Process Table ──
    table = Table(
        show_header=True, header_style="bold",
        border_style="dim", padding=(0, 1), expand=False, width=82,
    )
    table.add_column("#", style="dim", width=3)
    table.add_column("Process", min_width=18)
    table.add_column("Memory", justify="right", width=8)
    table.add_column("Type", width=6)
    table.add_column("", min_width=14)

    # Loop invariants, built once.
    cat_style = {"ml": "[cyan]ML[/]", "app": "[yellow]app[/]", "system": "[dim]sys[/]", "bloat": "[red]bloat[/]"}
    # Kept outside the f-string below: a backslash escape inside an f-string
    # replacement field is a SyntaxError before Python 3.12.
    block_char = "\u2588"

    total_reclaimable = 0
    for i, p in enumerate(top_procs, 1):
        mb = p["mb"]
        name = p["name"]
        count = p.get("count", 1)
        label = f"{name}" + (f" \u00d7{count}" if count > 1 else "")

        cat = cat_style.get(p["category"], "[dim]?[/]")

        # One bar cell per 300 MB, clamped to 1..14 cells.
        bar_width = min(14, max(1, mb // 300))
        bar_color = "red" if mb > 2000 else "yellow" if mb > 500 else "green"
        bar = f"[{bar_color}]{block_char * bar_width}[/{bar_color}]"

        size_str = f"{mb / 1024:.1f}G" if mb >= 1024 else f"{mb}M"
        table.add_row(str(i), label, size_str, cat, bar)

        if p["category"] in ("app", "bloat") and p["killable"]:
            total_reclaimable += mb

    # ── Fixes ──
    fix_lines = []
    if diag["issues"]:
        for issue in diag["issues"]:
            fix_lines.append(f" [red]\u25cf[/] {issue}")
        fix_lines.append("")

    # Bloat fixes
    for p in top_procs:
        if p["category"] == "bloat" and p["mb"] > 500:
            freed = p["mb"] // 1024
            fix_lines.append(f" [green]\u2192[/] Kill {p['name']} [dim](~{freed}GB — auto-restarts lean)[/]")
    # App fixes
    for p in top_procs:
        if p["category"] == "app" and p["mb"] > 500:
            count = p.get("count", 1)
            freed = p["mb"] // 1024
            name = p["name"]
            if name == "Chrome":
                fix_lines.append(f" [green]\u2192[/] Close Chrome tabs [dim]({count} procs = ~{freed}GB)[/]")
            elif "claude" in name.lower():
                fix_lines.append(f" [green]\u2192[/] Close Claude windows [dim]({count} = ~{freed}GB)[/]")
            elif freed >= 1:
                fix_lines.append(f" [green]\u2192[/] Quit {name} [dim](~{freed}GB)[/]")

    if total_reclaimable > 2000:
        fix_lines.append("")
        fix_lines.append(f" [bold]Reclaimable: ~{total_reclaimable // 1024}GB[/] \u00b7 [dim]localcoder --cleanup[/]")

    fixes_panel = None
    if fix_lines:
        border = "red" if diag["status"] == "critical" else "yellow" if diag["status"] == "degraded" else "dim"
        fixes_panel = Panel("\n".join(fix_lines), title="[bold]Fixes[/]", border_style=border, padding=(0, 1))

    # ── Glossary (noob-friendly, using a borderless Rich Table for alignment) ──
    glossary_table = Table(show_header=False, show_edge=False, show_lines=False,
                           box=None, padding=(0, 1), expand=False)
    glossary_table.add_column("Term", style="dim", width=14, no_wrap=True)
    glossary_table.add_column("Description", style="dim")

    glossary_entries = [
        ("KV Cache", "Stores conversation history in GPU. Grows with context length.\n"
                     "128K ctx = 630MB (q4_0) or 1.2GB (f16). Use -ctk q4_0 to halve it."),
        ("Quantization", "Compresses model weights: Q3=small Q4=sweet spot Q8=best quality.\n"
                         "Rule: ~0.7GB per 1B params at Q4. 26B Q3 = 12GB, Q4 = 18GB."),
        ("GPU Layers", "-ngl 99 = all on GPU (fast). Partial offload = 5-10x slower."),
        ("Flash Attn", "-fa on = memory-efficient attention. Always enable it."),
        ("Swap", "RAM overflow to disk. 100x slower. Keep under 2GB."),
        ("MoE", "Mixture of Experts -- only 4B of 26B active per token."),
        ("Metal Limit", "macOS reserves ~25% RAM. Override: sudo sysctl iogpu.wired_limit_mb=N"),
    ]
    for term, desc in glossary_entries:
        glossary_table.add_row(term, desc)

    glossary = Panel(glossary_table, title="[bold dim]What do these mean?[/]", border_style="dim", padding=(0, 1))

    return status_bar, header, model_line, cards, vram_bar, vram_legend, table, fixes_panel, glossary, diag
|
|
800
|
+
|
|
801
|
+
|
|
802
|
+
def _build_status_bar(diag, specs):
    """Assemble the one-line status bar printed under the dashboard.

    Args:
        diag: Diagnostic dict. ``gpu_alloc_mb``, ``gpu_total_mb`` and
            ``mem_pressure`` are read with fallbacks of 0, 0 and "?".
        specs: Accepted for call compatibility; not read by this function.

    Returns:
        A ``rich.text.Text`` holding the GPU / SWAP / MEM readouts followed
        by the keyboard-shortcut hints.
    """
    from rich.text import Text

    swap_mb = get_swap_usage_mb()
    gpu_alloc = diag.get("gpu_alloc_mb", 0)
    gpu_total = diag.get("gpu_total_mb", 0)
    pressure = diag.get("mem_pressure", "?")

    # Choose a colour for each reading.
    pressure_style = {"normal": "green", "warn": "yellow", "critical": "red"}.get(pressure, "dim")

    if swap_mb > 4000:
        swap_style = "red"
    elif swap_mb > 1000:
        swap_style = "yellow"
    else:
        swap_style = "green"

    if gpu_alloc > gpu_total:
        gpu_style = "red"
    elif gpu_alloc > gpu_total * 0.8:
        gpu_style = "yellow"
    else:
        gpu_style = "green"

    label_style = "bold white on blue"
    key_style = "bold black on white"
    hint_style = "dim"

    # (text, style) pairs in display order; None means "no explicit style".
    segments = [
        (" GPU ", label_style),
        (" ", None),
        (f"{gpu_alloc // 1024}/{gpu_total // 1024}GB", gpu_style),
        (" ", None),
        (" SWAP ", label_style),
        (" ", None),
        (f"{swap_mb // 1024}GB", swap_style),
        (" ", None),
        (" MEM ", label_style),
        (" ", None),
        (f"{pressure}", pressure_style),
        (" ", None),
        # Shortcuts
        (" h ", key_style),
        (" health ", hint_style),
        (" c ", key_style),
        (" cleanup ", hint_style),
        (" d ", key_style),
        (" debloat ", hint_style),
        (" s ", key_style),
        (" simulate ", hint_style),
    ]

    bar = Text()
    for text, style in segments:
        bar.append(text, style=style)
    return bar
|
|
841
|
+
|
|
842
|
+
|
|
843
|
+
def print_health_dashboard(model_id=None):
    """Clear the terminal and render the GPU health dashboard.

    The dashboard is drawn on a console capped at 90 columns. While machine
    specs and diagnostics are gathered a spinner is shown; it is managed as a
    context manager so it is always stopped, even if gathering raises.

    Args:
        model_id: Optional model identifier, forwarded to
            ``diagnose_gpu_health`` and ``_build_dashboard_layout``.

    Returns:
        The dict returned by ``diagnose_gpu_health(model_id)``.
    """
    import shutil
    from rich.console import Console as _Console

    term_w, term_h = shutil.get_terminal_size()

    # Use a fixed-width console to prevent stretching
    out = _Console(width=min(90, term_w), highlight=False)

    # Phase 1: Loading spinner (stopped automatically when the block exits)
    out.clear()
    with out.status("[bold cyan] Scanning GPU, processes, server...[/]", spinner="dots"):
        specs = get_machine_specs()
        diag = diagnose_gpu_health(model_id)

    # Phase 2: Build layout
    (status_bar_top, header, model_line, cards, vram_bar, vram_legend,
     table, fixes_panel, glossary, _diag) = _build_dashboard_layout(model_id)

    # Phase 3: Clear and render all at once
    out.clear()

    out.print(status_bar_top)
    out.print(header)
    if model_line:
        out.print(model_line)
    out.print()
    out.print(cards)
    out.print(vram_bar)
    out.print(vram_legend)
    out.print()
    out.print(table)
    if fixes_panel:
        out.print(fixes_panel)

    # Glossary only if the terminal is tall enough
    if term_h > 42:
        out.print(glossary)

    # Status bar at bottom (no ANSI cursor tricks — just print it)
    status_bar_widget = _build_status_bar(diag, specs)
    out.print()
    out.print(status_bar_widget)
    out.print()

    return diag
|
|
894
|
+
|
|
895
|
+
|
|
896
|
+
def check_backend_installed(backend_id):
    """Return True when the backend's executable can be located.

    The configured ``binary`` path is tried first; if it does not exist, the
    executable's file name is looked up on PATH.
    """
    candidate = BACKENDS[backend_id]["binary"]
    return bool(candidate.exists() or shutil.which(candidate.name))
|
|
906
|
+
|
|
907
|
+
|
|
908
|
+
def check_backend_running(backend_id):
    """Return True if the backend answers on its default port.

    Sends a GET to ``http://127.0.0.1:<default_port>/v1/models`` with a
    2-second timeout. Any failure (refused connection, timeout, HTTP error
    status, malformed response) counts as "not running".

    Raises:
        KeyError: If ``backend_id`` is not in ``BACKENDS``.
    """
    port = BACKENDS[backend_id]["default_port"]
    try:
        url = f"http://127.0.0.1:{port}/v1/models"
        req = urllib.request.Request(url, headers={"Content-Type": "application/json"})
        with urllib.request.urlopen(req, timeout=2):
            return True
    except Exception:
        # Deliberately broad: this is a best-effort probe. Unlike a bare
        # ``except:``, KeyboardInterrupt/SystemExit still propagate.
        return False
|
|
919
|
+
|
|
920
|
+
|
|
921
|
+
def get_running_models(backend_id):
    """Return the model ids reported by a running backend.

    Queries ``http://127.0.0.1:<default_port>/v1/models`` (2-second timeout)
    and collects the ``id`` of every entry under the response's ``data`` key;
    entries without an ``id`` contribute an empty string.

    Returns:
        A list of model id strings, or ``[]`` if the request or the JSON
        decoding fails for any reason.

    Raises:
        KeyError: If ``backend_id`` is not in ``BACKENDS``.
    """
    port = BACKENDS[backend_id]["default_port"]
    try:
        url = f"http://127.0.0.1:{port}/v1/models"
        req = urllib.request.Request(url, headers={"Content-Type": "application/json"})
        with urllib.request.urlopen(req, timeout=2) as resp:
            data = json.loads(resp.read())
        return [m.get("id", "") for m in data.get("data", [])]
    except Exception:
        # Best-effort query; KeyboardInterrupt/SystemExit are no longer
        # swallowed as they were by the previous bare ``except:``.
        return []
|
|
933
|
+
|
|
934
|
+
|
|
935
|
+
def discover_all():
    """Survey every entry in ``BACKENDS``.

    A backend is only probed for liveness when it is installed, and only
    asked for its models when it is running.

    Returns:
        One dict per backend with keys ``id``, ``name``, ``installed``,
        ``running``, ``models`` and ``port``.
    """
    found = []
    for backend_id, spec in BACKENDS.items():
        is_installed = check_backend_installed(backend_id)

        is_running = False
        if is_installed:
            is_running = check_backend_running(backend_id)

        model_ids = []
        if is_running:
            model_ids = get_running_models(backend_id)

        entry = {
            "id": backend_id,
            "name": spec["name"],
            "installed": is_installed,
            "running": is_running,
            "models": model_ids,
            "port": spec["default_port"],
        }
        found.append(entry)
    return found
|
|
951
|
+
|
|
952
|
+
|
|
953
|
+
def install_backend(backend_id):
    """Run a backend's install command and refresh its binary path.

    The backend's ``install_cmd`` is executed through ``bash -c`` with a
    600-second timeout. On a zero exit status the ``binary`` entry in
    ``BACKENDS`` is re-resolved with ``_find_binary``, passing the previous
    path as an extra location to try.

    Returns:
        True if the install command exited with status 0, otherwise False.
    """
    spec = BACKENDS[backend_id]
    console.print(f"\n [bold]Installing {spec['name']}...[/]")
    console.print(f" [dim]{spec['install_cmd']}[/]\n")

    completed = subprocess.run(["bash", "-c", spec["install_cmd"]], timeout=600)
    if completed.returncode != 0:
        return False

    # Re-discover binary path after install
    if backend_id == "llamacpp":
        executable = "llama-server"
    else:
        executable = "ollama"
    spec["binary"] = _find_binary(executable, [spec["binary"]])
    return True
|
|
967
|
+
|
|
968
|
+
|
|
969
|
+
def download_model_hf(model_id):
    """Download a model from HuggingFace into ``MODELS_DIR / model_id``.

    ``huggingface_hub.snapshot_download`` is used when importable; otherwise
    the ``huggingface-cli download`` command is run with a 1800-second
    timeout. A model's optional ``hf_pattern`` is a comma-separated list of
    file patterns; both paths receive it as individual patterns.

    Args:
        model_id: Key into ``MODELS``.

    Returns:
        The local directory as a string on success, or None when the model
        has no ``hf_repo``, the CLI is unavailable, or the CLI exits non-zero.

    Raises:
        KeyError: If ``model_id`` is not in ``MODELS``.
    """
    m = MODELS[model_id]
    if not m.get("hf_repo"):
        console.print(f" [red]No HuggingFace repo for {model_id}[/]")
        return None

    local_dir = MODELS_DIR / model_id
    console.print(f"\n [bold]Downloading {m['name']}...[/]")
    console.print(f" [dim]From: {m['hf_repo']}[/]")
    console.print(f" [dim]To: {local_dir}[/]")
    console.print(f" [dim]Size: ~{m['size_gb']} GB[/]\n")

    pattern = m.get("hf_pattern")
    patterns = pattern.split(",") if pattern else None

    try:
        from huggingface_hub import snapshot_download
        snapshot_download(
            repo_id=m["hf_repo"],
            local_dir=str(local_dir),
            allow_patterns=patterns,
        )
        return str(local_dir)
    except ImportError:
        # Fallback to CLI
        cmd = ["huggingface-cli", "download", m["hf_repo"], "--local-dir", str(local_dir)]
        if patterns:
            # --include takes each pattern as its own argument; passing the
            # raw comma-joined string would be treated as a single glob.
            cmd += ["--include", *patterns]
        try:
            r = subprocess.run(cmd, timeout=1800)
        except FileNotFoundError:
            console.print(" [red]Neither the huggingface_hub package nor huggingface-cli is available[/]")
            return None
        return str(local_dir) if r.returncode == 0 else None
|
|
997
|
+
|
|
998
|
+
|
|
999
|
+
def download_model_ollama(model_id):
    """Fetch a model with ``ollama pull``.

    Returns:
        False when the model has no ``ollama_tag``; otherwise whether
        ``ollama pull <tag>`` (1800-second timeout) exited with status 0.
    """
    tag = MODELS[model_id].get("ollama_tag")
    if not tag:
        return False

    console.print(f"\n [bold]Pulling {tag} via Ollama...[/]")
    completed = subprocess.run(["ollama", "pull", tag], timeout=1800)
    return completed.returncode == 0
|
|
1008
|
+
|
|
1009
|
+
|
|
1010
|
+
def find_model_file(model_id):
    """Locate the GGUF weights file for a model.

    Search order:
      1. ``MODELS_DIR / model_id``
      2. the model's HuggingFace hub cache directory, when it has an
         ``hf_repo``

    The cache is consulted whenever the local directory yields nothing, which
    includes a local directory that exists but holds no GGUF file (for example
    after an interrupted download). Files whose name contains ``mmproj`` are
    skipped. When a directory holds several candidates the lexicographically
    first path is returned, so the result does not depend on filesystem
    enumeration order.

    Returns:
        The file path as a string, or None if nothing was found.
    """
    search_dirs = [MODELS_DIR / model_id]

    hf_repo = MODELS.get(model_id, {}).get("hf_repo")
    if hf_repo:
        cache_dir = HOME / ".cache/huggingface/hub"
        search_dirs.append(cache_dir / f"models--{hf_repo.replace('/', '--')}")

    for directory in search_dirs:
        if not directory.exists():
            continue
        candidates = sorted(
            f for f in directory.rglob("*.gguf") if "mmproj" not in f.name
        )
        if candidates:
            return str(candidates[0])
    return None
|
|
1030
|
+
|
|
1031
|
+
|
|
1032
|
+
def find_mmproj_file(model_id):
    """Locate the vision projector (mmproj) GGUF file for a model.

    Looks in ``MODELS_DIR / model_id`` first and then, if the model has an
    ``hf_repo``, in its HuggingFace hub cache directory. Only regular files
    with a ``.gguf`` suffix are accepted, so directories and download
    sidecar files (``*.lock``, ``*.metadata``, partial downloads) whose name
    happens to contain ``mmproj`` are never returned.

    Returns:
        The file path as a string, or None if no projector was found.
    """
    search_dirs = [MODELS_DIR / model_id]

    # Also check HF cache
    m = MODELS.get(model_id, {})
    if m.get("hf_repo"):
        cache_dir = HOME / ".cache/huggingface/hub" / f"models--{m['hf_repo'].replace('/', '--')}"
        search_dirs.append(cache_dir)

    for d in search_dirs:
        if not d.exists():
            continue
        # sorted() makes the pick independent of filesystem enumeration order
        for f in sorted(d.rglob("*mmproj*")):
            if f.suffix.lower() == ".gguf" and f.is_file():
                return str(f)
    return None
|
|
1048
|
+
|
|
1049
|
+
|
|
1050
|
+
def start_llama_server(model_id, port=8089):
    """Launch llama-server for a model and wait until it answers /health.

    The command line is built from the model's ``server_flags`` (default
    ``-ngl 99 -c 32768 --jinja``), plus ``--mmproj <file>`` when a projector
    is found or ``--no-mmproj`` otherwise. The health endpoint is polled
    once per second for up to 60 attempts.

    Args:
        model_id: Key into ``MODELS``; also used to locate the model files.
        port: TCP port passed to llama-server and used for the health check.

    Returns:
        The ``subprocess.Popen`` handle once the health endpoint responds, or
        None if the model file or binary is missing or the server never
        became ready (in which case the child process is terminated).
    """
    m = MODELS.get(model_id, {})
    model_file = find_model_file(model_id)
    if not model_file:
        console.print(f" [red]Model file not found for {model_id}[/]")
        return None

    binary = str(BACKENDS["llamacpp"]["binary"])
    if not os.path.exists(binary):
        binary = shutil.which("llama-server")
    if not binary:
        console.print(" [red]llama-server not found[/]")
        return None

    flags = m.get("server_flags", "-ngl 99 -c 32768 --jinja").split()
    cmd = [binary, "-m", model_file, "--port", str(port)] + flags

    # Add mmproj if available
    mmproj = find_mmproj_file(model_id)
    if mmproj:
        cmd += ["--mmproj", mmproj]
    else:
        cmd += ["--no-mmproj"]

    console.print(f" [dim]Starting: {' '.join(os.path.basename(c) if '/' in c else c for c in cmd[:6])}...[/]")

    proc = subprocess.Popen(cmd, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)

    # Wait for server
    health_url = f"http://127.0.0.1:{port}/health"
    for _ in range(60):
        try:
            req = urllib.request.Request(health_url)
            with urllib.request.urlopen(req, timeout=1):
                console.print(f" [green]✓ Server ready on port {port}[/]")
                return proc
        except Exception:
            # ``except Exception`` (not bare) so Ctrl-C interrupts the wait.
            if proc.poll() is not None:
                # The child already exited; further polling cannot succeed.
                break
            time.sleep(1)

    console.print(" [red]Server failed to start[/]")
    if proc.poll() is None:
        proc.kill()
    try:
        # Reap the child so it does not linger as a zombie.
        proc.wait(timeout=5)
    except subprocess.TimeoutExpired:
        pass
    return None
|
|
1092
|
+
|
|
1093
|
+
|
|
1094
|
+
def get_gpu_memory_info():
    """Estimate GPU memory totals from system RAM.

    On macOS the total is taken as 67% of system RAM, and the resident set
    size of the first ``llama-server`` process found by ``pgrep`` is
    subtracted to obtain the free figure. On other platforms total and free
    are both set to the full system RAM.

    Returns:
        Dict with ``total_mb``, ``free_mb`` and ``used_by_llama_mb``.
    """
    info = {"total_mb": 0, "free_mb": 0, "used_by_llama_mb": 0}
    ram = get_system_ram_gb()
    if IS_MAC:
        # Metal GPU limit is ~67% of unified memory
        info["total_mb"] = int(ram * 1024 * 0.67)
        info["free_mb"] = info["total_mb"]

        # Check if llama-server is using GPU
        try:
            out = subprocess.run(["pgrep", "-f", "llama-server"], capture_output=True, text=True)
            if out.stdout.strip():
                pid = out.stdout.strip().split()[0]
                rss = subprocess.run(["ps", "-o", "rss=", "-p", pid], capture_output=True, text=True)
                if rss.stdout.strip():
                    # ps reports RSS in KiB
                    info["used_by_llama_mb"] = int(rss.stdout.strip()) // 1024
                    info["free_mb"] = max(0, info["total_mb"] - info["used_by_llama_mb"])
        except Exception:
            # Best-effort probe: keep the RAM-based estimate on any failure,
            # but let KeyboardInterrupt/SystemExit through.
            pass
    else:
        info["total_mb"] = ram * 1024
        info["free_mb"] = info["total_mb"]
    return info
|
|
1118
|
+
|
|
1119
|
+
|
|
1120
|
+
def get_llama_server_config():
    """Inspect the first running llama-server process and report its settings.

    The process is found with ``pgrep -f llama-server``; its command line is
    read with ``ps`` and split on whitespace (so paths containing spaces are
    not reconstructed). Both the short flags and their long spellings are
    recognised. A flash-attention flag followed by ``off``, ``0`` or
    ``false`` is reported as disabled; any other form counts as enabled.
    Non-numeric values for numeric flags are ignored instead of aborting
    the scan.

    Returns:
        Dict with keys ``running``, ``pid``, ``model_path``, ``ngl``,
        ``n_ctx``, ``kv_quant``, ``flash_attn``, ``footprint_mb`` and
        ``flags`` (the raw argument list). Defaults are returned for anything
        that could not be determined.
    """
    config = {
        "running": False,
        "pid": None,
        "model_path": None,
        "ngl": 0,             # GPU layers (-ngl)
        "n_ctx": 0,           # Context size (-c)
        "kv_quant": None,     # KV cache quantization type (-ctk/-ctv)
        "flash_attn": False,  # Flash attention (-fa)
        "footprint_mb": 0,    # Process memory footprint
        "flags": [],
    }

    # Flag spelling -> config key, for flags that take an integer value.
    int_flags = {
        "-ngl": "ngl", "--gpu-layers": "ngl", "--n-gpu-layers": "ngl",
        "-c": "n_ctx", "--ctx-size": "n_ctx",
    }
    # Flag spelling -> config key, for flags that take a free-form value.
    text_flags = {
        "-ctk": "kv_quant", "--cache-type-k": "kv_quant",
        "-m": "model_path", "--model": "model_path",
    }
    flash_flags = ("-fa", "--flash-attn")
    flash_off_values = ("off", "0", "false")

    try:
        out = subprocess.run(
            ["pgrep", "-f", "llama-server"], capture_output=True, text=True,
        )
        pids = out.stdout.strip().splitlines()
        if not pids:
            return config
        config["running"] = True
        config["pid"] = int(pids[0].strip())

        # Get full command line
        cmd_out = subprocess.run(
            ["ps", "-o", "args=", "-p", str(config["pid"])],
            capture_output=True, text=True,
        )
        args = cmd_out.stdout.strip().split()
        config["flags"] = args

        # Parse flags
        for i, arg in enumerate(args):
            value = args[i + 1] if i + 1 < len(args) else None
            if arg in int_flags:
                if value is not None:
                    try:
                        config[int_flags[arg]] = int(value)
                    except ValueError:
                        pass  # non-numeric value: keep the default
            elif arg in text_flags:
                if value is not None:
                    config[text_flags[arg]] = value
            elif arg in flash_flags:
                # Bare "-fa" (older builds) or "-fa on/auto" => enabled.
                config["flash_attn"] = (
                    value is None or value.lower() not in flash_off_values
                )

        # Get process memory footprint
        if IS_MAC:
            config["footprint_mb"] = _parse_footprint_mb(config["pid"])
        else:
            try:
                rss = subprocess.run(
                    ["ps", "-o", "rss=", "-p", str(config["pid"])],
                    capture_output=True, text=True,
                )
                if rss.stdout.strip():
                    config["footprint_mb"] = int(rss.stdout.strip()) // 1024
            except Exception:
                pass

    except Exception:
        # Best-effort inspection: return whatever was gathered so far.
        pass

    return config
|
|
1183
|
+
|
|
1184
|
+
|
|
1185
|
+
def _gpu_field_to_float(text):
    """Parse one CSV field from a GPU tool; return None if not numeric."""
    try:
        return float(text.replace("%", "").strip())
    except ValueError:
        return None


def _collect_macos_gpu_stats(stats):
    """Fill ``stats`` from ``ioreg -l`` and ``pmset -g therm`` (macOS)."""
    import re

    out = subprocess.run(
        ["ioreg", "-l"], capture_output=True, text=True, timeout=10,
    )
    for line in out.stdout.splitlines():
        if "VRAM,totalMB" in line:
            m = re.search(r'"VRAM,totalMB"=(\d+)', line)
            if m:
                stats["total_mb"] = int(m.group(1))
        if "PerformanceStatistics" in line and "Alloc system memory" in line:
            m = re.search(r'"Alloc system memory"=(\d+)', line)
            if m:
                stats["alloc_mb"] = int(m.group(1)) // (1024 * 1024)
            m2 = re.search(r'"In use system memory"=(\d+)', line)
            if m2:
                stats["in_use_mb"] = int(m2.group(1)) // (1024 * 1024)
            m3 = re.search(r'"Device Utilization %"=(\d+)', line)
            if m3:
                stats["utilization_pct"] = int(m3.group(1))

    # Thermal state on macOS (approximate — no direct GPU temp on Apple Silicon)
    try:
        therm = subprocess.run(
            ["pmset", "-g", "therm"], capture_output=True, text=True, timeout=3,
        )
        # The key alone is not evidence of throttling (it is also printed
        # with the value 100); only a limit below 100% is treated as throttled.
        limit = re.search(r"CPU_Scheduler_Limit\s*=\s*(\d+)", therm.stdout)
        if limit and int(limit.group(1)) < 100:
            stats["temperature_c"] = 95  # approximate
    except Exception:
        pass


def _collect_nvidia_gpu_stats(stats):
    """Fill ``stats`` from nvidia-smi; return False only if it is not installed."""
    try:
        out = subprocess.run(
            ["nvidia-smi",
             "--query-gpu=name,memory.total,memory.used,memory.free,utilization.gpu,temperature.gpu,fan.speed,power.draw",
             "--format=csv,noheader,nounits"],
            capture_output=True, text=True, timeout=5,
        )
    except FileNotFoundError:
        return False
    except (OSError, subprocess.TimeoutExpired):
        return True

    if out.returncode != 0:
        return True
    rows = out.stdout.strip().splitlines()
    if not rows:
        return True

    # One row per GPU; report the first one.
    parts = [p.strip() for p in rows[0].split(",")]
    if len(parts) < 8:
        return True

    # Column indices follow the --query-gpu order above:
    # 0 name, 1 total, 2 used, 3 free, 4 util, 5 temp, 6 fan, 7 power
    stats["gpu_name"] = parts[0]
    int_columns = (
        ("total_mb", 1),
        ("alloc_mb", 2),
        ("in_use_mb", 2),
        ("utilization_pct", 4),
        ("temperature_c", 5),
        ("fan_pct", 6),
    )
    for key, index in int_columns:
        value = _gpu_field_to_float(parts[index])
        if value is not None:  # non-numeric fields keep their default
            stats[key] = int(value)
    power = _gpu_field_to_float(parts[7])
    if power is not None:
        stats["power_w"] = power
    return True


def _collect_rocm_gpu_stats(stats):
    """Fill total/allocated VRAM in ``stats`` from rocm-smi CSV output."""
    try:
        out = subprocess.run(
            ["rocm-smi", "--showmeminfo", "vram", "--csv"],
            capture_output=True, text=True, timeout=5,
        )
    except (OSError, subprocess.TimeoutExpired):
        return
    if out.returncode != 0:
        return

    for line in out.stdout.splitlines():
        fields = [f.strip() for f in line.split(",")]
        numbers = [int(f) for f in fields if f.isdigit()]
        # Header and device-name columns are non-numeric; the last two
        # numeric columns are read as total and used bytes.
        # NOTE(review): assumes rows look like "card0,<total B>,<used B>" —
        # confirm against the installed rocm-smi version.
        if len(numbers) >= 2:
            stats["total_mb"] = numbers[-2] // (1024 * 1024)
            stats["alloc_mb"] = numbers[-1] // (1024 * 1024)


def get_metal_gpu_stats():
    """Get real GPU stats — Metal on macOS, nvidia-smi / rocm-smi on Linux.

    On Linux ``rocm-smi`` is only consulted when ``nvidia-smi`` is not
    installed. Any value that cannot be read keeps its default, and tool
    failures (missing binary, timeout, unparsable output) never raise.

    Returns:
        Dict with ``total_mb``, ``alloc_mb``, ``in_use_mb``,
        ``free_vram_bytes``, ``utilization_pct``, ``temperature_c``,
        ``fan_pct``, ``power_w`` and ``gpu_name``. ``free_vram_bytes`` is not
        populated by this function and stays 0.
    """
    stats = {
        "total_mb": 0,
        "alloc_mb": 0,
        "in_use_mb": 0,
        "free_vram_bytes": 0,
        "utilization_pct": 0,
        "temperature_c": None,
        "fan_pct": None,
        "power_w": None,
        "gpu_name": None,
    }

    if IS_MAC:
        try:
            _collect_macos_gpu_stats(stats)
        except Exception:
            # Keep whatever was gathered before the failure.
            pass
    elif IS_LINUX:
        # nvidia-smi for NVIDIA GPUs; fall back to AMD via rocm-smi
        if not _collect_nvidia_gpu_stats(stats):
            _collect_rocm_gpu_stats(stats)

    return stats
|
|
1281
|
+
|
|
1282
|
+
|
|
1283
|
+
def get_disk_info():
    """Get disk space and model storage info.

    Three independent, best-effort measurements are taken, and a failure in
    one does not prevent the others:
      * total/free space of the filesystem holding ``HOME``
      * size of the HuggingFace hub cache and the GGUF models inside it
        (files whose name contains ``mmproj`` are skipped; each underlying
        file is listed once even if several snapshot entries point to it)
      * Docker disk usage from ``docker system df``

    Returns:
        Dict with ``disk_total_gb``, ``disk_free_gb``, ``hf_cache_gb``,
        ``docker_gb`` (all rounded to whole GB) and ``models``, a list of
        ``{name, size_gb, path}`` dicts sorted by size, largest first.
    """
    info = {
        "disk_total_gb": 0,
        "disk_free_gb": 0,
        "hf_cache_gb": 0,
        "models": [],  # list of {name, size_gb, path}
        "docker_gb": 0,
    }
    gib = 1024 ** 3

    # Disk space
    try:
        st = os.statvfs(HOME)
        info["disk_total_gb"] = round((st.f_blocks * st.f_frsize) / gib)
        info["disk_free_gb"] = round((st.f_bavail * st.f_frsize) / gib)
    except Exception:
        pass

    # HuggingFace cache total + individual GGUF models
    try:
        hf_cache = HOME / ".cache/huggingface/hub"
        if hf_cache.exists():
            total = 0
            # Sum the real files only; symlinks would double-count them
            for entry in hf_cache.rglob("*"):
                try:
                    if entry.is_file() and not entry.is_symlink():
                        total += entry.stat().st_size
                except OSError:
                    continue  # unreadable entry: skip it, keep scanning
            info["hf_cache_gb"] = round(total / gib)

            seen_paths = set()
            for gguf in hf_cache.rglob("*.gguf"):
                name = gguf.name
                if "mmproj" in name.lower():
                    continue
                try:
                    real = gguf.resolve()
                    size_gb = real.stat().st_size / gib
                except OSError:
                    continue
                real_path = str(real)
                if real_path in seen_paths:
                    continue
                seen_paths.add(real_path)
                info["models"].append({"name": name, "size_gb": round(size_gb, 1), "path": real_path})
            info["models"].sort(key=lambda x: x["size_gb"], reverse=True)
    except Exception:
        pass

    # Docker (if running)
    try:
        out = subprocess.run(["docker", "system", "df", "--format", "{{.Size}}"],
                             capture_output=True, text=True, timeout=3)
        if out.returncode == 0:
            # Suffix -> multiplier to GB; smaller units are ignored
            units = (("TB", 1024.0), ("GB", 1.0), ("MB", 1.0 / 1024))
            docker_gb = 0.0
            for line in out.stdout.strip().splitlines():
                line = line.strip().upper()
                for suffix, factor in units:
                    if suffix in line:
                        try:
                            docker_gb += float(line.replace(suffix, "")) * factor
                        except ValueError:
                            pass
                        break
            info["docker_gb"] = round(docker_gb)
    except Exception:
        pass

    return info
|
|
1339
|
+
|
|
1340
|
+
|
|
1341
|
+
def get_swap_usage_mb():
    """Get swap usage in MB.

    macOS: parses the ``used = <n>M`` field of ``sysctl -n vm.swapusage``.
    Elsewhere: ``SwapTotal - SwapFree`` from ``/proc/meminfo``.

    Returns:
        Swap in use as an int number of MB, or 0 if it cannot be determined.
    """
    try:
        if IS_MAC:
            import re

            out = subprocess.run(
                ["sysctl", "-n", "vm.swapusage"], capture_output=True, text=True, timeout=3,
            )
            # "total = 10240.00M used = 8538.06M free = 1701.94M"
            m = re.search(r'used\s*=\s*([\d.]+)M', out.stdout)
            if m:
                return int(float(m.group(1)))
        else:
            total = free = 0
            with open("/proc/meminfo") as f:
                for line in f:
                    # Values in /proc/meminfo are in kB
                    if line.startswith("SwapTotal:"):
                        total = int(line.split()[1]) // 1024
                    elif line.startswith("SwapFree:"):
                        free = int(line.split()[1]) // 1024
            return max(0, total - free)
    except Exception:
        pass
    return 0
|
|
1367
|
+
|
|
1368
|
+
|
|
1369
|
+
def diagnose_gpu_health(model_id=None):
    """Full GPU health diagnostic. Returns dict with status and recommendations.

    Checks:
    1. Is model running on GPU or CPU?
    2. Is KV cache optimized?
    3. Is context size appropriate?
    4. Is swap thrashing happening?
    5. Are flags optimal?

    Args:
        model_id: Optional key into ``MODELS``; when known, its ``size_gb``
            is used to fill ``model_size_mb``.

    Returns:
        Dict whose ``status`` is "healthy", "degraded", "critical" or
        "unknown" (server not running), with the collected measurements,
        a list of ``issues`` and a list of suggested ``fixes``.
    """
    # KV cache types that actually compress the cache. f16/bf16/f32 are full
    # precision, so "-ctk f16" must not be reported as quantized.
    quantized_kv_types = ("q4_0", "q4_1", "q5_0", "q5_1", "q8_0", "iq4_nl")

    diag = {
        "status": "unknown",  # "healthy", "degraded", "critical"
        "on_gpu": False,
        "gpu_layers": 0,
        "total_layers": 99,
        "kv_quantized": False,
        "kv_type": None,
        "flash_attn": False,
        "context_size": 0,
        "kv_cache_est_mb": 0,
        "model_size_mb": 0,
        "gpu_total_mb": 0,
        "gpu_alloc_mb": 0,
        "gpu_util_pct": 0,
        "swap_used_mb": 0,
        "swap_thrashing": False,
        "mem_pressure": "unknown",
        "issues": [],
        "fixes": [],
        "server_config": {},
    }

    # Get server config
    srv = get_llama_server_config()
    diag["server_config"] = srv

    if not srv["running"]:
        diag["status"] = "unknown"
        diag["issues"].append("llama-server not running")
        return diag

    # GPU layer offload
    diag["gpu_layers"] = srv["ngl"]
    diag["on_gpu"] = srv["ngl"] >= 90  # -ngl 99 means all on GPU
    diag["context_size"] = srv["n_ctx"]
    diag["flash_attn"] = srv["flash_attn"]

    if not diag["on_gpu"]:
        diag["issues"].append(f"Only {srv['ngl']} layers on GPU — model partially on CPU")
        diag["fixes"].append("Restart with -ngl 99 to offload all layers to GPU")

    # KV cache
    diag["kv_type"] = srv["kv_quant"]
    diag["kv_quantized"] = srv["kv_quant"] in quantized_kv_types
    if not diag["kv_quantized"]:
        diag["issues"].append("KV cache not quantized — using full precision (2x memory)")
        diag["fixes"].append("Add -ctk q4_0 -ctv q4_0 to quantize KV cache (saves ~50% KV memory)")

    if not diag["flash_attn"]:
        diag["issues"].append("Flash attention disabled — slower and more memory")
        diag["fixes"].append("Add -fa on to enable flash attention")

    # Estimate KV cache memory
    # For Gemma 4 26B: 5 global layers × 128K context × 2 (K+V) × hidden_dim
    # With q4_0: ~630MB. Without quantization: ~1.2GB
    if diag["context_size"] > 0:
        # Rough estimate: 128K ctx with q4_0 KV ≈ 630MB, without ≈ 1200MB
        ctx_ratio = diag["context_size"] / 131072
        if diag["kv_quantized"]:
            diag["kv_cache_est_mb"] = int(630 * ctx_ratio)
        else:
            diag["kv_cache_est_mb"] = int(1200 * ctx_ratio)

    # Model size
    if model_id and model_id in MODELS:
        diag["model_size_mb"] = int(MODELS[model_id]["size_gb"] * 1024)

    # GPU stats (Metal on macOS, vendor tools on Linux)
    metal = get_metal_gpu_stats()
    diag["gpu_total_mb"] = metal["total_mb"]
    diag["gpu_alloc_mb"] = metal["alloc_mb"]
    diag["gpu_util_pct"] = metal["utilization_pct"]

    # Swap check
    diag["swap_used_mb"] = get_swap_usage_mb()
    diag["swap_thrashing"] = diag["swap_used_mb"] > 4000  # >4GB swap = bad

    if diag["swap_thrashing"]:
        diag["issues"].append(f"Swap thrashing: {diag['swap_used_mb'] // 1024}GB in swap — major slowdown")
        diag["fixes"].append("Reduce context size (-c 32768) or use smaller quant to free GPU memory")

    # Memory pressure (this sysctl key is only expected to exist on macOS;
    # on failure the value stays "unknown")
    try:
        out = subprocess.run(
            ["sysctl", "-n", "kern.memorystatus_vm_pressure_level"],
            capture_output=True, text=True, timeout=3,
        )
        level = int(out.stdout.strip())
        diag["mem_pressure"] = {0: "normal", 1: "warn", 2: "critical", 4: "critical"}.get(level, "unknown")
    except Exception:
        pass

    if diag["mem_pressure"] == "critical":
        diag["issues"].append("Critical memory pressure — system may kill processes")
        diag["fixes"].append("Run: localcoder --cleanup")

    # Context size warnings
    if diag["context_size"] > 65536 and not diag["kv_quantized"]:
        diag["issues"].append(f"Large context ({diag['context_size']//1024}K) without KV quantization")
        diag["fixes"].append("Either reduce context or add -ctk q4_0 -ctv q4_0")

    # Check if Metal limit could be raised
    if IS_MAC and diag["gpu_total_mb"] > 0:
        ram_mb = get_system_ram_gb() * 1024
        current_limit = diag["gpu_total_mb"]
        max_safe = int(ram_mb * 0.90)  # leave 10% for system
        if current_limit < max_safe and diag["swap_thrashing"]:
            new_limit = max_safe
            diag["fixes"].append(
                f"Raise Metal GPU limit: sudo sysctl iogpu.wired_limit_mb={new_limit}"
                f" (current: {current_limit}MB, max safe: {new_limit}MB)"
            )

    # Overall status
    if not diag["issues"]:
        diag["status"] = "healthy"
    elif diag["swap_thrashing"] or not diag["on_gpu"] or diag["mem_pressure"] == "critical":
        diag["status"] = "critical"
    else:
        diag["status"] = "degraded"

    return diag
|
|
1501
|
+
|
|
1502
|
+
|
|
1503
|
+
def print_gpu_health(diag=None, model_id=None):
    """Render the GPU health diagnostic as a bordered panel.

    Args:
        diag: A diagnostic dict as produced by ``diagnose_gpu_health``; when
            None, one is computed for ``model_id``.
        model_id: Only used when ``diag`` is None.

    Returns:
        The diagnostic dict that was displayed.
    """
    if diag is None:
        diag = diagnose_gpu_health(model_id)

    badges = {
        "healthy": ("green", "✓ Healthy"),
        "degraded": ("yellow", "⚠ Degraded"),
        "critical": ("red", "✗ Critical"),
        "unknown": ("dim", "? Unknown"),
    }
    color, label = badges.get(diag["status"], ("dim", "?"))

    body = []

    # GPU offload status (only meaningful while the server is running)
    if diag["server_config"].get("running"):
        if diag["on_gpu"]:
            compute = "[green]●[/] GPU"
        else:
            compute = "[red]●[/] CPU (SLOW!)"
        body.append(f" Compute: {compute} · {diag['gpu_layers']} layers offloaded · GPU util: {diag['gpu_util_pct']}%")

    # KV cache
    if diag["kv_quantized"]:
        kv_dot = "[green]●[/]"
        kv_text = f"quantized ({diag['kv_type']})"
    else:
        kv_dot = "[red]●[/]"
        kv_text = "full precision (2x memory!)"
    body.append(
        f" KV cache: {kv_dot} {kv_text} · ~{diag['kv_cache_est_mb']}MB"
        f" · context: {diag['context_size'] // 1024}K tokens"
    )

    # Flash attention
    if diag["flash_attn"]:
        fa_dot, fa_state = "[green]●[/]", "on"
    else:
        fa_dot, fa_state = "[yellow]●[/]", "off"
    body.append(
        f" Flash attn: {fa_dot} {fa_state}"
        f" · footprint: {diag['server_config'].get('footprint_mb', 0)}MB"
    )

    # Memory
    swap_tint = "red" if diag["swap_thrashing"] else "green"
    pressure_tint = {"normal": "green", "warn": "yellow", "critical": "red"}.get(
        diag["mem_pressure"], "dim"
    )
    body.append(
        f" Memory: [{pressure_tint}]{diag['mem_pressure']}[/{pressure_tint}]"
        f" · swap: [{swap_tint}]{diag['swap_used_mb'] // 1024}GB[/{swap_tint}]"
        f" · GPU alloc: {diag['gpu_alloc_mb'] // 1024}GB / {diag['gpu_total_mb'] // 1024}GB"
    )

    # Issues, then suggested fixes, each preceded by a blank line
    if diag["issues"]:
        body.append("")
        body.extend(f" [red]✗[/] {issue}" for issue in diag["issues"])
    if diag["fixes"]:
        body.append("")
        body.extend(f" [green]→[/] {fix}" for fix in diag["fixes"])

    panel = Panel(
        "\n".join(body),
        title=f"[bold]GPU Health [{color}]{label}[/{color}][/]",
        border_style=color,
        padding=(0, 1),
    )
    console.print(panel)

    return diag
|
|
1565
|
+
|
|
1566
|
+
|
|
1567
|
+
def auto_optimize_server(model_id=None):
    """Restart the llama server with better flags when diagnostics call for it.

    Runs ``diagnose_gpu_health(model_id)``. If the reported status is not
    ``"healthy"`` and the server lacks GPU offload (``-ngl``), KV-cache
    quantization (``-ctk``) or flash attention (``-fa``), the running server
    is sent ``kill <pid>`` and relaunched via ``start_llama_server(model_id)``.
    When no restart is needed but swap thrashing is reported, a non-forced
    ``cleanup_gpu_memory`` pass is run instead.

    Args:
        model_id: Model to relaunch the server with. Without it no restart is
            attempted, even if the flags look suboptimal.

    Returns:
        True if the server was restarted with better flags, otherwise False.
    """
    diag = diagnose_gpu_health(model_id)

    if diag["status"] == "healthy":
        return False

    srv = diag["server_config"]
    # Flags are matched as substrings of the joined command line; build it once.
    flags_str = " ".join(srv.get("flags", []))

    # (diagnostic key, flag that would have enabled the feature)
    checks = (("on_gpu", "-ngl"), ("kv_quantized", "-ctk"), ("flash_attn", "-fa"))
    needs_restart = any(
        [not diag[key] and flag not in flags_str for key, flag in checks]
    )

    if needs_restart and model_id:
        console.print("\n [yellow]Server running with suboptimal flags — restarting with optimizations...[/]")
        # Kill current server (best effort: a failed kill must not abort the restart)
        if srv.get("pid"):
            try:
                subprocess.run(["kill", str(srv["pid"])], timeout=5)
                time.sleep(2)  # give the old process time to exit
            except (subprocess.SubprocessError, OSError):
                pass
        # Start with optimal flags
        proc = start_llama_server(model_id)
        if proc:
            console.print(" [green]✓ Server restarted with optimal GPU flags[/]")
            return True
        console.print(" [red]Failed to restart server[/]")

    # If swap thrashing, try to free memory without restart
    if diag["swap_thrashing"] and not needs_restart:
        console.print("\n [yellow]Swap thrashing detected — cleaning up GPU memory...[/]")
        cleanup_gpu_memory(force=False)

    return False
|
|
1613
|
+
|
|
1614
|
+
|
|
1615
|
+
# ── macOS Debloat categories for ML workloads ──
|
|
1616
|
+
# Catalogue of launchd services offered by debloat_wizard(), grouped by category.
# Each category maps to:
#   "name"     - heading shown in the wizard menu
#   "desc"     - one-line explanation shown under the heading
#   "safe"     - boolean flag (True for every category here; not read by the
#                wizard code in this part of the file — TODO confirm its consumer)
#   "services" - launchd service label -> human-readable description
DEBLOAT_CATEGORIES = {
    "ml_hogs": {
        "name": "ML & Analysis Daemons",
        "desc": "Apple's background ML that competes with your model for GPU",
        "safe": True,
        "services": {
            "com.apple.photoanalysisd": "Photos face/scene ML — uses GPU + 2-8GB RAM",
            "com.apple.mediaanalysisd": "Visual Lookup, Live Text ML — GPU heavy",
            "com.apple.suggestd": "Siri suggestions indexer — background ML",
            "com.apple.intelligenced": "Apple Intelligence (Sequoia) — GPU heavy",
            "com.apple.mlruntime": "Core ML runtime — shared GPU compute",
        },
    },
    "location_bloat": {
        "name": "Location & Sync Bloat",
        "desc": "Known memory leakers on macOS 14/15",
        "safe": True,
        "services": {
            "com.apple.CoreLocationAgent": "Location cache — leaks to 8GB+ (notorious)",
            "com.apple.remindd": "Reminders sync — memory leak on macOS 15",
            "com.apple.cloudd": "iCloud Drive sync — bloats with many files",
            "com.apple.bird": "CloudKit container daemon",
        },
    },
    "telemetry": {
        "name": "Telemetry & Analytics",
        "desc": "Crash reports, analytics, diagnostics — zero impact to disable",
        "safe": True,
        "services": {
            "com.apple.analyticsd": "Analytics collection",
            "com.apple.ReportCrash": "Crash report generation",
            "com.apple.spindump": "CPU sampling diagnostics",
            "com.apple.DiagnosticReportCleanup": "Diagnostic cleanup",
            "com.apple.ap.adprivacyd": "Ad privacy daemon",
            "com.apple.ap.adservicesd": "Ad services",
            "com.apple.triald": "A/B testing framework",
        },
    },
    "siri_ai": {
        "name": "Siri & Apple AI",
        "desc": "Siri, assistant, Apple Intelligence",
        "safe": True,
        "services": {
            "com.apple.Siri.agent": "Siri main service",
            "com.apple.assistantd": "Assistant daemon",
            "com.apple.parsec.fbf": "Siri search suggestions",
            "com.apple.tipsd": "Tips and suggestions",
            "com.apple.ScreenTimeAgent": "Screen time tracking",
        },
    },
}
|
|
1667
|
+
|
|
1668
|
+
|
|
1669
|
+
def debloat_wizard():
    """Interactive debloat wizard for ML workloads.

    Lists the categories in ``DEBLOAT_CATEGORIES``, lets the user pick some
    (``1,2``), all of them (``a``), a one-time kill of bloated processes
    (``k``), a restore (``r``) or quit (``q``). For the chosen categories every
    service is disabled and booted out through ``launchctl`` in both the user
    GUI domain and the system domain, currently bloated processes are sent
    SIGTERM, and the matching ``launchctl enable`` commands are written to
    ``CONFIG_DIR / "restore_debloat.sh"``.

    Enable commands already present in an existing restore script are kept, so
    running the wizard twice never loses the ability to undo the first run.

    Returns:
        None. All feedback is printed to the console.
    """
    import shutil
    import signal

    def _fmt_size(mb):
        # Sub-GB values are shown in MB; integer GB would render them as "0GB".
        return f"{mb / 1024:.1f}GB" if mb >= 1024 else f"{mb}MB"

    console.clear()
    console.print()
    console.print(" [bold]localcoder debloat wizard[/]")
    console.print(" [dim]Disable macOS services that compete with your model for GPU & memory[/]")
    console.print(" [dim]All changes are reversible — a restore script is saved automatically[/]\n")

    # Show current bloated processes
    top_procs = get_top_memory_processes(min_mb=200, limit=5)
    bloat_procs = [p for p in top_procs if p["category"] == "bloat"]
    if bloat_procs:
        console.print(" [yellow]Currently bloated system processes:[/]")
        for p in bloat_procs:
            size = _fmt_size(p["mb"])
            desc = SYSTEM_RESTARTABLE.get(p["name"], "")
            console.print(f" [red]●[/] {p['name']} [bold]{size}[/] [dim]{desc}[/]")
        console.print()

    # Show categories
    cats = list(DEBLOAT_CATEGORIES.items())
    for i, (key, cat) in enumerate(cats, 1):
        n_services = len(cat["services"])
        console.print(f" [bold]{i}.[/] {cat['name']} [dim]({n_services} services)[/]")
        console.print(f" [dim]{cat['desc']}[/]")
        for svc, desc in list(cat["services"].items())[:3]:
            console.print(f" [dim] · {svc.split('.')[-1]}: {desc}[/]")
        if n_services > 3:
            console.print(f" [dim] + {n_services - 3} more[/]")
        console.print()

    console.print(f" [bold]k.[/] Kill bloated processes now [dim](one-time, they may restart)[/]")
    console.print(f" [bold]a.[/] All categories [dim](maximum GPU headroom)[/]")
    console.print(f" [bold]r.[/] Restore all [dim](re-enable everything)[/]")
    console.print(f" [bold]q.[/] Quit\n")

    try:
        ans = input(" Choose (e.g. 1,2 or a): ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        return

    if ans == "q" or not ans:
        return

    if ans == "r":
        _debloat_restore()
        return

    if ans == "k":
        _kill_bloated_processes()
        return

    # Parse selection
    selected_cats = []
    if ans == "a":
        selected_cats = list(DEBLOAT_CATEGORIES.keys())
    else:
        for part in ans.replace(" ", "").split(","):
            try:
                idx = int(part) - 1
            except ValueError:
                continue
            # Ignore out-of-range numbers and repeated picks such as "1,1".
            if 0 <= idx < len(cats) and cats[idx][0] not in selected_cats:
                selected_cats.append(cats[idx][0])

    if not selected_cats:
        console.print(" [dim]No categories selected.[/]")
        return

    # Without launchctl every subprocess call below would raise FileNotFoundError.
    if shutil.which("launchctl") is None:
        console.print(" [red]launchctl not found — the debloat wizard only works on macOS.[/]")
        return

    # Confirm
    total_services = sum(len(DEBLOAT_CATEGORIES[c]["services"]) for c in selected_cats)
    cat_names = ", ".join(DEBLOAT_CATEGORIES[c]["name"] for c in selected_cats)
    console.print(f"\n [yellow]Will disable {total_services} services: {cat_names}[/]")
    try:
        confirm = input(" Proceed? (y/n): ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        return
    if confirm != "y":
        return

    # Disable services
    disabled = []
    restore_cmds = []
    domains = [f"gui/{os.getuid()}", "system"]
    for cat_key in selected_cats:
        cat = DEBLOAT_CATEGORIES[cat_key]
        for svc, desc in cat["services"].items():
            accepted = False
            # Try both user and system domains
            for domain in domains:
                target = f"{domain}/{svc}"
                r = subprocess.run(
                    ["launchctl", "disable", target],
                    capture_output=True, text=True,
                )
                # Also bootout if currently loaded
                subprocess.run(
                    ["launchctl", "bootout", target],
                    capture_output=True, text=True,
                )
                # Always record the undo command; enabling an untouched
                # service is harmless.
                restore_cmds.append(f"launchctl enable {target}")
                if r.returncode == 0:
                    accepted = True
            short = svc.split('.')[-1]
            if accepted:
                disabled.append(svc)
                console.print(f" [green]✓[/] {short} [dim]{desc}[/]")
            else:
                # launchctl rejected the request in every domain.
                console.print(f" [yellow]✗[/] {short} [dim]launchctl refused — not disabled[/]")

    # Also kill currently bloated processes
    for p in bloat_procs:
        for pid in p.get("pids", [p["pid"]]):
            try:
                os.kill(pid, signal.SIGTERM)
            except (ProcessLookupError, PermissionError):
                pass
        console.print(f" [green]✓[/] Killed {p['name']} (was {_fmt_size(p['mb'])})")

    # Save restore script, keeping enable commands from any earlier run so
    # those services remain restorable.
    restore_path = CONFIG_DIR / "restore_debloat.sh"
    CONFIG_DIR.mkdir(parents=True, exist_ok=True)
    previous_cmds = []
    if restore_path.exists():
        try:
            previous_cmds = [
                line.strip()
                for line in restore_path.read_text().splitlines()
                if line.startswith("launchctl enable ")
            ]
        except OSError:
            previous_cmds = []
    # dict.fromkeys de-duplicates while preserving order.
    all_cmds = list(dict.fromkeys(previous_cmds + restore_cmds))
    with open(restore_path, "w") as f:
        f.write("#!/bin/bash\n# localcoder debloat restore script\n")
        f.write(f"# Generated: {time.strftime('%Y-%m-%d %H:%M')}\n\n")
        for cmd in all_cmds:
            f.write(f"{cmd}\n")
        f.write('\necho "All services restored. Reboot recommended."\n')
    os.chmod(restore_path, 0o755)

    console.print(f"\n [green]Disabled {len(disabled)} services.[/]")
    console.print(f" [dim]Restore script: {restore_path}[/]")
    console.print(f" [dim]Run: localcoder --debloat then choose 'r' to restore[/]\n")
|
|
1800
|
+
|
|
1801
|
+
|
|
1802
|
+
def _kill_bloated_processes():
    """Send SIGTERM to every currently bloated system process (one-time).

    Looks at processes using at least 300 MB as reported by
    ``get_top_memory_processes`` and signals those whose category is
    ``"bloat"``. A process only counts towards the freed total when at least
    one of its PIDs accepted the signal.

    Returns:
        None. Results are printed to the console.
    """
    import signal

    def _fmt_size(mb):
        # Sub-GB values are shown in MB; integer GB would render them as "0GB".
        return f"{mb / 1024:.1f}GB" if mb >= 1024 else f"{mb}MB"

    procs = get_top_memory_processes(min_mb=300)
    bloat = [p for p in procs if p["category"] == "bloat"]
    if not bloat:
        console.print(" [dim]No bloated processes found.[/]")
        return

    freed = 0
    for p in bloat:
        signalled = False
        for pid in p.get("pids", [p["pid"]]):
            try:
                os.kill(pid, signal.SIGTERM)
                signalled = True
            except (ProcessLookupError, PermissionError):
                # Already gone, or owned by another user: nothing to do.
                pass
        mb = p["mb"]
        if signalled:
            freed += mb
            console.print(f" [green]✓[/] Killed {p['name']} ({_fmt_size(mb)})")
        else:
            console.print(f" [yellow]✗[/] Could not kill {p['name']} ({_fmt_size(mb)})")

    console.print(f"\n [green]Freed ~{_fmt_size(freed)}[/] [dim](processes may restart smaller)[/]")
|
|
1823
|
+
|
|
1824
|
+
|
|
1825
|
+
def _debloat_restore():
    """Re-enable all debloated services by running the saved restore script.

    Executes ``CONFIG_DIR / "restore_debloat.sh"`` with bash (30 s limit). The
    script is deleted only when bash exits with status 0; on a non-zero exit,
    a timeout, or a failure to launch bash it is kept so the restore can be
    retried.

    Returns:
        None. Results are printed to the console.
    """
    restore_path = CONFIG_DIR / "restore_debloat.sh"
    if not restore_path.exists():
        console.print(" [dim]No restore script found — nothing to restore.[/]")
        return

    console.print(" [yellow]Restoring all disabled services...[/]")
    try:
        r = subprocess.run(
            ["bash", str(restore_path)],
            capture_output=True, text=True, timeout=30,
        )
    except (subprocess.TimeoutExpired, OSError) as exc:
        # Timed out or bash could not be started: keep the script for a retry.
        console.print(f" [red]Some services failed to restore: {str(exc)[:200]}[/]")
        return

    # NOTE(review): bash reports the status of the script's last command, so a
    # zero exit does not by itself prove that every `launchctl enable` worked.
    if r.returncode == 0:
        console.print(" [green]All services restored. Reboot recommended.[/]")
        restore_path.unlink()
    else:
        console.print(f" [red]Some services failed to restore: {r.stderr[:200]}[/]")
|
|
1839
|
+
|
|
1840
|
+
|
|
1841
|
+
# LocalLLaMA community favorites for coding — from Best LLMs 2025 megathread
|
|
1842
|
+
# Updated from r/LocalLLaMA actual user recommendations, not benchmarks
|
|
1843
|
+
# Curated coding models keyed by a short slug, grouped below by VRAM tier.
# Each entry holds:
#   "name" - display name
#   "hf"   - Hugging Face repo id of the GGUF build
#   "vram" - VRAM range as a display string
#   "note" - one-line remark about the model
COMMUNITY_CODING_MODELS = {
    # <=8GB VRAM
    "lfm2-8b-a1b": {"name": "LFM2 8B-A1B", "hf": "liquid/LFM2-8B-A1B-GGUF", "vram": "8GB", "note": "Crazy fast MoE, great general + tool calling"},
    "qwen3-4b": {"name": "Qwen 3 4B", "hf": "unsloth/Qwen3-4B-GGUF", "vram": "4GB", "note": "Best tool calling at 4B size"},
    # 12-24GB VRAM (most LocalLLaMA users)
    "qwen3-coder-30b": {"name": "Qwen 3 Coder 30B-A3B", "hf": "unsloth/Qwen3-Coder-30B-A3B-Instruct-GGUF", "vram": "12-24GB", "note": "Top agentic coder, MoE"},
    "nemotron-30b-a3b": {"name": "Nemotron 30B-A3B", "hf": "unsloth/Nemotron-3-Nano-30B-A3B-GGUF", "vram": "12-24GB", "note": "NVIDIA MoE, fastest generation"},
    "gemma4-26b": {"name": "Gemma 4 26B-A4B", "hf": "unsloth/gemma-4-26B-A4B-it-GGUF", "vram": "12-16GB", "note": "Best tool calling + vision, 49 tok/s"},
    "devstral-24b": {"name": "Devstral Small 24B", "hf": "lmstudio-community/Devstral-Small-2-24B-Instruct-2512-GGUF", "vram": "12-24GB", "note": "Reliable daily driver for coding"},
    "glm-4.6v-flash": {"name": "GLM 4.6V Flash", "hf": "THUDM/glm-4.6v-flash-9b-gguf", "vram": "8-12GB", "note": "Best small model of the year (r/LocalLLaMA)"},
    # 24-48GB VRAM
    "gpt-oss-20b": {"name": "GPT-OSS 20B", "hf": "unsloth/gpt-oss-20b-GGUF", "vram": "24GB", "note": "Best accuracy under 48GB"},
    # NOTE(review): listed under the 24-48GB heading but its "vram" says 12-24GB — confirm which is intended.
    "qwen3.5-35b-a3b": {"name": "Qwen 3.5 35B-A3B", "hf": "unsloth/Qwen3.5-35B-A3B-GGUF", "vram": "12-24GB", "note": "1.5M downloads, MoE coding beast"},
    # 48-96GB VRAM
    "glm-4.5-air": {"name": "GLM 4.5 Air", "hf": "THUDM/glm-4.5-9b-air-gguf", "vram": "48-96GB", "note": "Flat-out amazing for codegen (r/LocalLLaMA)"},
    # 96GB+
    "gpt-oss-120b": {"name": "GPT-OSS 120B", "hf": "unsloth/gpt-oss-120b-GGUF", "vram": "96GB+", "note": "Most recommended for agentic coding"},
    "devstral-123b": {"name": "Devstral 123B", "hf": "mistralai/Devstral-2-123B-GGUF", "vram": "96GB+", "note": "Compact 123B, fits 2x RTX Pro"},
    "minimax-m2": {"name": "MiniMax M2.1", "hf": "unsloth/MiniMax-M2.1-GGUF", "vram": "96GB+", "note": "Frontier performance, fantastic agentic coding"},
}
|
|
1863
|
+
|
|
1864
|
+
|
|
1865
|
+
# In-process cache for _fetch_all_hf_models():
#   "data" - the last computed model list (None until the first fetch completes)
#   "ts"   - time.time() at which "data" was stored
_hf_model_cache = {"data": None, "ts": 0}
|
|
1866
|
+
|
|
1867
|
+
|
|
1868
|
+
def _fetch_all_hf_models():
    """Fetch GGUF models from the top providers in parallel. Cached for 10 minutes.

    Queries the Hugging Face model listing for ``unsloth``, ``bartowski`` and
    ``lmstudio-community`` (20 most-downloaded repos each), keeps repos tagged
    ``gguf`` with at least 1000 downloads, and de-duplicates them by base model
    name, preferring providers in the order above.

    Returns:
        A list of dicts with the keys ``repo_id``, ``label``, ``downloads``,
        ``likes``, ``author``, ``caps`` (subset of ``vision``, ``audio``,
        ``code``, ``MoE``), ``est_smallest_gb`` (float, or None when no
        parameter count is found in the name), ``_rank`` and ``_base``.
        Returns ``[]`` when the parallel fetch as a whole fails. A non-empty
        result is reused for 600 seconds.
    """
    import concurrent.futures
    import re

    # Return cache if fresh
    if _hf_model_cache["data"] and time.time() - _hf_model_cache["ts"] < 600:
        return _hf_model_cache["data"]

    providers = ["unsloth", "bartowski", "lmstudio-community"]
    all_raw = []

    def _fetch_one(author):
        """Fetch one provider's 20 most-downloaded repos; [] on any error."""
        try:
            url = f"https://huggingface.co/api/models?author={author}&sort=downloads&direction=-1&limit=20"
            req = urllib.request.Request(url, headers={"User-Agent": "localcoder/1.0"})
            with urllib.request.urlopen(req, timeout=8) as resp:
                return json.loads(resp.read())
        except Exception:
            return []

    # Parallel fetch — all 3 providers at once (~1 API call time instead of 3)
    try:
        with concurrent.futures.ThreadPoolExecutor(max_workers=3) as pool:
            futures = {pool.submit(_fetch_one, p): p for p in providers}
            for future in concurrent.futures.as_completed(futures, timeout=10):
                author = futures[future]
                try:
                    for m in future.result():
                        m["_author"] = author
                        all_raw.append(m)
                except Exception:
                    # Unexpected payload shape from one provider: skip it.
                    pass
    except Exception:
        return []

    # Parameter count in the model name, e.g. "30B", "0.6B", "1.5b". The
    # look-behind stops "0.6B" from being read as "6B".
    param_re = re.compile(r'(?<![\d.])(\d+(?:\.\d+)?)[bB]')

    # Deduplicate by base model name, prefer unsloth > bartowski > lmstudio
    provider_rank = {"unsloth": 0, "bartowski": 1, "lmstudio-community": 2}
    seen = {}
    for m in all_raw:
        tags = m.get("tags", [])
        if "gguf" not in tags:
            continue
        dl = m.get("downloads") or 0
        if dl < 1000:
            continue

        rid = m.get("id")
        if not rid:
            continue
        name = rid.split("/")[-1].replace("-GGUF", "").replace("-Instruct", "").replace("-it", "")
        base = name.lower()
        author = m.get("_author", "")
        rank = provider_rank.get(author, 9)

        if base in seen and rank >= seen[base]["_rank"]:
            continue  # an equal- or better-ranked provider already supplied this model

        # Detect modalities from tags
        caps = []
        if "image-text-to-text" in tags:
            caps.append("vision")
        if any("audio" in t for t in tags):
            caps.append("audio")
        if any("code" in t.lower() or "coder" in t.lower() for t in tags) or "coder" in name.lower():
            caps.append("code")
        if any("moe" in t.lower() for t in tags) or "A3B" in name or "A4B" in name or "A10B" in name:
            caps.append("MoE")

        # Estimate the smallest quant size from the model name.
        # Rule of thumb used here: Q2 ≈ 0.35 GB per 1B *total* parameters
        # (for MoE models the total count is used, not the active count).
        est_smallest_gb = None
        param_match = param_re.search(name)
        if param_match:
            total_b = float(param_match.group(1))
            est_smallest_gb = round(total_b * 0.35, 1)

        seen[base] = {
            "repo_id": rid,
            "label": name,
            "downloads": dl,
            "likes": m.get("likes", 0),
            "author": author,
            "caps": caps,
            "est_smallest_gb": est_smallest_gb,
            "_rank": rank,
            "_base": base,
        }

    result = list(seen.values())
    _hf_model_cache["data"] = result
    _hf_model_cache["ts"] = time.time()
    return result
|
|
1967
|
+
|
|
1968
|
+
|
|
1969
|
+
def fetch_unsloth_top_models(limit=12):
    """Return the ``limit`` most-downloaded GGUF models from the cached listing."""
    ranked = list(_fetch_all_hf_models())
    # Most-downloaded first; the sort runs on a copy so the cached list is untouched.
    ranked.sort(key=lambda entry: entry["downloads"], reverse=True)
    return ranked[:limit]
|
|
1974
|
+
|
|
1975
|
+
|
|
1976
|
+
def fetch_hf_trending_models(limit=5, sort="downloads"):
    """Return the top ``limit`` GGUF models from the cached listing.

    Models are ordered by likes when ``sort == "likes"``; any other value
    orders them by downloads. Highest values come first.
    """
    def _likes(entry):
        return entry.get("likes", 0)

    def _downloads(entry):
        return entry["downloads"]

    ranking_key = _likes if sort == "likes" else _downloads
    return sorted(_fetch_all_hf_models(), key=ranking_key, reverse=True)[:limit]
|
|
1984
|
+
|
|
1985
|
+
|
|
1986
|
+
# Legacy compat — old code referenced this directly
|
|
1987
|
+
def _fetch_unsloth_top_compat(limit=12):
    """Legacy alias that forwards to ``fetch_unsloth_top_models``."""
    top_models = fetch_unsloth_top_models(limit)
    return top_models
|
|
1989
|
+
|
|
1990
|
+
|
|
1991
|
+
|
|
1992
|
+
|
|
1993
|
+
def fetch_hf_model(query):
    """Fetch GGUF model info from HuggingFace.

    Accepts:
    - Full URL: https://huggingface.co/unsloth/gemma-4-26B-A4B-it-GGUF
    - Repo ID: unsloth/gemma-4-26B-A4B-it-GGUF
    - Ollama library URL: https://ollama.com/library/<model> (used as a search term)
    - Search term: gemma 4 26b gguf

    Returns:
        A dict with ``repo_id``, ``name``, ``tags``, ``downloads`` and
        ``gguf_files``, or None when nothing could be resolved or fetched.
        ``gguf_files`` is sorted by size ascending; each item has
        ``filename``, ``size_bytes``, ``size_gb`` and ``quant``.
    """
    import re as _re

    repo_id = None

    # Parse URL. The ollama test must come before the generic "contains a
    # slash" test, otherwise an ollama URL is mistaken for a repo id.
    if "huggingface.co" in query:
        # https://huggingface.co/org/model or /org/model/...
        m = _re.search(r'huggingface\.co/([^/]+/[^/\s?#]+)', query)
        if m:
            repo_id = m.group(1)
    elif "ollama.com" in query:
        # Ollama URL — extract model name for search
        m = _re.search(r'ollama\.com/library/([^/\s?#]+)', query)
        if m:
            query = m.group(1) + " gguf"
    elif "/" in query and " " not in query:
        # Direct repo ID: unsloth/gemma-4-26B-A4B-it-GGUF
        repo_id = query

    # If no repo_id, search HuggingFace
    if not repo_id:
        # Bias the search towards GGUF repos without doubling the keyword.
        search_query = query if "gguf" in query.lower() else query + " gguf"
        try:
            search_url = f"https://huggingface.co/api/models?search={urllib.parse.quote(search_query)}&sort=downloads&direction=-1&limit=5"
            req = urllib.request.Request(search_url, headers={"User-Agent": "localcoder/1.0"})
            with urllib.request.urlopen(req, timeout=10) as resp:
                results = json.loads(resp.read())
            # Pick first GGUF repo
            for r in results:
                if any("gguf" in t.lower() for t in r.get("tags", [])):
                    repo_id = r["id"]
                    break
            if not repo_id and results:
                repo_id = results[0]["id"]
        except Exception:
            return None

    if not repo_id:
        return None

    # Fetch model metadata + file sizes (with fallback to search)
    data = None
    try:
        api_url = f"https://huggingface.co/api/models/{repo_id}?blobs=true"
        req = urllib.request.Request(api_url, headers={"User-Agent": "localcoder/1.0"})
        with urllib.request.urlopen(req, timeout=10) as resp:
            data = json.loads(resp.read())
    except Exception:
        # Direct lookup failed — try searching with the repo name as query
        try:
            search_term = repo_id.split("/")[-1].replace("-", " ").replace("_", " ")
            # Strip standalone 4-digit tokens (e.g. a "2512" release tag) for better search
            search_term = _re.sub(r'\b\d{4}\b', '', search_term).strip()
            search_url = f"https://huggingface.co/api/models?search={urllib.parse.quote(search_term)}&sort=downloads&direction=-1&limit=3"
            req = urllib.request.Request(search_url, headers={"User-Agent": "localcoder/1.0"})
            with urllib.request.urlopen(req, timeout=10) as resp:
                results = json.loads(resp.read())
            if results:
                repo_id = results[0]["id"]
                api_url = f"https://huggingface.co/api/models/{repo_id}?blobs=true"
                req = urllib.request.Request(api_url, headers={"User-Agent": "localcoder/1.0"})
                with urllib.request.urlopen(req, timeout=10) as resp:
                    data = json.loads(resp.read())
        except Exception:
            pass

    if not data:
        return None

    # Extract GGUF files with sizes
    gguf_files = []
    for s in data.get("siblings", []):
        name = s.get("rfilename", "")
        size = s.get("size") or 0  # a null size would break the comparison below
        if not name.endswith(".gguf") or size < 500_000_000:  # skip tiny files
            continue
        if "mmproj" in name.lower():
            continue  # skip vision projectors
        if "-0000" in name:
            # Shards of split models (…-00001-of-0000N.gguf). The first shard
            # is dropped too, so split models contribute no entry.
            continue

        # Parse quant from filename
        quant = "unknown"
        qm = _re.search(r'(BF16|F16|Q\d+_K(?:_[A-Z]+)?|Q\d+_\d+|IQ\d+_[A-Z]+|MXFP\d+)', name, _re.IGNORECASE)
        if qm:
            quant = qm.group(1).upper()

        gguf_files.append({
            "filename": name,
            "size_bytes": size,
            "size_gb": round(size / (1024**3), 1),
            "quant": quant,
        })

    # Sort by size ascending
    gguf_files.sort(key=lambda x: x["size_bytes"])

    return {
        "repo_id": repo_id,
        "name": data.get("id", repo_id).split("/")[-1],
        "tags": data.get("tags", []),
        "downloads": data.get("downloads", 0),
        "gguf_files": gguf_files,
    }
|
|
2107
|
+
|
|
2108
|
+
|
|
2109
|
+
def simulate_hf_model(query):
    """Fetch a model from HuggingFace and show which quants fit.

    Resolves ``query`` through ``fetch_hf_model``, prints one table row per
    GGUF file with a fit status against the GPU budget, a rough speed estimate
    and a usage bar, recommends the largest quant that fits, then prompts for
    a row number (detailed analysis) or ``d <number>`` (analysis plus download).

    Args:
        query: HuggingFace URL, repo id or search term.

    Returns:
        None. All output is printed to the console.
    """
    import math

    specs = get_machine_specs()
    metal = get_metal_gpu_stats()
    gpu_total = metal.get("total_mb") or specs["gpu_total_mb"]
    gpu_used = metal.get("alloc_mb", 0)

    console.clear()
    # The context manager stops the spinner even if the fetch raises.
    with console.status("[bold cyan] Fetching from HuggingFace...[/]", spinner="dots"):
        model = fetch_hf_model(query)

    if not model:
        console.print(f"\n [red]Model not found: {query}[/]")
        console.print(f" [dim]Try a HuggingFace URL or search term like 'llama 3.1 70b gguf'[/]\n")
        return

    console.clear()
    console.print()
    console.print(f" [bold]{model['repo_id']}[/]")
    console.print(f" [dim]{specs['chip']} · {specs['ram_gb']}GB RAM · GPU budget: {gpu_total // 1024}GB · In use: {gpu_used // 1024}GB[/]\n")

    if not model["gguf_files"]:
        console.print(f" [yellow]No GGUF files found in this repo.[/]\n")
        return

    # Show all quants with fit status
    table = Table(
        title=f"Available Quants ({len(model['gguf_files'])})",
        show_header=True, header_style="bold", border_style="dim", padding=(0, 1),
    )
    table.add_column("#", style="dim", width=3)
    table.add_column("Quant", width=14)
    table.add_column("Size", justify="right", width=8)
    table.add_column("Fits GPU?", width=18)
    table.add_column("Est. Speed", width=12)
    table.add_column("", width=18)

    best_fit_idx = None  # 1-based row number of the recommended quant
    for i, f in enumerate(model["gguf_files"], 1):
        size_gb = f["size_gb"]
        size_mb = int(size_gb * 1024)
        fits = size_mb < gpu_total                    # fits the total budget
        fits_free = size_mb < (gpu_total - gpu_used)  # fits what is free right now

        if fits_free:
            status = "[green]✓ fits[/]"
            if best_fit_idx is None or f["size_gb"] > model["gguf_files"][best_fit_idx - 1]["size_gb"]:
                best_fit_idx = i
        elif fits:
            status = "[yellow]⚠ tight[/]"
            if best_fit_idx is None:
                best_fit_idx = i
        else:
            status = "[red]✗ too big[/]"

        # Speed estimate
        tps = min(120, max(1, int(49 * 12 / max(1, size_gb)))) if fits else max(1, int(5 * 16 / max(1, size_gb)))
        speed = f"~{tps} tok/s" if fits else f"[red]~{tps} tok/s[/]"

        # Visual bar
        bar_pct = min(1.0, size_mb / gpu_total) if gpu_total else 0
        bar_w = int(bar_pct * 16)
        bar_color = "green" if fits_free else "yellow" if fits else "red"
        bar = f"[{bar_color}]{'█' * bar_w}[/{bar_color}][dim]{'░' * (16 - bar_w)}[/]"

        table.add_row(str(i), f["quant"], f"{size_gb}GB", status, speed, bar)

    console.print(table)

    # Recommendation
    is_unsloth = "unsloth" in model["repo_id"].lower()
    if best_fit_idx:
        bf = model["gguf_files"][best_fit_idx - 1]
        console.print(f"\n [green bold]→ Best fit: #{best_fit_idx} {bf['quant']} ({bf['size_gb']}GB)[/]")
        console.print(f" [dim]Highest quality that fits your {gpu_total // 1024}GB GPU[/]")
        if is_unsloth:
            console.print(f" [dim]Unsloth quants use imatrix calibration — better quality than standard GGUF[/]")
    else:
        console.print(f"\n [red]No quant fits your {gpu_total // 1024}GB GPU budget.[/]")
        smallest = model["gguf_files"][0]
        # Report what the smallest quant requires, not the budget it failed to fit.
        needed_gb = math.ceil(smallest["size_gb"])
        console.print(f" [dim]Smallest: {smallest['quant']} at {smallest['size_gb']}GB (need {needed_gb}GB+ GPU)[/]")
        if not is_unsloth:
            console.print(f" [dim]Tip: check unsloth/ on HuggingFace — they often have smaller K_XL quants[/]")

    # Interactive: pick one to simulate in detail or download
    console.print(f"\n [dim]Enter # for detailed analysis, 'd #' to download, or 'q' to quit[/]\n")
    try:
        ans = input(" > ").strip().lower()
    except (EOFError, KeyboardInterrupt):
        return

    if ans == "q" or not ans:
        return

    download = False
    if ans.startswith("d"):
        download = True
        ans = ans.lstrip("d ").strip()

    try:
        idx = int(ans) - 1
    except ValueError:
        return  # not a row number: nothing to do

    if 0 <= idx < len(model["gguf_files"]):
        chosen = model["gguf_files"][idx]
        # Run detailed simulation with real size
        _simulate_with_real_size(chosen, model["repo_id"], specs, gpu_total, gpu_used)

        if download:
            console.print(f"\n [yellow]Downloading {chosen['filename']}...[/]")
            _download_gguf(model["repo_id"], chosen["filename"])
|
|
2226
|
+
|
|
2227
|
+
|
|
2228
|
+
def _simulate_with_real_size(gguf, repo_id, specs, gpu_total, gpu_used):
    """Show detailed fit analysis for a specific GGUF file with real size.

    Prints a memory bar (used / model / free), the combined model + KV-cache
    footprint at 8K, 32K, 64K and 128K context, and a rough speed estimate.

    Args:
        gguf: File entry with at least ``size_gb`` and ``quant``.
        repo_id: Repository id, used only in the heading.
        specs: Machine specs; ``ram_gb`` is read for the sysctl hint.
        gpu_total: GPU memory budget in MB.
        gpu_used: GPU memory currently in use in MB.

    Returns:
        None. All output is printed to the console.
    """
    size_gb = gguf["size_gb"]
    size_mb = int(size_gb * 1024)
    fits = size_mb < gpu_total

    # Heuristic KV-cache cost in MB per 1K tokens of context.
    kv_per_1k = max(2, int(size_gb * 0.4))
    # Same estimates as the quant table in simulate_hf_model, so the detail
    # view agrees with the row the user picked.
    if fits:
        tps = min(120, max(1, int(49 * 12 / max(1, size_gb))))
    else:
        tps = max(1, int(5 * 16 / max(1, size_gb)))

    console.print(f"\n [bold]{repo_id}[/] · [cyan]{gguf['quant']}[/] · [bold]{size_gb}GB[/]")

    # Memory bar
    bw = 50
    ub = int(min(1.0, gpu_used / gpu_total) * bw) if gpu_total else 0
    mb = int(min(1.0, size_mb / gpu_total) * bw) if gpu_total else 0
    mb = min(mb, bw - ub)  # keep used + model within the bar width
    fb = max(0, bw - mb - ub)
    console.print(f"\n [cyan]{'█' * ub}[/][{'green' if fits else 'red'}]{'█' * mb}[/][dim]{'░' * fb}[/] {gpu_total // 1024}GB")
    console.print(f" [cyan]■[/] used:{gpu_used // 1024}G [{'green' if fits else 'red'}]■[/] model:{size_gb}G [dim]░[/] free:{max(0, gpu_total - gpu_used - size_mb) // 1024}G")

    # Context table
    console.print()
    for ctx in [8192, 32768, 65536, 131072]:
        kv = kv_per_1k * (ctx // 1024)
        tot = size_mb + kv
        h = gpu_total - tot  # headroom in MB left in the total budget
        icon = "[green]✓[/]" if h > 2000 else "[yellow]⚠[/]" if h > 0 else "[red]✗[/]"
        kv_s = f"{kv}M" if kv < 1024 else f"{kv / 1024:.1f}G"
        console.print(f" {icon} {ctx // 1024}K ctx → model {size_gb}G + KV {kv_s} = {tot / 1024:.1f}G")

    console.print(f"\n Est. speed: [bold]~{tps} tok/s[/]" + ("" if fits else " [red](CPU swap)[/]"))

    if not fits:
        console.print(f" [yellow]→ Try a smaller quant or: sudo sysctl iogpu.wired_limit_mb={int(specs['ram_gb'] * 1024 * 0.9)}[/]")
|
|
2262
|
+
|
|
2263
|
+
|
|
2264
|
+
def _download_gguf(repo_id, filename):
    """Download a GGUF file from HuggingFace.

    Uses huggingface_hub if available (supports gated models with token).
    Falls back to curl. No token needed for public repos (Unsloth, bartowski, etc).
    Gated models (Meta Llama) need: huggingface-cli login

    Args:
        repo_id: HuggingFace repository id, e.g. ``"org/name"``.
        filename: Path of the file inside the repository.

    Returns:
        The local path of the file as a string, or ``None`` if the download
        failed.
    """
    local_dir = MODELS_DIR / repo_id.replace("/", "--")
    local_dir.mkdir(parents=True, exist_ok=True)
    dest = local_dir / os.path.basename(filename)

    # The curl fallback stores the file flat under its basename, while
    # hf_hub_download(local_dir=...) keeps the repo-relative sub-path, so a
    # previous download may live at either location.
    for candidate in (dest, local_dir / filename):
        if candidate.exists():
            console.print(f" [green]Already downloaded: {candidate}[/]")
            return str(candidate)

    try:
        from huggingface_hub import hf_hub_download
        path = hf_hub_download(repo_id=repo_id, filename=filename, local_dir=str(local_dir))
        console.print(f" [green]✓ Downloaded: {path}[/]")
        return path
    except ImportError:
        pass  # huggingface_hub not installed: fall through to the curl path
    except Exception as e:
        if "401" in str(e) or "403" in str(e) or "gated" in str(e).lower():
            console.print(f" [red]Gated model — run: huggingface-cli login[/]")
        else:
            console.print(f" [red]Download failed: {e}[/]")
        return None

    url = f"https://huggingface.co/{repo_id}/resolve/main/{filename}"
    console.print(f" [dim]Downloading {os.path.basename(filename)}...[/]")
    # Download to a temporary name so an interrupted or failed transfer is
    # never mistaken for a complete model by the "already downloaded" check.
    partial = dest.with_name(dest.name + ".part")
    # --fail makes curl exit non-zero on HTTP errors (401/403/404) instead of
    # saving the server's error page as the model file.
    cmd = ["curl", "-L", "--fail", "-o", str(partial), "--progress-bar", url]
    try:
        r = subprocess.run(cmd)
    except OSError as e:
        # curl itself is missing or not executable.
        console.print(f" [red]Download failed: {e}[/]")
        return None

    if r.returncode == 0 and partial.exists():
        partial.replace(dest)
        console.print(f" [green]✓ Downloaded: {dest}[/]")
        return str(dest)

    try:
        partial.unlink()
    except OSError:
        pass  # nothing was written, or cleanup is not possible; either is fine
    console.print(f" [red]Download failed. If gated model, run: huggingface-cli login[/]")
    return None
|
|
2300
|
+
|
|
2301
|
+
|
|
2302
|
+
def simulate_model_fit(model_query):
    """Predict if a model will fit BEFORE downloading.

    Looks ``model_query`` up in ``MODELS`` using a case- and
    separator-insensitive substring match on the model id or name. If nothing
    matches, falls back to parsing a parameter count and optional quantization
    from the query (e.g. ``"70b q4"``) and estimating the size from
    bits-per-weight. Renders a verdict, a memory bar, a performance table, a
    context-length table and a next-step suggestion to the module-level
    ``console``.

    Args:
        model_query: Free-text model id/name, or a ``"<N>b [q<bits>]"`` spec.

    Returns:
        None. Unknown queries print an error message and return early.
    """
    import re as _re

    specs = get_machine_specs()
    metal = get_metal_gpu_stats()

    gpu_total = metal.get("total_mb") or specs["gpu_total_mb"]
    gpu_used = metal.get("alloc_mb", 0)
    gpu_free = max(0, gpu_total - gpu_used)

    # Find in known models
    model_id = None
    model_info = None
    raw_query = model_query.lower()
    query = raw_query.replace("-", "").replace("_", "").replace(" ", "")
    # An empty query is a substring of every id, so it must not be matched.
    if query:
        for mid, m in MODELS.items():
            mid_clean = mid.lower().replace("-", "").replace("_", "")
            name_clean = m["name"].lower().replace("-", "").replace("_", "").replace(" ", "")
            if query in mid_clean or query in name_clean:
                model_id = mid
                model_info = m
                break

    if not model_info:
        # Parse from the un-collapsed query: stripping separators first fuses
        # neighbouring digits ("llama-3.1-8b" -> "llama3.18b" -> 18B) and can
        # invent a quant ("qwq-32b" -> "qwq32b" -> Q3).
        param_match = _re.search(r'(\d+(?:\.\d+)?)[\s_-]*b', raw_query)
        quant_match = _re.search(r'q(\d)', raw_query)
        if param_match:
            params_b = float(param_match.group(1))
            quant = int(quant_match.group(1)) if quant_match else 4
            # Approximate bits per weight for each quantization level.
            bpw = {2: 2.5, 3: 3.5, 4: 4.5, 5: 5.5, 6: 6.5, 8: 8.5}.get(quant, 4.5)
            size_gb = round(params_b * bpw / 8, 1)
            model_info = {"name": f"{params_b:g}B Q{quant}", "size_gb": size_gb}
        else:
            console.print(f"\n [red]Unknown model: {model_query}[/]")
            console.print(f" [dim]Known: {', '.join(MODELS.keys())} or '70b q4'[/]\n")
            return

    name = model_info["name"]
    size_gb = model_info["size_gb"]
    size_mb = int(size_gb * 1024)
    kv_per_1k = max(2, int(size_gb * 0.4))  # MB per 1K ctx

    fits_gpu = size_mb < gpu_total
    fits_free = size_mb < gpu_free
    fit_color = "green" if fits_gpu else "red"
    # Heuristic tok/s estimate; a lower curve is used once the model spills.
    if fits_gpu:
        base_tps = min(120, max(1, int(49 * 12 / max(1, size_gb))))
    else:
        base_tps = max(1, int(10 * 16 / max(1, size_gb)))

    # Render
    console.clear()
    console.print()

    if fits_free:
        console.print(f" [green bold]✓ {name} WILL FIT[/] · {size_gb}GB model · {gpu_free // 1024}GB free")
    elif fits_gpu:
        console.print(f" [yellow bold]⚠ {name} TIGHT FIT[/] · {size_gb}GB · close apps first")
    else:
        console.print(f" [red bold]✗ {name} WON'T FIT[/] · {size_gb}GB model · {gpu_total // 1024}GB limit")

    console.print(f" [dim]{specs['chip']} · {specs['ram_gb']}GB RAM · GPU budget: {gpu_total // 1024}GB[/]\n")

    # Memory bar
    bw = 60
    if gpu_total:
        ub = int(min(1.0, gpu_used / gpu_total) * bw)
        mb = int(min(1.0, size_mb / gpu_total) * bw)
    else:
        ub = mb = 0
    # Keep the bar at a fixed width: when used + model exceeds the budget the
    # model segment is capped to whatever room is left after the used segment.
    mb = min(mb, bw - ub)
    fb = max(0, bw - mb - ub)
    console.print(f" GPU Memory: [cyan]{'█' * ub}[/][{fit_color}]{'█' * mb}[/][dim]{'░' * fb}[/]")
    console.print(f" [cyan]■[/] used:{gpu_used // 1024}G [{fit_color}]■[/] model:{size_gb}G [dim]░[/] free:{max(0, gpu_total - gpu_used - size_mb) // 1024}G\n")

    # Performance
    perf = Table(show_header=True, header_style="bold", border_style="dim", padding=(0, 1))
    perf.add_column("", width=18)
    perf.add_column("Value", width=16)
    perf.add_column("", width=38)
    perf.add_row("Model", f"{size_gb} GB", "Fits GPU" if fits_gpu else "[red]Exceeds GPU → swap[/]")
    perf.add_row("Compute", "GPU" if fits_gpu else "[red]CPU[/]", "All layers on GPU" if fits_gpu else "[red]5-10x slower[/]")
    perf.add_row("Speed", f"~{base_tps} tok/s", "" if fits_gpu else "[red]swap thrashing[/]")
    perf.add_row("Download", f"~{max(1, int(size_gb * 12))}s", f"at 100MB/s ({size_gb}GB)")
    console.print(perf)
    console.print()

    # Context table
    ct = Table(title="Context Length vs Memory", show_header=True, header_style="bold", border_style="dim", padding=(0, 1))
    ct.add_column("Context", width=8)
    ct.add_column("KV Cache", width=8, justify="right")
    ct.add_column("Total", width=8, justify="right")
    ct.add_column("Verdict", width=25)
    for ctx in [4096, 8192, 32768, 65536, 131072]:
        kv = kv_per_1k * (ctx // 1024)
        tot = size_mb + kv
        h = gpu_total - tot  # headroom in MB left after model + KV cache
        if h > 2000:
            s = "[green]✓ fits[/]"
        elif h > 0:
            s = "[yellow]⚠ tight[/]"
        else:
            s = f"[red]✗ OOM ({-h // 1024}GB over)[/]"
        ct.add_row(f"{ctx // 1024}K", f"{kv}M" if kv < 1024 else f"{kv / 1024:.1f}G", f"{tot / 1024:.1f}G", s)
    console.print(ct)

    console.print()
    if not fits_gpu:
        # Suggest the largest known model that stays under the GPU budget.
        for m in sorted(MODELS.values(), key=lambda entry: entry["size_gb"], reverse=True):
            if m["size_gb"] * 1024 < gpu_total:
                console.print(f" [green]→ Try:[/] {m['name']} ({m['size_gb']}GB) — {m.get('description', '')}")
                break
        console.print(f" [green]→ Or:[/] sudo sysctl iogpu.wired_limit_mb={int(specs['ram_gb'] * 1024 * 0.9)}")
    elif not fits_free:
        console.print(f" [yellow]→[/] localcoder --cleanup [dim](free {gpu_used // 1024}GB)[/]")
    else:
        console.print(f" [green]→[/] localcoder{' -m ' + model_id if model_id else ''} [dim](ready to run)[/]")
    console.print()
|
|
2407
|
+
|
|
2408
|
+
|
|
2409
|
+
def recommend_model(ram_gb):
    """Pick the suggested model for a machine with ``ram_gb`` GB of RAM.

    Returns:
        A ``(model_id, reason)`` tuple of strings for the highest RAM tier
        whose threshold ``ram_gb`` meets; below 8 GB the smallest model is
        returned.
    """
    # Tiers are ordered from the largest RAM threshold down; first hit wins.
    tiers = (
        (48, "gemma4-26b", "26B Q4_K_M (best quality) + vision + 128K context. Plenty of headroom."),
        (36, "qwen35b-a3b", "Qwen 3.5 35B-A3B Q3_K_XL — best coding quality at 36GB+."),
        (24, "gemma4-26b", "Gemma 4 26B Q3_K_XL — 49 tok/s, best overall for 24GB. Also try Qwen 35B Q2."),
        (16, "gemma4-e4b", "E4B is the sweet spot for 16GB. Audio + image + code, 57 tok/s."),
        (8, "qwen35-4b", "Qwen 3.5 4B — ultrafast at 50 tok/s, only 2.7GB GPU."),
    )
    for min_ram_gb, model_id, reason in tiers:
        if ram_gb >= min_ram_gb:
            return model_id, reason
    return "gemma4-e2b", "E2B is the only option under 8GB."
|
|
2423
|
+
|
|
2424
|
+
|
|
2425
|
+
def can_run_simultaneously(ram_gb, model1_gb, model2_gb):
    """Return True when both models together stay under the GPU budget.

    The budget is taken as 67% of ``ram_gb`` (the approximate Metal limit on
    unified memory); the combined size must be strictly below it.
    """
    combined_gb = model1_gb + model2_gb
    metal_budget_gb = ram_gb * 0.67  # Metal limit ~67% of unified memory
    return combined_gb < metal_budget_gb
|
|
2429
|
+
|
|
2430
|
+
|
|
2431
|
+
def stop_conflicting_backends(target_backend):
    """Stop other backends to free GPU memory.

    Best-effort: failures are reported on the console and never raised, but
    ``KeyboardInterrupt``/``SystemExit`` are allowed to propagate.

    Args:
        target_backend: ``"ollama"`` stops a running llama-server via
            ``pkill``; ``"llamacpp"`` asks a running Ollama to unload each
            loaded model. Any other value does nothing.
    """
    if target_backend == "ollama":
        # Kill llama-server if running (frees GPU for Ollama)
        if not check_backend_running("llamacpp"):
            return
        console.print(f" [yellow]Stopping llama-server to free GPU for Ollama...[/]")
        try:
            subprocess.run(["pkill", "-f", "llama-server"], timeout=5)
            time.sleep(2)  # give the process time to exit and release memory
        except Exception as exc:
            # markup=False: exception text may contain characters rich would parse as tags.
            console.print(f" Could not stop llama-server: {exc}", style="dim", markup=False)

    elif target_backend == "llamacpp":
        # Unload Ollama models to free GPU
        if not check_backend_running("ollama"):
            return
        console.print(f" [yellow]Unloading Ollama models to free GPU...[/]")
        try:
            models = get_running_models("ollama")
        except Exception as exc:
            console.print(f" Could not list Ollama models: {exc}", style="dim", markup=False)
            return

        for m in models or ():
            try:
                # keep_alive=0 asks Ollama to drop the model right away.
                request = urllib.request.Request(
                    "http://127.0.0.1:11434/api/generate",
                    data=json.dumps({"model": m, "keep_alive": 0}).encode(),
                    headers={"Content-Type": "application/json"},
                )
                # Context manager closes the HTTP response instead of leaking it.
                with urllib.request.urlopen(request, timeout=5):
                    pass
            except Exception as exc:
                # One failed unload must not stop the remaining models from being unloaded.
                console.print(f" Could not unload {m}: {exc}", style="dim", markup=False)
        time.sleep(2)  # give Ollama time to release memory
|
|
2459
|
+
|
|
2460
|
+
|
|
2461
|
+
def start_ollama_serve():
    """Ensure Ollama is serving.

    If the backend is not already running, launches ``ollama serve`` as a
    detached child with its output discarded, waits two seconds, and checks
    again.

    Returns:
        True if the Ollama backend is running (already, or after the launch),
        False if launching or the follow-up check failed.
    """
    if check_backend_running("ollama"):
        return True
    try:
        subprocess.Popen(["ollama", "serve"], stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
        time.sleep(2)  # give the server a moment to come up before probing
        return check_backend_running("ollama")
    except Exception:
        # Covers a missing or non-executable ollama binary (OSError) and probe
        # failures. Not a bare except: Ctrl-C during the wait must still
        # interrupt the program instead of being reported as "not running".
        return False
|