offgrid-ai 0.9.6 → 0.10.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -2,7 +2,7 @@
2
2
 
3
3
  # offgrid-ai
4
4
 
5
- **Helper CLI for running local AI models on Mac with llama.cpp, ollama, and oMLX.**
5
+ **Helper CLI for running local AI models on Mac with llama-server, mlx-vlm, and oMLX.**
6
6
 
7
7
  [![node](https://img.shields.io/badge/node-20%2B-3c873a)](package.json)
8
8
  [![platform](https://img.shields.io/badge/platform-macOS%20%7C%20Linux-blue)]()
@@ -12,19 +12,19 @@
12
12
 
13
13
  ## What is offgrid-ai?
14
14
 
15
- offgrid-ai is a command-line tool that lets you run AI models locally. Running local models with llama.cpp, ollama, or oMLX have a steep learning curve compared to cloud-based models, so offgrid-ai is designed to abstract away the complexity, while still providing a powerful and flexible way to run local models.
15
+ offgrid-ai is a command-line tool that lets you run AI models locally. Running local models with llama-server, mlx-vlm, or oMLX has a steep learning curve compared to cloud-based models, so offgrid-ai is designed to abstract away the complexity, while still providing a powerful and flexible way to run local models.
16
16
 
17
17
  This is the recommended workflow:
18
18
 
19
- 1. Download models from **LM Studio**, **Ollama**, or **oMLX**
19
+ 1. Download models from **LM Studio** or **oMLX**
20
20
  2. Do minimal configuration using the `offgrid-ai` command
21
21
  3. Run the model with `offgrid-ai` with Pi in interactive mode
22
22
 
23
23
  ## Core Features
24
- - Auto-detects available models from LM Studio, Ollama, and oMLX
24
+ - Auto-detects available models from LM Studio, oMLX, and HuggingFace
25
25
  - Auto-detects MTP (multi-token prediction) or QAT (quantization-aware training) models, and applies the correct flags for llama.cpp
26
- - Auto-applies the optimal flags for the model type in llama.cpp
27
- - Start / stop llama.cpp server automatically for chat sessions
26
+ - Auto-applies the optimal flags for the model type (llama.cpp server flags, mlx-vlm APC/thinking/context flags)
27
+ - Start / stop local servers automatically for chat sessions (llama-server and mlx-vlm)
28
28
 
29
29
  ## Quick start
30
30
 
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "offgrid-ai",
3
- "version": "0.9.6",
3
+ "version": "0.10.1",
4
4
  "description": "Privacy-first CLI for running local LLMs — discover, configure, run, benchmark",
5
5
  "author": "Eeshan Srivastava (https://eeshans.com)",
6
6
  "type": "module",
@@ -12,6 +12,8 @@
12
12
  "src/*.mjs",
13
13
  "src/commands/*.mjs",
14
14
  "src/benchmark/*.mjs",
15
+ "resources/*.py",
16
+ "resources/recommendations.json",
15
17
  "install.sh"
16
18
  ],
17
19
  "publishConfig": {
@@ -32,7 +34,7 @@
32
34
  "start": "node bin/offgrid-ai.mjs",
33
35
  "test": "node --test test/*.mjs",
34
36
  "test:integration": "OFFGRID_INTEGRATION=1 node --test test/integration/*.mjs",
35
- "lint": "eslint src/*.mjs src/commands/*.mjs src/benchmark/*.mjs bin/*.mjs",
37
+ "lint": "eslint src/*.mjs src/commands/*.mjs src/benchmark/*.mjs scripts/*.mjs bin/*.mjs",
36
38
  "check:privacy": "node scripts/privacy-gate.mjs",
37
39
  "release:check": "bash scripts/release-check.sh",
38
40
  "release:check:fast": "bash scripts/release-check.sh --skip-install --skip-manual",
@@ -46,7 +48,6 @@
46
48
  "keywords": [
47
49
  "local-llm",
48
50
  "llama-cpp",
49
- "ollama",
50
51
  "cli",
51
52
  "privacy",
52
53
  "llm",
@@ -0,0 +1,79 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ Download a HuggingFace model into the standard HF cache.
4
+
5
+ Usage:
6
+ python3 hf-download.py --repo mlx-community/gemma-4-e2b-it-4bit
7
+ python3 hf-download.py --repo unsloth/gemma-4-E2B-it-GGUF --file gemma-4-E2B-it-Q4_K_S.gguf
8
+
9
+ Streams NDJSON progress events to stdout.
10
+ """
11
+ import argparse
12
+ import json
13
+ import os
14
+ import sys
15
+
16
+
17
+ def emit(event):
18
+ print(json.dumps(event), flush=True)
19
+
20
+
21
+ def progress_callback(relative_path, downloaded, total):
22
+ emit({
23
+ "type": "progress",
24
+ "file": relative_path,
25
+ "downloadedBytes": downloaded,
26
+ "totalBytes": total,
27
+ })
28
+
29
+
30
+ def main():
31
+ parser = argparse.ArgumentParser(description="Download a HuggingFace model into the standard cache.")
32
+ parser.add_argument("--repo", required=True, help="HuggingFace repo ID (e.g. mlx-community/gemma-4-e2b-it-4bit)")
33
+ parser.add_argument("--file", help="Specific filename to download (for GGUF). Omit to download the full repo (MLX).")
34
+ parser.add_argument("--cache-dir", help="HF hub cache directory (where models--org--name/... live). Defaults to $HF_HUB_CACHE or $HF_HOME/hub or ~/.cache/huggingface/hub.")
35
+ args = parser.parse_args()
36
+
37
+ try:
38
+ from huggingface_hub import hf_hub_download, snapshot_download
39
+ except ImportError as e:
40
+ emit({"type": "error", "message": f"huggingface_hub is not installed: {e}"})
41
+ sys.exit(1)
42
+
43
+ cache_dir = args.cache_dir or os.environ.get("HF_HUB_CACHE") or os.path.join(
44
+ os.environ.get("HF_HOME") or os.path.join(os.path.expanduser("~"), ".cache", "huggingface"),
45
+ "hub",
46
+ )
47
+
48
+ try:
49
+ if args.file:
50
+ local_path = hf_hub_download(
51
+ repo_id=args.repo,
52
+ filename=args.file,
53
+ cache_dir=cache_dir,
54
+ resume_download=True,
55
+ )
56
+ emit({
57
+ "type": "complete",
58
+ "localDir": os.path.dirname(local_path),
59
+ "localPath": local_path,
60
+ "format": "gguf",
61
+ })
62
+ else:
63
+ local_dir = snapshot_download(
64
+ repo_id=args.repo,
65
+ cache_dir=cache_dir,
66
+ resume_download=True,
67
+ )
68
+ emit({
69
+ "type": "complete",
70
+ "localDir": local_dir,
71
+ "format": "mlx",
72
+ })
73
+ except Exception as e:
74
+ emit({"type": "error", "message": str(e)})
75
+ sys.exit(1)
76
+
77
+
78
+ if __name__ == "__main__":
79
+ main()
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/env python3
2
+ """
3
+ mlx-vlm server wrapper with strict=False model loading + APC merge fix.
4
+
5
+ Two monkey-patches are applied before the server starts:
6
+
7
+ 1. strict=False model loading — needed for architectures with shared-KV weight
8
+ schemes (e.g. Gemma 4). Most models (Qwen, Llama, Mistral, Phi) load fine
9
+ with strict=True — strict=False is a no-op for them.
10
+
11
+ 2. BatchRotatingKVCache.merge() shape-mismatch fix — upstream mlx-lm bug
12
+ (ml-explore/mlx-lm PR #1116, Blaizzy/mlx-vlm Issue #923). The merge() method
13
+ crashes with `ValueError: [broadcast_shapes] Shapes (1,1,28,256) and
14
+ (1,1,512,256) cannot be broadcast` when APC merges exact-cache entries with
15
+ different fill levels. This affects all sliding-window attention models
16
+ (Gemma 4, Mistral, Mixtral). The fix uses explicit slicing instead of
17
+ negative indexing to guarantee exactly `l` elements are extracted.
18
+
19
+ This patch can be removed once mlx-lm fixes merge() upstream (not fixed in
20
+ 0.31.2 or 0.31.3 — the merge() method is identical in both).
21
+
22
+ Benchmark finding: mlx-vlm clears Metal cache after every request (GitHub Issue
23
+ #999) unless APC_ENABLED=1 is set. The env var is set by the Electron app at
24
+ spawn time, not in this wrapper.
25
+
26
+ Usage:
27
+ python3 mlxvlm-server-wrapper.py --model <path> --host 127.0.0.1 --port <port>
28
+ """
29
+ import sys
30
+
31
+ # ── Patch 1: strict=False model loading ──────────────────────────────────────
32
+
33
+ import mlx_vlm.utils as _utils
34
+ _orig_load_model = _utils.load_model
35
+
36
+ def _patched_load_model(model_path, lazy=False, strict=True, **kwargs):
37
+ return _orig_load_model(model_path, lazy=lazy, strict=False, **kwargs)
38
+
39
+ _utils.load_model = _patched_load_model
40
+
41
+ # ── Patch 2: BatchRotatingKVCache.merge() shape-mismatch fix ──────────────────
42
+ #
43
+ # Upstream bug: _temporal_order() can return a buffer whose seq dimension differs
44
+ # from c.size(). The negative slice [..., -l:, :] then produces a mismatched shape,
45
+ # crashing with ValueError: [broadcast_shapes].
46
+ #
47
+ # Fix: use explicit slicing to extract exactly `l` elements, right-aligning within
48
+ # the target slice when the buffer is shorter than `l` (left-padded by zeros from
49
+ # the pre-allocated target tensor).
50
+
51
+ import mlx.core as mx
52
+ from mlx_lm.models import cache as _lm_cache
53
+
54
+ _orig_merge = _lm_cache.BatchRotatingKVCache.merge
55
+
56
+ @classmethod
57
+ def _patched_merge(cls, caches):
58
+ if not all(c.max_size == caches[0].max_size for c in caches):
59
+ raise ValueError(
60
+ "BatchRotatingKVCache can only merge caches with the same maximum size"
61
+ )
62
+
63
+ offsets = [c.offset for c in caches]
64
+ lengths = [c.size() for c in caches]
65
+ max_length = max(lengths)
66
+
67
+ if max_length == 0:
68
+ return cls(caches[0].max_size, [0] * len(caches))
69
+
70
+ padding = [max_length - l for l in lengths]
71
+ B = len(caches)
72
+ H = max(c.keys.shape[1] for c in caches if c.keys is not None)
73
+ Dk = max(c.keys.shape[3] for c in caches if c.keys is not None)
74
+ Dv = max(c.values.shape[3] for c in caches if c.values is not None)
75
+ dt = next(iter(c.keys.dtype for c in caches if c.keys is not None))
76
+
77
+ keys = mx.zeros((B, H, max_length, Dk), dtype=dt)
78
+ values = mx.zeros((B, H, max_length, Dv), dtype=dt)
79
+ for i, (p, l, c) in enumerate(zip(padding, lengths, caches)):
80
+ if c.keys is None:
81
+ continue
82
+ ordered_k = c._temporal_order(c.keys)
83
+ ordered_v = c._temporal_order(c.values)
84
+ seq_len = ordered_k.shape[2]
85
+ if seq_len >= l:
86
+ # Normal case: extract the last `l` tokens.
87
+ start = seq_len - l
88
+ keys[i : i + 1, :, p : p + l] = ordered_k[..., start : start + l, :]
89
+ values[i : i + 1, :, p : p + l] = ordered_v[..., start : start + l, :]
90
+ else:
91
+ # Buffer shorter than l: right-align within the slice (left-padded
92
+ # by zeros from the pre-allocated target tensor).
93
+ gap = l - seq_len
94
+ keys[i : i + 1, :, p + gap : p + l] = ordered_k
95
+ values[i : i + 1, :, p + gap : p + l] = ordered_v
96
+
97
+ cache = cls(caches[0].max_size, padding)
98
+ cache.keys = keys
99
+ cache.values = values
100
+ cache.offset = mx.array(offsets)
101
+ cache._idx = keys.shape[2]
102
+ cache._offset = keys.shape[2]
103
+
104
+ return cache
105
+
106
+ _lm_cache.BatchRotatingKVCache.merge = _patched_merge
107
+
108
+ # ── Run the server ────────────────────────────────────────────────────────────
109
+ # main() parses sys.argv for --model, --host, --port, etc.
110
+ from mlx_vlm.server import main
111
+ main()
112
+
@@ -0,0 +1,60 @@
1
+ {
2
+ "models": [
3
+ {
4
+ "id": "gemma-4-e2b",
5
+ "label": "Gemma 4 E2B",
6
+ "minRamGb": 8,
7
+ "gguf": "unsloth/gemma-4-E2B-it-GGUF/gemma-4-E2B-it-Q4_K_S.gguf",
8
+ "mlx": "mlx-community/gemma-4-e2b-it-4bit"
9
+ },
10
+ {
11
+ "id": "qwen-3.5-9b",
12
+ "label": "Qwen 3.5 9B",
13
+ "minRamGb": 16,
14
+ "gguf": "unsloth/Qwen3.5-9B-GGUF/Qwen3.5-9B-UD-Q4_K_S.gguf",
15
+ "mlx": "lmstudio-community/Qwen3.5-9B-MLX-4bit"
16
+ },
17
+ {
18
+ "id": "gemma-4-12b-qat",
19
+ "label": "Gemma 4 12B",
20
+ "minRamGb": 24,
21
+ "gguf": "unsloth/gemma-4-12B-it-qat-GGUF/gemma-4-12B-it-qat-UD-Q4_K_XL.gguf",
22
+ "mlx": "mlx-community/gemma-4-12B-it-qat-4bit"
23
+ },
24
+ {
25
+ "id": "gemma-4-26b",
26
+ "label": "Gemma 4 26B",
27
+ "minRamGb": 32,
28
+ "gguf": "unsloth/gemma-4-26B-A4B-it-qat-GGUF/gemma-4-26B-A4B-it-qat-UD-Q4_K_XL.gguf",
29
+ "mlx": "mlx-community/gemma-4-26b-a4b-4bit"
30
+ },
31
+ {
32
+ "id": "qwen-3.6-35b-compact",
33
+ "label": "Qwen 3.6 35B",
34
+ "minRamGb": 32,
35
+ "gguf": "unsloth/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-UD-Q4_K_S.gguf",
36
+ "mlx": "mlx-community/Qwen3.6-35B-A3B-4bit"
37
+ },
38
+ {
39
+ "id": "qwen-3.6-35b",
40
+ "label": "Qwen 3.6 35B",
41
+ "minRamGb": 48,
42
+ "gguf": "unsloth/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf",
43
+ "mlx": ""
44
+ },
45
+ {
46
+ "id": "gemma-4-31b",
47
+ "label": "Gemma 4 31B",
48
+ "minRamGb": 64,
49
+ "gguf": "unsloth/gemma-4-31B-it-qat-GGUF/gemma-4-31B-it-qat-UD-Q4_K_XL.gguf",
50
+ "mlx": "mlx-community/gemma-4-31b-4bit"
51
+ },
52
+ {
53
+ "id": "qwen-3.6-27b",
54
+ "label": "Qwen 3.6 27B",
55
+ "minRamGb": 64,
56
+ "gguf": "unsloth/Qwen3.6-27B-MTP-GGUF/Qwen3.6-27B-Q4_K_M.gguf",
57
+ "mlx": "mlx-community/Qwen3.6-27B-4bit"
58
+ }
59
+ ]
60
+ }
@@ -15,21 +15,6 @@ export const BACKEND_INSTALLERS = {
15
15
  failure: "Download it manually from https://lmstudio.ai",
16
16
  allFailure: "✗ LM Studio installation failed. Download from https://lmstudio.ai",
17
17
  },
18
- ollama: {
19
- label: "Ollama",
20
- choiceLabel: "Ollama",
21
- hint: "brew install ollama — models download on demand",
22
- commands: [["brew", ["install", "ollama"], "Ollama"]],
23
- success(model) {
24
- console.log(pc.green("✓ Ollama installed"));
25
- console.log(pc.yellow("\nStart Ollama and pull a model:"));
26
- console.log(pc.bold(` ollama serve & ollama pull ${model.ollama}`));
27
- console.log(pc.dim(`Recommended for your machine: ${model.label}`));
28
- console.log(pc.dim("Then run offgrid-ai again to pick and run a model."));
29
- },
30
- failure: "Install it manually from https://ollama.com",
31
- allFailure: "✗ Ollama installation failed. Install manually from https://ollama.com",
32
- },
33
18
  omlx: {
34
19
  label: "oMLX",
35
20
  choiceLabel: "oMLX",
@@ -52,6 +37,6 @@ export const BACKEND_INSTALLERS = {
52
37
 
53
38
  export const BACKEND_INSTALL_CHOICES = [
54
39
  ...Object.entries(BACKEND_INSTALLERS).map(([value, installer]) => ({ value, label: installer.choiceLabel, hint: installer.hint })),
55
- { value: "all", label: "Install all three", hint: "LM Studio + Ollama + oMLX" },
40
+ { value: "all", label: "Install both", hint: "LM Studio + oMLX" },
56
41
  { value: "skip", label: "Skip for now", hint: "I'll set up models myself" },
57
42
  ];
package/src/backends.mjs CHANGED
@@ -1,13 +1,14 @@
1
1
  import { findLlamaServer } from "./config.mjs";
2
2
  import { scanGgufModels } from "./scan.mjs";
3
3
  import { parseModelName } from "./model-name.mjs";
4
+ import { scanMlxModels } from "./mlx-discovery.mjs";
5
+ import { DEFAULT_PORT as MLX_VLM_PORT } from "./mlx-flags.mjs";
4
6
 
5
7
  // ── Backend definitions ────────────────────────────────────────────────────
6
8
 
7
9
  export const LOCAL_HOST = "127.0.0.1";
8
10
  export const LLAMA_CPP_PORT = 8080;
9
11
  export const LLAMA_CPP_MTP_PORT = 8081;
10
- export const OLLAMA_PORT = 11434;
11
12
  export const OMLX_PORT = 8000;
12
13
 
13
14
  export function baseUrlFor({ host = LOCAL_HOST, port, path = "/v1" }) {
@@ -41,18 +42,6 @@ export const BACKENDS = {
41
42
  needsCommandFile: true,
42
43
  scanModels: async () => (await scanGgufModels()).models,
43
44
  },
44
- "ollama": {
45
- id: "ollama",
46
- label: "Ollama",
47
- type: "managed-server",
48
- providerId: "ollama",
49
- defaultHost: "localhost",
50
- defaultPort: OLLAMA_PORT,
51
- defaultBaseUrl: baseUrlFor({ host: "localhost", port: OLLAMA_PORT }),
52
- apiBaseUrl: baseUrlFor({ host: "localhost", port: OLLAMA_PORT, path: "" }),
53
- needsCommandFile: false,
54
- scanModels: () => scanOllamaModels(),
55
- },
56
45
  "omlx": {
57
46
  id: "omlx",
58
47
  label: "oMLX",
@@ -65,6 +54,17 @@ export const BACKENDS = {
65
54
  needsCommandFile: false,
66
55
  scanModels: () => scanOmlxModels(),
67
56
  },
57
+ "mlx-vlm": {
58
+ id: "mlx-vlm",
59
+ label: "mlx-vlm",
60
+ type: "local-server",
61
+ providerId: "mlx-vlm",
62
+ defaultHost: LOCAL_HOST,
63
+ defaultPort: MLX_VLM_PORT,
64
+ defaultBaseUrl: baseUrlFor({ port: MLX_VLM_PORT }),
65
+ needsCommandFile: true,
66
+ scanModels: async () => scanMlxModels(),
67
+ },
68
68
  };
69
69
 
70
70
  export function backendFor(backendId) {
@@ -75,6 +75,7 @@ export function backendFor(backendId) {
75
75
 
76
76
  export async function backendBinaryFor(backendId) {
77
77
  const backend = BACKENDS[backendId ?? "llama-cpp"];
78
+ if (backend.id === "mlx-vlm") return "python3"; // mlx-vlm spawns via python3 + the strict=False wrapper
78
79
  if (backend.type === "managed-server") return null;
79
80
  const discovered = await findLlamaServer();
80
81
  return discovered; // null means "not found — trigger onboarding"
@@ -85,29 +86,6 @@ export function defaultFlagsForBackend(backendId) {
85
86
  return { host: backend.defaultHost ?? LOCAL_HOST, port: backend.defaultPort };
86
87
  }
87
88
 
88
- // ── Ollama model discovery ──────────────────────────────────────────────
89
-
90
- async function scanOllamaModels() {
91
- const response = await fetch(`${BACKENDS.ollama.apiBaseUrl}/api/tags`, { signal: AbortSignal.timeout(3000) });
92
- if (!response.ok) {
93
- throw new Error(`Ollama /api/tags returned ${response.status} ${response.statusText}`);
94
- }
95
- const body = await response.json();
96
- if (!Array.isArray(body?.models)) return [];
97
- return body.models
98
- .filter((model) => isLocalOllamaModel(model))
99
- .map((model) => ({
100
- id: model.name,
101
- label: parseModelName(model.name, "ollama").display,
102
- aliasSuggestion: model.name,
103
- sizeBytes: model.size ?? 0,
104
- quant: model.details?.quantization_level,
105
- family: model.details?.family,
106
- backend: "ollama",
107
- source: "ollama",
108
- })).sort((a, b) => a.label.localeCompare(b.label));
109
- }
110
-
111
89
  // ── oMLX model discovery ───────────────────────────────────────────────
112
90
 
113
91
  async function scanOmlxModels() {
@@ -123,7 +101,8 @@ async function scanOmlxModels() {
123
101
  id: model.id,
124
102
  label: parseModelName(model.id, "omlx").display,
125
103
  aliasSuggestion: model.id,
126
- sizeBytes: 0,
104
+ sizeBytes: model.size ?? 0,
105
+ contextLength: model.max_model_len ?? null,
127
106
  quant: null,
128
107
  family: null,
129
108
  backend: "omlx",
@@ -133,13 +112,6 @@ async function scanOmlxModels() {
133
112
 
134
113
  // ── Labels ──────────────────────────────────────────────────────────────
135
114
 
136
- function isLocalOllamaModel(model) {
137
- const name = String(model?.name ?? "");
138
- if (/:cloud(?:$|\b)/i.test(name)) return false;
139
- if (!Number.isFinite(model?.size) || model.size <= 0) return false;
140
- return true;
141
- }
142
-
143
115
  function isChatOmlxModel(model) {
144
116
  if (typeof model?.id !== "string" || !model.id.trim()) return false;
145
117
  const type = String(model.type ?? model.model_type ?? "").toLowerCase();
@@ -148,4 +120,5 @@ function isChatOmlxModel(model) {
148
120
  return true;
149
121
  }
150
122
 
151
- // (ollamaLabel and omlxLabel removed — parseModelName in model-name.mjs is the single path)
123
+ // (ollamaLabel and omlxLabel removed — parseModelName in model-name.mjs is the single path)
124
+ // (Ollama backend removed — offgrid-ai now uses llama-server + mlx-vlm + oMLX)
@@ -1,99 +1,12 @@
1
- // ── Unload model from server memory after benchmark ────────────────────────────
1
+ // ── Benchmark finalization (metadata + summary rendering) ───────────────────
2
+ // unloadModelFromServer has been moved to src/process.mjs (it's the managed-server
3
+ // counterpart to stopProfile, used by both the benchmark flow and the Pi chat flow).
2
4
 
3
- import { backendFor } from "../backends.mjs";
4
- import { apiRootUrl, serverModelIds } from "../process.mjs";
5
5
  import { existsSync } from "node:fs";
6
6
  import { readFile, writeFile } from "node:fs/promises";
7
7
  import { join } from "node:path";
8
8
  import { pc, renderRows, renderSection } from "../ui.mjs";
9
9
 
10
- export async function unloadModelFromServer(profile) {
11
- const backend = backendFor(profile.backend);
12
-
13
- if (backend.id === "ollama") {
14
- const apiBaseUrl = apiRootUrl(profile.baseUrl || backend.apiBaseUrl || "");
15
-
16
- try {
17
- await fetch(`${apiBaseUrl}/api/generate`, {
18
- method: "POST",
19
- headers: { "Content-Type": "application/json" },
20
- body: JSON.stringify({ model: profile.modelAlias, prompt: "", stream: false, keep_alive: 0 }),
21
- signal: AbortSignal.timeout(10000),
22
- });
23
- return { unloaded: true, backend: backend.id };
24
- } catch (err) {
25
- return { unloaded: false, backend: backend.id, error: err.message };
26
- }
27
- }
28
-
29
- if (backend.id === "llama-cpp" || backend.id === "llama-cpp-mtp") {
30
- // llama.cpp unloads when the server process exits; no HTTP unload API exists.
31
- // If offgrid-ai started the server, stopProfile already handled it.
32
- return { unloaded: false, backend: backend.id, reason: "stop server to unload" };
33
- }
34
-
35
- if (backend.id === "omlx") {
36
- return await unloadOmlxModel(profile);
37
- }
38
-
39
- return { unloaded: false, backend: backend.id, reason: "unsupported backend" };
40
- }
41
-
42
- async function unloadOmlxModel(profile) {
43
- const baseUrl = profile.baseUrl?.replace(/\/v1\/?$/u, "") || "";
44
- const adminUrl = `${baseUrl}/admin/api/models`;
45
- const modelId = profile.modelAlias || profile.omlxModel || profile.id;
46
-
47
- try {
48
- const ids = await serverModelIds(profile.baseUrl);
49
- const match = ids.find((id) => id.toLowerCase() === modelId.toLowerCase());
50
- const targetId = match ?? modelId;
51
-
52
- const response = await fetch(`${adminUrl}/${encodeURIComponent(targetId)}/unload`, {
53
- method: "POST",
54
- headers: { "Content-Type": "application/json" },
55
- signal: AbortSignal.timeout(30000),
56
- });
57
-
58
- if (response.ok) {
59
- return { unloaded: true, backend: "omlx", modelId: targetId };
60
- }
61
-
62
- const detail = await responseErrorDetail(response);
63
-
64
- if (response.status === 400 && /not loaded/i.test(detail)) {
65
- return { unloaded: true, backend: "omlx", modelId: targetId, reason: "model was not loaded" };
66
- }
67
-
68
- if (response.status === 401 || response.status === 403) {
69
- return {
70
- unloaded: false,
71
- backend: "omlx",
72
- modelId: targetId,
73
- error: "oMLX admin authentication required. Enable skip_api_key_verification in oMLX settings, or unload manually from the admin panel.",
74
- };
75
- }
76
-
77
- return { unloaded: false, backend: "omlx", modelId: targetId, error: `HTTP ${response.status}: ${detail}` };
78
- } catch (err) {
79
- if (err?.name === "AbortError" || err?.name === "TimeoutError") {
80
- return { unloaded: false, backend: "omlx", modelId, error: "Unload request timed out. The model may still be unloading in the background." };
81
- }
82
- return { unloaded: false, backend: "omlx", modelId, error: err.message };
83
- }
84
- }
85
-
86
- async function responseErrorDetail(response) {
87
- const text = await response.text().catch(() => "");
88
- if (!text) return "";
89
- try {
90
- const body = JSON.parse(text);
91
- return body?.detail ?? body?.message ?? text;
92
- } catch {
93
- return text;
94
- }
95
- }
96
-
97
10
  export async function finalizeBenchmarkRun(runDirectory, runResult, speedMetrics, speedMetricsError = null) {
98
11
  const metadataPath = join(runDirectory, "metadata.json");
99
12
  const metadata = JSON.parse(await readFile(metadataPath, "utf8"));
@@ -4,7 +4,7 @@ import { join } from "node:path";
4
4
  import { ensureDirs } from "../config.mjs";
5
5
  import { backendFor } from "../backends.mjs";
6
6
  import { hasPi, hasPiModel, syncPiConfig } from "../harness-pi.mjs";
7
- import { serverReady, startServer, waitForReady, stopProfile, modelAvailableOnServer } from "../process.mjs";
7
+ import { serverReady, startServer, waitForReady, stopProfile, modelAvailableOnServer, unloadModelFromServer } from "../process.mjs";
8
8
  import { loadProfiles } from "../profiles.mjs";
9
9
  import { pc, createPrompt } from "../ui.mjs";
10
10
  import { linkBenchmarkRepo } from "./repo.mjs";
@@ -12,12 +12,11 @@ import { loadBenchmarks } from "./shared.mjs";
12
12
  import { prepareBenchmarkRun } from "./prepare.mjs";
13
13
  import { runBenchmarkInPi } from "./pi-runner.mjs";
14
14
  import { queryServerMetrics } from "./metrics.mjs";
15
- import { unloadModelFromServer } from "./finalize.mjs";
16
15
  import { finalizeBenchmarkRun, renderBenchmarkSummary } from "./finalize.mjs";
17
16
 
18
17
  function benchmarkModelSource(profile) {
19
18
  if (!profile) return "cloud";
20
- return profile.providerId === "llama-cpp-mtp" ? "llama-cpp-mtp" : profile.backend === "ollama" ? "ollama" : profile.backend === "omlx" ? "omlx" : "llama-cpp";
19
+ return profile.providerId === "llama-cpp-mtp" ? "llama-cpp-mtp" : profile.backend === "omlx" ? "omlx" : "llama-cpp";
21
20
  }
22
21
 
23
22
  async function chooseBenchmarkAction(prompt, canRun) {
@@ -29,7 +28,7 @@ async function chooseBenchmarkAction(prompt, canRun) {
29
28
  }
30
29
 
31
30
  function managedModelId(profile) {
32
- return profile.omlxModel ?? profile.ollamaModel ?? profile.modelAlias ?? profile.label;
31
+ return profile.omlxModel ?? profile.modelAlias ?? profile.label;
33
32
  }
34
33
 
35
34
  async function ensureManagedModelAvailableForBenchmark(profile, backend) {
@@ -1,7 +1,6 @@
1
1
  // ── Backend-aware server speed metrics ───────────────────────────────────────
2
2
 
3
3
  import { backendFor } from "../backends.mjs";
4
- import { apiRootUrl } from "../process.mjs";
5
4
 
6
5
  const BENCH_SPEED_PROMPT = "Write a one-sentence summary of machine learning.";
7
6
  const SPEED_QUERY_TIMEOUT_MS = 120_000;
@@ -16,9 +15,6 @@ export async function queryServerMetrics(profile) {
16
15
  if (backend.id === "omlx") {
17
16
  return await queryOmlxMetrics(profile);
18
17
  }
19
- if (backend.id === "ollama") {
20
- return await queryOllamaMetrics(profile);
21
- }
22
18
 
23
19
  throw new Error(`Unsupported backend for benchmark speed metrics: ${backend.id}`);
24
20
  }
@@ -115,43 +111,3 @@ async function queryOmlxMetrics(profile) {
115
111
  metricSource: "oMLX /v1/chat/completions streaming include_usage",
116
112
  };
117
113
  }
118
-
119
- async function queryOllamaMetrics(profile) {
120
- const body = {
121
- model: profile.modelAlias,
122
- prompt: BENCH_SPEED_PROMPT,
123
- stream: false,
124
- options: { num_predict: SPEED_QUERY_MAX_TOKENS },
125
- };
126
-
127
- const apiBaseUrl = apiRootUrl(profile.baseUrl || backendFor(profile.backend).apiBaseUrl || "");
128
-
129
- const response = await fetch(`${apiBaseUrl}/api/generate`, {
130
- method: "POST",
131
- headers: { "Content-Type": "application/json" },
132
- body: JSON.stringify(body),
133
- signal: AbortSignal.timeout(SPEED_QUERY_TIMEOUT_MS),
134
- });
135
-
136
- if (!response.ok) {
137
- throw new Error(`Ollama speed query failed: ${response.status} ${response.statusText}`);
138
- }
139
-
140
- const data = await response.json();
141
- const promptEvalNs = data.prompt_eval_duration ?? 0;
142
- const evalNs = data.eval_duration ?? 0;
143
- const loadNs = data.load_duration ?? 0;
144
-
145
- const promptEvalCount = data.prompt_eval_count ?? 0;
146
- const evalCount = data.eval_count ?? 0;
147
-
148
- return {
149
- prefillTokensPerSecond: promptEvalNs > 0 ? (promptEvalCount / (promptEvalNs / 1e9)) : null,
150
- generationTokensPerSecond: evalNs > 0 ? (evalCount / (evalNs / 1e9)) : null,
151
- ttftMs: promptEvalNs / 1e6,
152
- modelLoadMs: loadNs / 1e6,
153
- speculativeDecodeAcceptance: null,
154
- kvCacheTokens: null,
155
- metricSource: "Ollama /api/generate",
156
- };
157
- }
@@ -55,7 +55,7 @@ export async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId,
55
55
  kind,
56
56
  runId,
57
57
  benchmark: { id: benchmark.id, title: benchmark.title, description: benchmark.description, prompt: benchmark.prompt },
58
- model: { id: modelId, slug: modelSlug, displayName: parseModelName(modelId, modelSource === "ollama" ? "ollama" : modelSource === "omlx" ? "omlx" : "local-gguf").display },
58
+ model: { id: modelId, slug: modelSlug, displayName: parseModelName(modelId, modelSource === "omlx" ? "omlx" : "local-gguf").display },
59
59
  status: "prepared",
60
60
  createdAt: now.toISOString(),
61
61
  updatedAt: now.toISOString(),