offgrid-ai 0.9.6 → 0.10.1
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +6 -6
- package/package.json +4 -3
- package/resources/hf-download.py +79 -0
- package/resources/mlxvlm-server-wrapper.py +112 -0
- package/resources/recommendations.json +60 -0
- package/src/backend-installers.mjs +1 -16
- package/src/backends.mjs +18 -45
- package/src/benchmark/finalize.mjs +3 -90
- package/src/benchmark/flow.mjs +3 -4
- package/src/benchmark/metrics.mjs +0 -44
- package/src/benchmark/prepare.mjs +1 -1
- package/src/benchmark.mjs +3 -1
- package/src/commands/main.mjs +7 -7
- package/src/commands/models.mjs +21 -18
- package/src/commands/onboard.mjs +67 -9
- package/src/commands/run.mjs +20 -5
- package/src/commands/status.mjs +1 -1
- package/src/config.mjs +11 -2
- package/src/discovery-shared.mjs +44 -0
- package/src/hardware.mjs +49 -0
- package/src/harness-pi.mjs +25 -11
- package/src/huggingface.mjs +209 -0
- package/src/managed.mjs +1 -5
- package/src/mlx-discovery.mjs +294 -0
- package/src/mlx-flags.mjs +93 -0
- package/src/model-catalog.mjs +78 -11
- package/src/model-name.mjs +7 -25
- package/src/model-presenters.mjs +114 -38
- package/src/process.mjs +129 -32
- package/src/profile-setup.mjs +105 -0
- package/src/profiles.mjs +30 -0
- package/src/recommendations.mjs +56 -14
- package/src/scan.mjs +43 -8
package/README.md
CHANGED
|
@@ -2,7 +2,7 @@
|
|
|
2
2
|
|
|
3
3
|
# offgrid-ai
|
|
4
4
|
|
|
5
|
-
**Helper CLI for running local AI models on Mac with llama
|
|
5
|
+
**Helper CLI for running local AI models on Mac with llama-server, mlx-vlm, and oMLX.**
|
|
6
6
|
|
|
7
7
|
[](package.json)
|
|
8
8
|
[]()
|
|
@@ -12,19 +12,19 @@
|
|
|
12
12
|
|
|
13
13
|
## What is offgrid-ai?
|
|
14
14
|
|
|
15
|
-
offgrid-ai is a command-line tool that lets you run AI models locally. Running local models with llama
|
|
15
|
+
offgrid-ai is a command-line tool that lets you run AI models locally. Running local models with llama-server, mlx-vlm, or oMLX has a steep learning curve compared to cloud-based models, so offgrid-ai is designed to abstract away the complexity, while still providing a powerful and flexible way to run local models.
|
|
16
16
|
|
|
17
17
|
This is the recommended workflow:
|
|
18
18
|
|
|
19
|
-
1. Download models from **LM Studio
|
|
19
|
+
1. Download models from **LM Studio** or **oMLX**
|
|
20
20
|
2. Do minimal configuration using the `offgrid-ai` command
|
|
21
21
|
3. Run the model with `offgrid-ai`, using Pi in interactive mode
|
|
22
22
|
|
|
23
23
|
## Core Features
|
|
24
|
-
- Auto-detects available models from LM Studio,
|
|
24
|
+
- Auto-detects available models from LM Studio, oMLX, and HuggingFace
|
|
25
25
|
- Auto-detects MTP (multi-token prediction) or QAT (quantization-aware training) models, and applies the correct flags for llama.cpp
|
|
26
|
-
- Auto-applies the optimal flags for the model type
|
|
27
|
-
- Start / stop
|
|
26
|
+
- Auto-applies the optimal flags for the model type (llama.cpp server flags, mlx-vlm APC/thinking/context flags)
|
|
27
|
+
- Start / stop local servers automatically for chat sessions (llama-server and mlx-vlm)
|
|
28
28
|
|
|
29
29
|
## Quick start
|
|
30
30
|
|
package/package.json
CHANGED
|
@@ -1,6 +1,6 @@
|
|
|
1
1
|
{
|
|
2
2
|
"name": "offgrid-ai",
|
|
3
|
-
"version": "0.
|
|
3
|
+
"version": "0.10.1",
|
|
4
4
|
"description": "Privacy-first CLI for running local LLMs — discover, configure, run, benchmark",
|
|
5
5
|
"author": "Eeshan Srivastava (https://eeshans.com)",
|
|
6
6
|
"type": "module",
|
|
@@ -12,6 +12,8 @@
|
|
|
12
12
|
"src/*.mjs",
|
|
13
13
|
"src/commands/*.mjs",
|
|
14
14
|
"src/benchmark/*.mjs",
|
|
15
|
+
"resources/*.py",
|
|
16
|
+
"resources/recommendations.json",
|
|
15
17
|
"install.sh"
|
|
16
18
|
],
|
|
17
19
|
"publishConfig": {
|
|
@@ -32,7 +34,7 @@
|
|
|
32
34
|
"start": "node bin/offgrid-ai.mjs",
|
|
33
35
|
"test": "node --test test/*.mjs",
|
|
34
36
|
"test:integration": "OFFGRID_INTEGRATION=1 node --test test/integration/*.mjs",
|
|
35
|
-
"lint": "eslint src/*.mjs src/commands/*.mjs src/benchmark/*.mjs bin/*.mjs",
|
|
37
|
+
"lint": "eslint src/*.mjs src/commands/*.mjs src/benchmark/*.mjs scripts/*.mjs bin/*.mjs",
|
|
36
38
|
"check:privacy": "node scripts/privacy-gate.mjs",
|
|
37
39
|
"release:check": "bash scripts/release-check.sh",
|
|
38
40
|
"release:check:fast": "bash scripts/release-check.sh --skip-install --skip-manual",
|
|
@@ -46,7 +48,6 @@
|
|
|
46
48
|
"keywords": [
|
|
47
49
|
"local-llm",
|
|
48
50
|
"llama-cpp",
|
|
49
|
-
"ollama",
|
|
50
51
|
"cli",
|
|
51
52
|
"privacy",
|
|
52
53
|
"llm",
|
|
@@ -0,0 +1,79 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
Download a HuggingFace model into the standard HF cache.
|
|
4
|
+
|
|
5
|
+
Usage:
|
|
6
|
+
python3 hf-download.py --repo mlx-community/gemma-4-e2b-it-4bit
|
|
7
|
+
python3 hf-download.py --repo unsloth/gemma-4-E2B-it-GGUF --file gemma-4-E2B-it-Q4_K_S.gguf
|
|
8
|
+
|
|
9
|
+
Streams NDJSON progress events to stdout.
|
|
10
|
+
"""
|
|
11
|
+
import argparse
|
|
12
|
+
import json
|
|
13
|
+
import os
|
|
14
|
+
import sys
|
|
15
|
+
|
|
16
|
+
|
|
17
|
+
def emit(event):
|
|
18
|
+
print(json.dumps(event), flush=True)
|
|
19
|
+
|
|
20
|
+
|
|
21
|
+
def progress_callback(relative_path, downloaded, total):
|
|
22
|
+
emit({
|
|
23
|
+
"type": "progress",
|
|
24
|
+
"file": relative_path,
|
|
25
|
+
"downloadedBytes": downloaded,
|
|
26
|
+
"totalBytes": total,
|
|
27
|
+
})
|
|
28
|
+
|
|
29
|
+
|
|
30
|
+
def main():
|
|
31
|
+
parser = argparse.ArgumentParser(description="Download a HuggingFace model into the standard cache.")
|
|
32
|
+
parser.add_argument("--repo", required=True, help="HuggingFace repo ID (e.g. mlx-community/gemma-4-e2b-it-4bit)")
|
|
33
|
+
parser.add_argument("--file", help="Specific filename to download (for GGUF). Omit to download the full repo (MLX).")
|
|
34
|
+
parser.add_argument("--cache-dir", help="HF hub cache directory (where models--org--name/... live). Defaults to $HF_HUB_CACHE or $HF_HOME/hub or ~/.cache/huggingface/hub.")
|
|
35
|
+
args = parser.parse_args()
|
|
36
|
+
|
|
37
|
+
try:
|
|
38
|
+
from huggingface_hub import hf_hub_download, snapshot_download
|
|
39
|
+
except ImportError as e:
|
|
40
|
+
emit({"type": "error", "message": f"huggingface_hub is not installed: {e}"})
|
|
41
|
+
sys.exit(1)
|
|
42
|
+
|
|
43
|
+
cache_dir = args.cache_dir or os.environ.get("HF_HUB_CACHE") or os.path.join(
|
|
44
|
+
os.environ.get("HF_HOME") or os.path.join(os.path.expanduser("~"), ".cache", "huggingface"),
|
|
45
|
+
"hub",
|
|
46
|
+
)
|
|
47
|
+
|
|
48
|
+
try:
|
|
49
|
+
if args.file:
|
|
50
|
+
local_path = hf_hub_download(
|
|
51
|
+
repo_id=args.repo,
|
|
52
|
+
filename=args.file,
|
|
53
|
+
cache_dir=cache_dir,
|
|
54
|
+
resume_download=True,
|
|
55
|
+
)
|
|
56
|
+
emit({
|
|
57
|
+
"type": "complete",
|
|
58
|
+
"localDir": os.path.dirname(local_path),
|
|
59
|
+
"localPath": local_path,
|
|
60
|
+
"format": "gguf",
|
|
61
|
+
})
|
|
62
|
+
else:
|
|
63
|
+
local_dir = snapshot_download(
|
|
64
|
+
repo_id=args.repo,
|
|
65
|
+
cache_dir=cache_dir,
|
|
66
|
+
resume_download=True,
|
|
67
|
+
)
|
|
68
|
+
emit({
|
|
69
|
+
"type": "complete",
|
|
70
|
+
"localDir": local_dir,
|
|
71
|
+
"format": "mlx",
|
|
72
|
+
})
|
|
73
|
+
except Exception as e:
|
|
74
|
+
emit({"type": "error", "message": str(e)})
|
|
75
|
+
sys.exit(1)
|
|
76
|
+
|
|
77
|
+
|
|
78
|
+
if __name__ == "__main__":
|
|
79
|
+
main()
|
|
@@ -0,0 +1,112 @@
|
|
|
1
|
+
#!/usr/bin/env python3
|
|
2
|
+
"""
|
|
3
|
+
mlx-vlm server wrapper with strict=False model loading + APC merge fix.
|
|
4
|
+
|
|
5
|
+
Two monkey-patches are applied before the server starts:
|
|
6
|
+
|
|
7
|
+
1. strict=False model loading — needed for architectures with shared-KV weight
|
|
8
|
+
schemes (e.g. Gemma 4). Most models (Qwen, Llama, Mistral, Phi) load fine
|
|
9
|
+
with strict=True — strict=False is a no-op for them.
|
|
10
|
+
|
|
11
|
+
2. BatchRotatingKVCache.merge() shape-mismatch fix — upstream mlx-lm bug
|
|
12
|
+
(ml-explore/mlx-lm PR #1116, Blaizzy/mlx-vlm Issue #923). The merge() method
|
|
13
|
+
crashes with `ValueError: [broadcast_shapes] Shapes (1,1,28,256) and
|
|
14
|
+
(1,1,512,256) cannot be broadcast` when APC merges exact-cache entries with
|
|
15
|
+
different fill levels. This affects all sliding-window attention models
|
|
16
|
+
(Gemma 4, Mistral, Mixtral). The fix uses explicit slicing instead of
|
|
17
|
+
negative indexing to guarantee exactly `l` elements are extracted.
|
|
18
|
+
|
|
19
|
+
This patch can be removed once mlx-lm fixes merge() upstream (not fixed in
|
|
20
|
+
0.31.2 or 0.31.3 — the merge() method is identical in both).
|
|
21
|
+
|
|
22
|
+
Benchmark finding: mlx-vlm clears Metal cache after every request (GitHub Issue
|
|
23
|
+
#999) unless APC_ENABLED=1 is set. The env var is set by the Electron app at
|
|
24
|
+
spawn time, not in this wrapper.
|
|
25
|
+
|
|
26
|
+
Usage:
|
|
27
|
+
python3 mlxvlm-server-wrapper.py --model <path> --host 127.0.0.1 --port <port>
|
|
28
|
+
"""
|
|
29
|
+
import sys
|
|
30
|
+
|
|
31
|
+
# ── Patch 1: strict=False model loading ──────────────────────────────────────
|
|
32
|
+
|
|
33
|
+
import mlx_vlm.utils as _utils
|
|
34
|
+
_orig_load_model = _utils.load_model
|
|
35
|
+
|
|
36
|
+
def _patched_load_model(model_path, lazy=False, strict=True, **kwargs):
|
|
37
|
+
return _orig_load_model(model_path, lazy=lazy, strict=False, **kwargs)
|
|
38
|
+
|
|
39
|
+
_utils.load_model = _patched_load_model
|
|
40
|
+
|
|
41
|
+
# ── Patch 2: BatchRotatingKVCache.merge() shape-mismatch fix ──────────────────
|
|
42
|
+
#
|
|
43
|
+
# Upstream bug: _temporal_order() can return a buffer whose seq dimension differs
|
|
44
|
+
# from c.size(). The negative slice [..., -l:, :] then produces a mismatched shape,
|
|
45
|
+
# crashing with ValueError: [broadcast_shapes].
|
|
46
|
+
#
|
|
47
|
+
# Fix: use explicit slicing to extract exactly `l` elements, right-aligning within
|
|
48
|
+
# the target slice when the buffer is shorter than `l` (left-padded by zeros from
|
|
49
|
+
# the pre-allocated target tensor).
|
|
50
|
+
|
|
51
|
+
import mlx.core as mx
|
|
52
|
+
from mlx_lm.models import cache as _lm_cache
|
|
53
|
+
|
|
54
|
+
_orig_merge = _lm_cache.BatchRotatingKVCache.merge
|
|
55
|
+
|
|
56
|
+
@classmethod
|
|
57
|
+
def _patched_merge(cls, caches):
|
|
58
|
+
if not all(c.max_size == caches[0].max_size for c in caches):
|
|
59
|
+
raise ValueError(
|
|
60
|
+
"BatchRotatingKVCache can only merge caches with the same maximum size"
|
|
61
|
+
)
|
|
62
|
+
|
|
63
|
+
offsets = [c.offset for c in caches]
|
|
64
|
+
lengths = [c.size() for c in caches]
|
|
65
|
+
max_length = max(lengths)
|
|
66
|
+
|
|
67
|
+
if max_length == 0:
|
|
68
|
+
return cls(caches[0].max_size, [0] * len(caches))
|
|
69
|
+
|
|
70
|
+
padding = [max_length - l for l in lengths]
|
|
71
|
+
B = len(caches)
|
|
72
|
+
H = max(c.keys.shape[1] for c in caches if c.keys is not None)
|
|
73
|
+
Dk = max(c.keys.shape[3] for c in caches if c.keys is not None)
|
|
74
|
+
Dv = max(c.values.shape[3] for c in caches if c.values is not None)
|
|
75
|
+
dt = next(iter(c.keys.dtype for c in caches if c.keys is not None))
|
|
76
|
+
|
|
77
|
+
keys = mx.zeros((B, H, max_length, Dk), dtype=dt)
|
|
78
|
+
values = mx.zeros((B, H, max_length, Dv), dtype=dt)
|
|
79
|
+
for i, (p, l, c) in enumerate(zip(padding, lengths, caches)):
|
|
80
|
+
if c.keys is None:
|
|
81
|
+
continue
|
|
82
|
+
ordered_k = c._temporal_order(c.keys)
|
|
83
|
+
ordered_v = c._temporal_order(c.values)
|
|
84
|
+
seq_len = ordered_k.shape[2]
|
|
85
|
+
if seq_len >= l:
|
|
86
|
+
# Normal case: extract the last `l` tokens.
|
|
87
|
+
start = seq_len - l
|
|
88
|
+
keys[i : i + 1, :, p : p + l] = ordered_k[..., start : start + l, :]
|
|
89
|
+
values[i : i + 1, :, p : p + l] = ordered_v[..., start : start + l, :]
|
|
90
|
+
else:
|
|
91
|
+
# Buffer shorter than l: right-align within the slice (left-padded
|
|
92
|
+
# by zeros from the pre-allocated target tensor).
|
|
93
|
+
gap = l - seq_len
|
|
94
|
+
keys[i : i + 1, :, p + gap : p + l] = ordered_k
|
|
95
|
+
values[i : i + 1, :, p + gap : p + l] = ordered_v
|
|
96
|
+
|
|
97
|
+
cache = cls(caches[0].max_size, padding)
|
|
98
|
+
cache.keys = keys
|
|
99
|
+
cache.values = values
|
|
100
|
+
cache.offset = mx.array(offsets)
|
|
101
|
+
cache._idx = keys.shape[2]
|
|
102
|
+
cache._offset = keys.shape[2]
|
|
103
|
+
|
|
104
|
+
return cache
|
|
105
|
+
|
|
106
|
+
_lm_cache.BatchRotatingKVCache.merge = _patched_merge
|
|
107
|
+
|
|
108
|
+
# ── Run the server ────────────────────────────────────────────────────────────
|
|
109
|
+
# main() parses sys.argv for --model, --host, --port, etc.
|
|
110
|
+
from mlx_vlm.server import main
|
|
111
|
+
main()
|
|
112
|
+
|
|
@@ -0,0 +1,60 @@
|
|
|
1
|
+
{
|
|
2
|
+
"models": [
|
|
3
|
+
{
|
|
4
|
+
"id": "gemma-4-e2b",
|
|
5
|
+
"label": "Gemma 4 E2B",
|
|
6
|
+
"minRamGb": 8,
|
|
7
|
+
"gguf": "unsloth/gemma-4-E2B-it-GGUF/gemma-4-E2B-it-Q4_K_S.gguf",
|
|
8
|
+
"mlx": "mlx-community/gemma-4-e2b-it-4bit"
|
|
9
|
+
},
|
|
10
|
+
{
|
|
11
|
+
"id": "qwen-3.5-9b",
|
|
12
|
+
"label": "Qwen 3.5 9B",
|
|
13
|
+
"minRamGb": 16,
|
|
14
|
+
"gguf": "unsloth/Qwen3.5-9B-GGUF/Qwen3.5-9B-UD-Q4_K_S.gguf",
|
|
15
|
+
"mlx": "lmstudio-community/Qwen3.5-9B-MLX-4bit"
|
|
16
|
+
},
|
|
17
|
+
{
|
|
18
|
+
"id": "gemma-4-12b-qat",
|
|
19
|
+
"label": "Gemma 4 12B",
|
|
20
|
+
"minRamGb": 24,
|
|
21
|
+
"gguf": "unsloth/gemma-4-12B-it-qat-GGUF/gemma-4-12B-it-qat-UD-Q4_K_XL.gguf",
|
|
22
|
+
"mlx": "mlx-community/gemma-4-12B-it-qat-4bit"
|
|
23
|
+
},
|
|
24
|
+
{
|
|
25
|
+
"id": "gemma-4-26b",
|
|
26
|
+
"label": "Gemma 4 26B",
|
|
27
|
+
"minRamGb": 32,
|
|
28
|
+
"gguf": "unsloth/gemma-4-26B-A4B-it-qat-GGUF/gemma-4-26B-A4B-it-qat-UD-Q4_K_XL.gguf",
|
|
29
|
+
"mlx": "mlx-community/gemma-4-26b-a4b-4bit"
|
|
30
|
+
},
|
|
31
|
+
{
|
|
32
|
+
"id": "qwen-3.6-35b-compact",
|
|
33
|
+
"label": "Qwen 3.6 35B",
|
|
34
|
+
"minRamGb": 32,
|
|
35
|
+
"gguf": "unsloth/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-UD-Q4_K_S.gguf",
|
|
36
|
+
"mlx": "mlx-community/Qwen3.6-35B-A3B-4bit"
|
|
37
|
+
},
|
|
38
|
+
{
|
|
39
|
+
"id": "qwen-3.6-35b",
|
|
40
|
+
"label": "Qwen 3.6 35B",
|
|
41
|
+
"minRamGb": 48,
|
|
42
|
+
"gguf": "unsloth/Qwen3.6-35B-A3B-GGUF/Qwen3.6-35B-A3B-UD-Q4_K_M.gguf",
|
|
43
|
+
"mlx": ""
|
|
44
|
+
},
|
|
45
|
+
{
|
|
46
|
+
"id": "gemma-4-31b",
|
|
47
|
+
"label": "Gemma 4 31B",
|
|
48
|
+
"minRamGb": 64,
|
|
49
|
+
"gguf": "unsloth/gemma-4-31B-it-qat-GGUF/gemma-4-31B-it-qat-UD-Q4_K_XL.gguf",
|
|
50
|
+
"mlx": "mlx-community/gemma-4-31b-4bit"
|
|
51
|
+
},
|
|
52
|
+
{
|
|
53
|
+
"id": "qwen-3.6-27b",
|
|
54
|
+
"label": "Qwen 3.6 27B",
|
|
55
|
+
"minRamGb": 64,
|
|
56
|
+
"gguf": "unsloth/Qwen3.6-27B-MTP-GGUF/Qwen3.6-27B-Q4_K_M.gguf",
|
|
57
|
+
"mlx": "mlx-community/Qwen3.6-27B-4bit"
|
|
58
|
+
}
|
|
59
|
+
]
|
|
60
|
+
}
|
|
@@ -15,21 +15,6 @@ export const BACKEND_INSTALLERS = {
|
|
|
15
15
|
failure: "Download it manually from https://lmstudio.ai",
|
|
16
16
|
allFailure: "✗ LM Studio installation failed. Download from https://lmstudio.ai",
|
|
17
17
|
},
|
|
18
|
-
ollama: {
|
|
19
|
-
label: "Ollama",
|
|
20
|
-
choiceLabel: "Ollama",
|
|
21
|
-
hint: "brew install ollama — models download on demand",
|
|
22
|
-
commands: [["brew", ["install", "ollama"], "Ollama"]],
|
|
23
|
-
success(model) {
|
|
24
|
-
console.log(pc.green("✓ Ollama installed"));
|
|
25
|
-
console.log(pc.yellow("\nStart Ollama and pull a model:"));
|
|
26
|
-
console.log(pc.bold(` ollama serve & ollama pull ${model.ollama}`));
|
|
27
|
-
console.log(pc.dim(`Recommended for your machine: ${model.label}`));
|
|
28
|
-
console.log(pc.dim("Then run offgrid-ai again to pick and run a model."));
|
|
29
|
-
},
|
|
30
|
-
failure: "Install it manually from https://ollama.com",
|
|
31
|
-
allFailure: "✗ Ollama installation failed. Install manually from https://ollama.com",
|
|
32
|
-
},
|
|
33
18
|
omlx: {
|
|
34
19
|
label: "oMLX",
|
|
35
20
|
choiceLabel: "oMLX",
|
|
@@ -52,6 +37,6 @@ export const BACKEND_INSTALLERS = {
|
|
|
52
37
|
|
|
53
38
|
export const BACKEND_INSTALL_CHOICES = [
|
|
54
39
|
...Object.entries(BACKEND_INSTALLERS).map(([value, installer]) => ({ value, label: installer.choiceLabel, hint: installer.hint })),
|
|
55
|
-
{ value: "all", label: "Install
|
|
40
|
+
{ value: "all", label: "Install both", hint: "LM Studio + oMLX" },
|
|
56
41
|
{ value: "skip", label: "Skip for now", hint: "I'll set up models myself" },
|
|
57
42
|
];
|
package/src/backends.mjs
CHANGED
|
@@ -1,13 +1,14 @@
|
|
|
1
1
|
import { findLlamaServer } from "./config.mjs";
|
|
2
2
|
import { scanGgufModels } from "./scan.mjs";
|
|
3
3
|
import { parseModelName } from "./model-name.mjs";
|
|
4
|
+
import { scanMlxModels } from "./mlx-discovery.mjs";
|
|
5
|
+
import { DEFAULT_PORT as MLX_VLM_PORT } from "./mlx-flags.mjs";
|
|
4
6
|
|
|
5
7
|
// ── Backend definitions ────────────────────────────────────────────────────
|
|
6
8
|
|
|
7
9
|
export const LOCAL_HOST = "127.0.0.1";
|
|
8
10
|
export const LLAMA_CPP_PORT = 8080;
|
|
9
11
|
export const LLAMA_CPP_MTP_PORT = 8081;
|
|
10
|
-
export const OLLAMA_PORT = 11434;
|
|
11
12
|
export const OMLX_PORT = 8000;
|
|
12
13
|
|
|
13
14
|
export function baseUrlFor({ host = LOCAL_HOST, port, path = "/v1" }) {
|
|
@@ -41,18 +42,6 @@ export const BACKENDS = {
|
|
|
41
42
|
needsCommandFile: true,
|
|
42
43
|
scanModels: async () => (await scanGgufModels()).models,
|
|
43
44
|
},
|
|
44
|
-
"ollama": {
|
|
45
|
-
id: "ollama",
|
|
46
|
-
label: "Ollama",
|
|
47
|
-
type: "managed-server",
|
|
48
|
-
providerId: "ollama",
|
|
49
|
-
defaultHost: "localhost",
|
|
50
|
-
defaultPort: OLLAMA_PORT,
|
|
51
|
-
defaultBaseUrl: baseUrlFor({ host: "localhost", port: OLLAMA_PORT }),
|
|
52
|
-
apiBaseUrl: baseUrlFor({ host: "localhost", port: OLLAMA_PORT, path: "" }),
|
|
53
|
-
needsCommandFile: false,
|
|
54
|
-
scanModels: () => scanOllamaModels(),
|
|
55
|
-
},
|
|
56
45
|
"omlx": {
|
|
57
46
|
id: "omlx",
|
|
58
47
|
label: "oMLX",
|
|
@@ -65,6 +54,17 @@ export const BACKENDS = {
|
|
|
65
54
|
needsCommandFile: false,
|
|
66
55
|
scanModels: () => scanOmlxModels(),
|
|
67
56
|
},
|
|
57
|
+
"mlx-vlm": {
|
|
58
|
+
id: "mlx-vlm",
|
|
59
|
+
label: "mlx-vlm",
|
|
60
|
+
type: "local-server",
|
|
61
|
+
providerId: "mlx-vlm",
|
|
62
|
+
defaultHost: LOCAL_HOST,
|
|
63
|
+
defaultPort: MLX_VLM_PORT,
|
|
64
|
+
defaultBaseUrl: baseUrlFor({ port: MLX_VLM_PORT }),
|
|
65
|
+
needsCommandFile: true,
|
|
66
|
+
scanModels: async () => scanMlxModels(),
|
|
67
|
+
},
|
|
68
68
|
};
|
|
69
69
|
|
|
70
70
|
export function backendFor(backendId) {
|
|
@@ -75,6 +75,7 @@ export function backendFor(backendId) {
|
|
|
75
75
|
|
|
76
76
|
export async function backendBinaryFor(backendId) {
|
|
77
77
|
const backend = BACKENDS[backendId ?? "llama-cpp"];
|
|
78
|
+
if (backend.id === "mlx-vlm") return "python3"; // mlx-vlm spawns via python3 + the strict=False wrapper
|
|
78
79
|
if (backend.type === "managed-server") return null;
|
|
79
80
|
const discovered = await findLlamaServer();
|
|
80
81
|
return discovered; // null means "not found — trigger onboarding"
|
|
@@ -85,29 +86,6 @@ export function defaultFlagsForBackend(backendId) {
|
|
|
85
86
|
return { host: backend.defaultHost ?? LOCAL_HOST, port: backend.defaultPort };
|
|
86
87
|
}
|
|
87
88
|
|
|
88
|
-
// ── Ollama model discovery ──────────────────────────────────────────────
|
|
89
|
-
|
|
90
|
-
async function scanOllamaModels() {
|
|
91
|
-
const response = await fetch(`${BACKENDS.ollama.apiBaseUrl}/api/tags`, { signal: AbortSignal.timeout(3000) });
|
|
92
|
-
if (!response.ok) {
|
|
93
|
-
throw new Error(`Ollama /api/tags returned ${response.status} ${response.statusText}`);
|
|
94
|
-
}
|
|
95
|
-
const body = await response.json();
|
|
96
|
-
if (!Array.isArray(body?.models)) return [];
|
|
97
|
-
return body.models
|
|
98
|
-
.filter((model) => isLocalOllamaModel(model))
|
|
99
|
-
.map((model) => ({
|
|
100
|
-
id: model.name,
|
|
101
|
-
label: parseModelName(model.name, "ollama").display,
|
|
102
|
-
aliasSuggestion: model.name,
|
|
103
|
-
sizeBytes: model.size ?? 0,
|
|
104
|
-
quant: model.details?.quantization_level,
|
|
105
|
-
family: model.details?.family,
|
|
106
|
-
backend: "ollama",
|
|
107
|
-
source: "ollama",
|
|
108
|
-
})).sort((a, b) => a.label.localeCompare(b.label));
|
|
109
|
-
}
|
|
110
|
-
|
|
111
89
|
// ── oMLX model discovery ───────────────────────────────────────────────
|
|
112
90
|
|
|
113
91
|
async function scanOmlxModels() {
|
|
@@ -123,7 +101,8 @@ async function scanOmlxModels() {
|
|
|
123
101
|
id: model.id,
|
|
124
102
|
label: parseModelName(model.id, "omlx").display,
|
|
125
103
|
aliasSuggestion: model.id,
|
|
126
|
-
sizeBytes: 0,
|
|
104
|
+
sizeBytes: model.size ?? 0,
|
|
105
|
+
contextLength: model.max_model_len ?? null,
|
|
127
106
|
quant: null,
|
|
128
107
|
family: null,
|
|
129
108
|
backend: "omlx",
|
|
@@ -133,13 +112,6 @@ async function scanOmlxModels() {
|
|
|
133
112
|
|
|
134
113
|
// ── Labels ──────────────────────────────────────────────────────────────
|
|
135
114
|
|
|
136
|
-
function isLocalOllamaModel(model) {
|
|
137
|
-
const name = String(model?.name ?? "");
|
|
138
|
-
if (/:cloud(?:$|\b)/i.test(name)) return false;
|
|
139
|
-
if (!Number.isFinite(model?.size) || model.size <= 0) return false;
|
|
140
|
-
return true;
|
|
141
|
-
}
|
|
142
|
-
|
|
143
115
|
function isChatOmlxModel(model) {
|
|
144
116
|
if (typeof model?.id !== "string" || !model.id.trim()) return false;
|
|
145
117
|
const type = String(model.type ?? model.model_type ?? "").toLowerCase();
|
|
@@ -148,4 +120,5 @@ function isChatOmlxModel(model) {
|
|
|
148
120
|
return true;
|
|
149
121
|
}
|
|
150
122
|
|
|
151
|
-
// (ollamaLabel and omlxLabel removed — parseModelName in model-name.mjs is the single path)
|
|
123
|
+
// (ollamaLabel and omlxLabel removed — parseModelName in model-name.mjs is the single path)
|
|
124
|
+
// (Ollama backend removed — offgrid-ai now uses llama-server + mlx-vlm + oMLX)
|
|
@@ -1,99 +1,12 @@
|
|
|
1
|
-
// ──
|
|
1
|
+
// ── Benchmark finalization (metadata + summary rendering) ───────────────────
|
|
2
|
+
// unloadModelFromServer has been moved to src/process.mjs (it's the managed-server
|
|
3
|
+
// counterpart to stopProfile, used by both the benchmark flow and the Pi chat flow).
|
|
2
4
|
|
|
3
|
-
import { backendFor } from "../backends.mjs";
|
|
4
|
-
import { apiRootUrl, serverModelIds } from "../process.mjs";
|
|
5
5
|
import { existsSync } from "node:fs";
|
|
6
6
|
import { readFile, writeFile } from "node:fs/promises";
|
|
7
7
|
import { join } from "node:path";
|
|
8
8
|
import { pc, renderRows, renderSection } from "../ui.mjs";
|
|
9
9
|
|
|
10
|
-
export async function unloadModelFromServer(profile) {
|
|
11
|
-
const backend = backendFor(profile.backend);
|
|
12
|
-
|
|
13
|
-
if (backend.id === "ollama") {
|
|
14
|
-
const apiBaseUrl = apiRootUrl(profile.baseUrl || backend.apiBaseUrl || "");
|
|
15
|
-
|
|
16
|
-
try {
|
|
17
|
-
await fetch(`${apiBaseUrl}/api/generate`, {
|
|
18
|
-
method: "POST",
|
|
19
|
-
headers: { "Content-Type": "application/json" },
|
|
20
|
-
body: JSON.stringify({ model: profile.modelAlias, prompt: "", stream: false, keep_alive: 0 }),
|
|
21
|
-
signal: AbortSignal.timeout(10000),
|
|
22
|
-
});
|
|
23
|
-
return { unloaded: true, backend: backend.id };
|
|
24
|
-
} catch (err) {
|
|
25
|
-
return { unloaded: false, backend: backend.id, error: err.message };
|
|
26
|
-
}
|
|
27
|
-
}
|
|
28
|
-
|
|
29
|
-
if (backend.id === "llama-cpp" || backend.id === "llama-cpp-mtp") {
|
|
30
|
-
// llama.cpp unloads when the server process exits; no HTTP unload API exists.
|
|
31
|
-
// If offgrid-ai started the server, stopProfile already handled it.
|
|
32
|
-
return { unloaded: false, backend: backend.id, reason: "stop server to unload" };
|
|
33
|
-
}
|
|
34
|
-
|
|
35
|
-
if (backend.id === "omlx") {
|
|
36
|
-
return await unloadOmlxModel(profile);
|
|
37
|
-
}
|
|
38
|
-
|
|
39
|
-
return { unloaded: false, backend: backend.id, reason: "unsupported backend" };
|
|
40
|
-
}
|
|
41
|
-
|
|
42
|
-
async function unloadOmlxModel(profile) {
|
|
43
|
-
const baseUrl = profile.baseUrl?.replace(/\/v1\/?$/u, "") || "";
|
|
44
|
-
const adminUrl = `${baseUrl}/admin/api/models`;
|
|
45
|
-
const modelId = profile.modelAlias || profile.omlxModel || profile.id;
|
|
46
|
-
|
|
47
|
-
try {
|
|
48
|
-
const ids = await serverModelIds(profile.baseUrl);
|
|
49
|
-
const match = ids.find((id) => id.toLowerCase() === modelId.toLowerCase());
|
|
50
|
-
const targetId = match ?? modelId;
|
|
51
|
-
|
|
52
|
-
const response = await fetch(`${adminUrl}/${encodeURIComponent(targetId)}/unload`, {
|
|
53
|
-
method: "POST",
|
|
54
|
-
headers: { "Content-Type": "application/json" },
|
|
55
|
-
signal: AbortSignal.timeout(30000),
|
|
56
|
-
});
|
|
57
|
-
|
|
58
|
-
if (response.ok) {
|
|
59
|
-
return { unloaded: true, backend: "omlx", modelId: targetId };
|
|
60
|
-
}
|
|
61
|
-
|
|
62
|
-
const detail = await responseErrorDetail(response);
|
|
63
|
-
|
|
64
|
-
if (response.status === 400 && /not loaded/i.test(detail)) {
|
|
65
|
-
return { unloaded: true, backend: "omlx", modelId: targetId, reason: "model was not loaded" };
|
|
66
|
-
}
|
|
67
|
-
|
|
68
|
-
if (response.status === 401 || response.status === 403) {
|
|
69
|
-
return {
|
|
70
|
-
unloaded: false,
|
|
71
|
-
backend: "omlx",
|
|
72
|
-
modelId: targetId,
|
|
73
|
-
error: "oMLX admin authentication required. Enable skip_api_key_verification in oMLX settings, or unload manually from the admin panel.",
|
|
74
|
-
};
|
|
75
|
-
}
|
|
76
|
-
|
|
77
|
-
return { unloaded: false, backend: "omlx", modelId: targetId, error: `HTTP ${response.status}: ${detail}` };
|
|
78
|
-
} catch (err) {
|
|
79
|
-
if (err?.name === "AbortError" || err?.name === "TimeoutError") {
|
|
80
|
-
return { unloaded: false, backend: "omlx", modelId, error: "Unload request timed out. The model may still be unloading in the background." };
|
|
81
|
-
}
|
|
82
|
-
return { unloaded: false, backend: "omlx", modelId, error: err.message };
|
|
83
|
-
}
|
|
84
|
-
}
|
|
85
|
-
|
|
86
|
-
async function responseErrorDetail(response) {
|
|
87
|
-
const text = await response.text().catch(() => "");
|
|
88
|
-
if (!text) return "";
|
|
89
|
-
try {
|
|
90
|
-
const body = JSON.parse(text);
|
|
91
|
-
return body?.detail ?? body?.message ?? text;
|
|
92
|
-
} catch {
|
|
93
|
-
return text;
|
|
94
|
-
}
|
|
95
|
-
}
|
|
96
|
-
|
|
97
10
|
export async function finalizeBenchmarkRun(runDirectory, runResult, speedMetrics, speedMetricsError = null) {
|
|
98
11
|
const metadataPath = join(runDirectory, "metadata.json");
|
|
99
12
|
const metadata = JSON.parse(await readFile(metadataPath, "utf8"));
|
package/src/benchmark/flow.mjs
CHANGED
|
@@ -4,7 +4,7 @@ import { join } from "node:path";
|
|
|
4
4
|
import { ensureDirs } from "../config.mjs";
|
|
5
5
|
import { backendFor } from "../backends.mjs";
|
|
6
6
|
import { hasPi, hasPiModel, syncPiConfig } from "../harness-pi.mjs";
|
|
7
|
-
import { serverReady, startServer, waitForReady, stopProfile, modelAvailableOnServer } from "../process.mjs";
|
|
7
|
+
import { serverReady, startServer, waitForReady, stopProfile, modelAvailableOnServer, unloadModelFromServer } from "../process.mjs";
|
|
8
8
|
import { loadProfiles } from "../profiles.mjs";
|
|
9
9
|
import { pc, createPrompt } from "../ui.mjs";
|
|
10
10
|
import { linkBenchmarkRepo } from "./repo.mjs";
|
|
@@ -12,12 +12,11 @@ import { loadBenchmarks } from "./shared.mjs";
|
|
|
12
12
|
import { prepareBenchmarkRun } from "./prepare.mjs";
|
|
13
13
|
import { runBenchmarkInPi } from "./pi-runner.mjs";
|
|
14
14
|
import { queryServerMetrics } from "./metrics.mjs";
|
|
15
|
-
import { unloadModelFromServer } from "./finalize.mjs";
|
|
16
15
|
import { finalizeBenchmarkRun, renderBenchmarkSummary } from "./finalize.mjs";
|
|
17
16
|
|
|
18
17
|
function benchmarkModelSource(profile) {
|
|
19
18
|
if (!profile) return "cloud";
|
|
20
|
-
return profile.providerId === "llama-cpp-mtp" ? "llama-cpp-mtp" : profile.backend === "
|
|
19
|
+
return profile.providerId === "llama-cpp-mtp" ? "llama-cpp-mtp" : profile.backend === "omlx" ? "omlx" : "llama-cpp";
|
|
21
20
|
}
|
|
22
21
|
|
|
23
22
|
async function chooseBenchmarkAction(prompt, canRun) {
|
|
@@ -29,7 +28,7 @@ async function chooseBenchmarkAction(prompt, canRun) {
|
|
|
29
28
|
}
|
|
30
29
|
|
|
31
30
|
function managedModelId(profile) {
|
|
32
|
-
return profile.omlxModel ?? profile.
|
|
31
|
+
return profile.omlxModel ?? profile.modelAlias ?? profile.label;
|
|
33
32
|
}
|
|
34
33
|
|
|
35
34
|
async function ensureManagedModelAvailableForBenchmark(profile, backend) {
|
|
@@ -1,7 +1,6 @@
|
|
|
1
1
|
// ── Backend-aware server speed metrics ───────────────────────────────────────
|
|
2
2
|
|
|
3
3
|
import { backendFor } from "../backends.mjs";
|
|
4
|
-
import { apiRootUrl } from "../process.mjs";
|
|
5
4
|
|
|
6
5
|
const BENCH_SPEED_PROMPT = "Write a one-sentence summary of machine learning.";
|
|
7
6
|
const SPEED_QUERY_TIMEOUT_MS = 120_000;
|
|
@@ -16,9 +15,6 @@ export async function queryServerMetrics(profile) {
|
|
|
16
15
|
if (backend.id === "omlx") {
|
|
17
16
|
return await queryOmlxMetrics(profile);
|
|
18
17
|
}
|
|
19
|
-
if (backend.id === "ollama") {
|
|
20
|
-
return await queryOllamaMetrics(profile);
|
|
21
|
-
}
|
|
22
18
|
|
|
23
19
|
throw new Error(`Unsupported backend for benchmark speed metrics: ${backend.id}`);
|
|
24
20
|
}
|
|
@@ -115,43 +111,3 @@ async function queryOmlxMetrics(profile) {
|
|
|
115
111
|
metricSource: "oMLX /v1/chat/completions streaming include_usage",
|
|
116
112
|
};
|
|
117
113
|
}
|
|
118
|
-
|
|
119
|
-
async function queryOllamaMetrics(profile) {
|
|
120
|
-
const body = {
|
|
121
|
-
model: profile.modelAlias,
|
|
122
|
-
prompt: BENCH_SPEED_PROMPT,
|
|
123
|
-
stream: false,
|
|
124
|
-
options: { num_predict: SPEED_QUERY_MAX_TOKENS },
|
|
125
|
-
};
|
|
126
|
-
|
|
127
|
-
const apiBaseUrl = apiRootUrl(profile.baseUrl || backendFor(profile.backend).apiBaseUrl || "");
|
|
128
|
-
|
|
129
|
-
const response = await fetch(`${apiBaseUrl}/api/generate`, {
|
|
130
|
-
method: "POST",
|
|
131
|
-
headers: { "Content-Type": "application/json" },
|
|
132
|
-
body: JSON.stringify(body),
|
|
133
|
-
signal: AbortSignal.timeout(SPEED_QUERY_TIMEOUT_MS),
|
|
134
|
-
});
|
|
135
|
-
|
|
136
|
-
if (!response.ok) {
|
|
137
|
-
throw new Error(`Ollama speed query failed: ${response.status} ${response.statusText}`);
|
|
138
|
-
}
|
|
139
|
-
|
|
140
|
-
const data = await response.json();
|
|
141
|
-
const promptEvalNs = data.prompt_eval_duration ?? 0;
|
|
142
|
-
const evalNs = data.eval_duration ?? 0;
|
|
143
|
-
const loadNs = data.load_duration ?? 0;
|
|
144
|
-
|
|
145
|
-
const promptEvalCount = data.prompt_eval_count ?? 0;
|
|
146
|
-
const evalCount = data.eval_count ?? 0;
|
|
147
|
-
|
|
148
|
-
return {
|
|
149
|
-
prefillTokensPerSecond: promptEvalNs > 0 ? (promptEvalCount / (promptEvalNs / 1e9)) : null,
|
|
150
|
-
generationTokensPerSecond: evalNs > 0 ? (evalCount / (evalNs / 1e9)) : null,
|
|
151
|
-
ttftMs: promptEvalNs / 1e6,
|
|
152
|
-
modelLoadMs: loadNs / 1e6,
|
|
153
|
-
speculativeDecodeAcceptance: null,
|
|
154
|
-
kvCacheTokens: null,
|
|
155
|
-
metricSource: "Ollama /api/generate",
|
|
156
|
-
};
|
|
157
|
-
}
|
|
@@ -55,7 +55,7 @@ export async function prepareBenchmarkRun({ repoPath, benchmark, kind, modelId,
|
|
|
55
55
|
kind,
|
|
56
56
|
runId,
|
|
57
57
|
benchmark: { id: benchmark.id, title: benchmark.title, description: benchmark.description, prompt: benchmark.prompt },
|
|
58
|
-
model: { id: modelId, slug: modelSlug, displayName: parseModelName(modelId, modelSource === "
|
|
58
|
+
model: { id: modelId, slug: modelSlug, displayName: parseModelName(modelId, modelSource === "omlx" ? "omlx" : "local-gguf").display },
|
|
59
59
|
status: "prepared",
|
|
60
60
|
createdAt: now.toISOString(),
|
|
61
61
|
updatedAt: now.toISOString(),
|