project-ara 0.0.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- ara/__init__.py +7 -0
- ara/acquire.py +142 -0
- ara/apps.py +176 -0
- ara/backends/__init__.py +3 -0
- ara/backends/ane.py +10 -0
- ara/backends/apple.py +178 -0
- ara/backends/coral.py +10 -0
- ara/backends/cpu.py +151 -0
- ara/backends/cuda.py +217 -0
- ara/backends/esp32.py +12 -0
- ara/backends/hexagon.py +11 -0
- ara/backends/intel_npu.py +10 -0
- ara/backends/oneapi.py +10 -0
- ara/backends/vulkan.py +175 -0
- ara/backends/webgpu.py +11 -0
- ara/backends/xdna.py +11 -0
- ara/calibration.py +28 -0
- ara/catalog.py +325 -0
- ara/cli.py +1848 -0
- ara/contracts/__init__.py +9 -0
- ara/contracts/driver.py +87 -0
- ara/contracts/ramp.py +257 -0
- ara/contracts/worker.py +51 -0
- ara/db.py +207 -0
- ara/detect.py +592 -0
- ara/engine_env.py +152 -0
- ara/engines.py +253 -0
- ara/estimate.py +96 -0
- ara/hardware.py +1215 -0
- ara/hf_auth.py +168 -0
- ara/hub.py +35 -0
- ara/mlx.py +103 -0
- ara/profile.py +38 -0
- ara/pythons.py +312 -0
- ara/registry.py +68 -0
- ara/serialize.py +54 -0
- ara/status.py +252 -0
- ara/ui.py +86 -0
- ara/versions.py +98 -0
- ara/workers/__init__.py +2 -0
- ara/workers/cpu_llama.py +360 -0
- ara/workers/vulkan_llama.py +465 -0
- project_ara-0.0.1.dist-info/METADATA +152 -0
- project_ara-0.0.1.dist-info/RECORD +48 -0
- project_ara-0.0.1.dist-info/WHEEL +4 -0
- project_ara-0.0.1.dist-info/entry_points.txt +2 -0
- project_ara-0.0.1.dist-info/licenses/LICENSE +202 -0
- project_ara-0.0.1.dist-info/licenses/NOTICE +4 -0
ara/__init__.py
ADDED
|
@@ -0,0 +1,7 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2026 Will Sarg
|
|
3
|
+
"""Project ARA — AI Runs Anywhere.
|
|
4
|
+
|
|
5
|
+
Backend-agnostic local inference. Core is pure Python; hardware-specific
|
|
6
|
+
engines load lazily behind a backend protocol (see docs in the project_ara vault).
|
|
7
|
+
"""
|
ara/acquire.py
ADDED
|
@@ -0,0 +1,142 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2026 Will Sarg
|
|
3
|
+
"""Model acquisition — backend-neutral downloads into the HF cache.
|
|
4
|
+
|
|
5
|
+
``download(repo_id)`` fetches a model; ``repo_size_gb`` / ``free_disk_gb`` back the
|
|
6
|
+
pre-download disk check. Uses ``huggingface_hub`` directly, which produces the exact
|
|
7
|
+
cache layout that mlx_lm / wmx-suite reads.
|
|
8
|
+
|
|
9
|
+
No token required for ungated models (e.g. mlx-community/*). Set HF_TOKEN or
|
|
10
|
+
HUGGING_FACE_HUB_TOKEN for gated ones, or HF_ENDPOINT for a mirror.
|
|
11
|
+
"""
|
|
12
|
+
from __future__ import annotations
|
|
13
|
+
|
|
14
|
+
import os
|
|
15
|
+
import re
|
|
16
|
+
|
|
17
|
+
# Headroom we insist on beyond the raw download, so a fetch never fills the disk
|
|
18
|
+
# (unpacking, the snapshot's own .incomplete temp files, normal system churn).
|
|
19
|
+
DISK_BUFFER_GB = 2.0
|
|
20
|
+
|
|
21
|
+
# A well-formed Hugging Face repo id: ``name`` or ``org/name``, each segment starting with an
|
|
22
|
+
# alphanumeric. Rejects anything an out-of-process worker's argparse could mis-read as a flag or
|
|
23
|
+
# path — a leading ``-``, an ``=``, whitespace, ``..`` traversal, extra slashes. The model is a
|
|
24
|
+
# *sink arg* (it becomes argv for the engine worker), so ARA validates its shape before it ever
|
|
25
|
+
# leaves the process. Defensive: the value is a local CLI arg, but cheap to get right.
|
|
26
|
+
_MODEL_ID_RE = re.compile(r"^[A-Za-z0-9][A-Za-z0-9._-]*(/[A-Za-z0-9][A-Za-z0-9._-]*)?$")


def valid_model_id(model: str) -> bool:
    """Return True if *model* is a well-formed HF repo id (``name`` or ``org/name``).

    Each segment must start with an alphanumeric and may continue with alphanumerics, ``.``,
    ``_`` or ``-``; at most one ``/`` is allowed. Non-string values are rejected rather than
    raising, so the function is safe to call on unvalidated input.
    """
    # fullmatch, not match: "$" also matches just before a trailing newline, so match() would
    # let a value ending in a newline through — whitespace this guard is meant to refuse
    # before the value becomes worker argv.
    return isinstance(model, str) and _MODEL_ID_RE.fullmatch(model) is not None
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def is_local_gguf(model: str) -> bool:
    """Return True if *model* names an existing local ``.gguf`` file usable as a worker positional.

    Three conditions must all hold: the value is a string ending in ``.gguf``, it does not
    start with ``-`` (so it can never be read as a flag), and a regular file exists at that
    path. Anything else — including non-string input — yields False.
    (Slug: 2026-06-25-local-gguf-cli-support)
    """
    if not isinstance(model, str):
        return False
    # Cheap string checks first; only touch the filesystem for plausible candidates.
    if model.startswith("-") or not model.endswith(".gguf"):
        return False
    return os.path.isfile(model)
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
def valid_model_ref(model: str) -> bool:
    """Return True if *model* is acceptable as an engine-worker model argument.

    Accepts either of the two supported reference shapes: a well-formed HF repo id
    (``valid_model_id``) or an existing local ``.gguf`` file (``is_local_gguf``). The repo-id
    check runs first, so the filesystem is only consulted when it fails.
    (Slug: 2026-06-25-local-gguf-cli-support)
    """
    if valid_model_id(model):
        return True
    return is_local_gguf(model)
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
_REASON_GATED = "gated"
|
|
53
|
+
_REASON_NOT_FOUND = "not_found"
|
|
54
|
+
_REASON_AUTH = "auth"
|
|
55
|
+
_REASON_OFFLINE = "offline"
|
|
56
|
+
_REASON_UNKNOWN = "unknown"
|
|
57
|
+
|
|
58
|
+
|
|
59
|
+
def classify_repo_error(exc: BaseException) -> str:
    """Translate an exception from a Hub call into one of the short reason strings.

    The result is always one of ``"gated"``, ``"not_found"``, ``"auth"``, ``"offline"`` or
    ``"unknown"``; any exception type may be passed in. The ``huggingface_hub`` error classes
    are imported inside the function so importing this module does not pull them in.
    """
    from huggingface_hub.errors import (
        GatedRepoError, HfHubHTTPError, LocalEntryNotFoundError,
        OfflineModeIsEnabled, RepositoryNotFoundError,
    )

    # Checked top to bottom; the gated check deliberately precedes the not-found check.
    type_reasons = (
        (GatedRepoError, _REASON_GATED),
        (RepositoryNotFoundError, _REASON_NOT_FOUND),
        ((LocalEntryNotFoundError, OfflineModeIsEnabled, ConnectionError), _REASON_OFFLINE),
    )
    for exc_types, reason in type_reasons:
        if isinstance(exc, exc_types):
            return reason

    # An HTTP error only counts as an auth failure when its response carries status 401.
    if isinstance(exc, HfHubHTTPError):
        response = getattr(exc, "response", None)
        if getattr(response, "status_code", None) == 401:
            return _REASON_AUTH
    return _REASON_UNKNOWN
|
|
83
|
+
|
|
84
|
+
|
|
85
|
+
def probe_repo(repo_id: str) -> dict:
    """Look up *repo_id* on the Hub and report ``{"size_gb": float|None, "reason": str|None}``.

    On success ``reason`` is None and ``size_gb`` is the summed size of the repo's files in
    decimal GB rounded to three places — or None when that sum is zero. On any exception
    ``size_gb`` is None and ``reason`` is the ``classify_repo_error`` string for it, which is
    what lets a caller explain *why* the probe failed. ``repo_size_gb`` is the shortcut when
    only the number matters.
    """
    from huggingface_hub import HfApi

    try:
        info = HfApi().model_info(repo_id, files_metadata=True)
        total_bytes = 0
        for sibling in info.siblings or []:
            # Entries with a missing or zero size contribute nothing.
            if sibling.size:
                total_bytes += sibling.size
        size_gb = round(total_bytes / 1e9, 3) if total_bytes else None
    except Exception as exc:
        return {"size_gb": None, "reason": classify_repo_error(exc)}
    return {"size_gb": size_gb, "reason": None}
|
|
101
|
+
|
|
102
|
+
|
|
103
|
+
def repo_size_gb(repo_id: str) -> float | None:
    """Return the download size of *repo_id* in decimal GB, or None when it is unknown.

    Thin wrapper over ``probe_repo`` that discards the failure reason; None covers every
    case in which the probe could not produce a size.
    """
    probe = probe_repo(repo_id)
    return probe["size_gb"]
|
|
107
|
+
|
|
108
|
+
|
|
109
|
+
def free_disk_gb(path: str | os.PathLike[str] | None = None) -> float | None:
    """Return the free space (decimal GB) on the volume holding *path*.

    *path* defaults to the user's home directory. Pass a different directory when the
    download target may live on another volume (for example a relocated cache) so the
    pre-download disk check measures the disk that will actually be written to.

    Returns None if the space cannot be determined (missing path, unresolvable home
    directory, or any other failure) — callers treat None as "unknown".
    """
    import shutil
    from pathlib import Path

    try:
        target = Path.home() if path is None else path
        return shutil.disk_usage(target).free / 1e9
    except Exception:
        # Best-effort probe: an unknown answer must never abort the caller's flow.
        return None
|
|
118
|
+
|
|
119
|
+
|
|
120
|
+
def download(repo_id: str, *, progress: bool = False) -> None:
    """Fetch *repo_id* into the HF cache via ``snapshot_download``.

    ``progress`` selects whether huggingface_hub's progress bars are enabled (True) or
    disabled (False, the default) while the download runs. Whatever the bar setting was on
    entry is put back afterwards in a ``finally``, so it is restored whether the download
    succeeds or raises.
    """
    from huggingface_hub import snapshot_download
    from huggingface_hub.utils import are_progress_bars_disabled, disable_progress_bars, enable_progress_bars

    bars_were_off = are_progress_bars_disabled()
    apply_requested = enable_progress_bars if progress else disable_progress_bars
    apply_requested()
    try:
        snapshot_download(repo_id)
    finally:
        # Put the global bar switch back exactly as we found it.
        restore_prior = disable_progress_bars if bars_were_off else enable_progress_bars
        restore_prior()
|
ara/apps.py
ADDED
|
@@ -0,0 +1,176 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2026 Will Sarg
|
|
3
|
+
"""Inventory of AI/ML applications installed on the machine — GUI apps in /Applications
|
|
4
|
+
plus Homebrew packages — matched against a curated catalog of known AI/ML software.
|
|
5
|
+
|
|
6
|
+
A different lens from ENGINES (what ARA can launch) and FRAMEWORKS (python libraries):
|
|
7
|
+
this is "what AI software is installed here," organized by what it's for. Read-only.
|
|
8
|
+
macOS-focused (scans /Applications + Homebrew); degrades to whatever it can find elsewhere.
|
|
9
|
+
"""
|
|
10
|
+
from __future__ import annotations
|
|
11
|
+
|
|
12
|
+
from dataclasses import dataclass
|
|
13
|
+
from pathlib import Path
|
|
14
|
+
|
|
15
|
+
from ara import versions
|
|
16
|
+
|
|
17
|
+
# Category keys in display order, with their section sub-headers.
|
|
18
|
+
CATEGORY_LABEL = {
|
|
19
|
+
"runner": "model runners",
|
|
20
|
+
"image": "image generation",
|
|
21
|
+
"speech": "speech / audio",
|
|
22
|
+
"toolkit": "ML toolkits",
|
|
23
|
+
"assistant": "AI assistants",
|
|
24
|
+
"coding": "AI coding",
|
|
25
|
+
}
|
|
26
|
+
_ORDER = list(CATEGORY_LABEL)
|
|
27
|
+
|
|
28
|
+
# (label, category, [.app bundle names], [brew formula/cask tokens]). Curated — matched
|
|
29
|
+
# exactly (case-insensitive), no keyword guessing, so a hit is always a real known app.
|
|
30
|
+
CATALOG: list[tuple[str, str, list[str], list[str]]] = [
|
|
31
|
+
# local model runners / chat frontends
|
|
32
|
+
("LM Studio", "runner", ["LM Studio"], ["lm-studio"]),
|
|
33
|
+
("Ollama", "runner", ["Ollama"], ["ollama"]),
|
|
34
|
+
("GPT4All", "runner", ["GPT4All", "gpt4all"], ["gpt4all"]),
|
|
35
|
+
("Jan", "runner", ["Jan"], ["jan"]),
|
|
36
|
+
("Msty", "runner", ["Msty"], ["msty"]),
|
|
37
|
+
("Enchanted", "runner", ["Enchanted"], []),
|
|
38
|
+
("Ollamac", "runner", ["Ollamac"], []),
|
|
39
|
+
("Pinokio", "runner", ["Pinokio"], ["pinokio"]),
|
|
40
|
+
("Transformer Lab", "runner", ["Transformer Lab"], []),
|
|
41
|
+
# image generation
|
|
42
|
+
("DiffusionBee", "image", ["DiffusionBee"], ["diffusionbee"]),
|
|
43
|
+
("Draw Things", "image", ["Draw Things"], []),
|
|
44
|
+
("ComfyUI", "image", ["ComfyUI"], []),
|
|
45
|
+
("InvokeAI", "image", ["InvokeAI"], []),
|
|
46
|
+
("Diffusers", "image", ["Diffusers"], []),
|
|
47
|
+
("Fooocus", "image", ["Fooocus"], []),
|
|
48
|
+
# speech / audio
|
|
49
|
+
("MacWhisper", "speech", ["MacWhisper"], ["macwhisper"]),
|
|
50
|
+
("superwhisper", "speech", ["superwhisper"], ["superwhisper"]),
|
|
51
|
+
("VoiceInk", "speech", ["VoiceInk"], ["voiceink"]),
|
|
52
|
+
("Aiko", "speech", ["Aiko"], []),
|
|
53
|
+
("Whisper Transcription", "speech", ["Whisper Transcription"], []),
|
|
54
|
+
# ML toolkits / CLIs (largely Homebrew)
|
|
55
|
+
("llama.cpp", "toolkit", [], ["llama.cpp"]),
|
|
56
|
+
("whisper.cpp", "toolkit", [], ["whisper-cpp"]),
|
|
57
|
+
("MLX", "toolkit", [], ["mlx", "mlx-c"]),
|
|
58
|
+
("ggml", "toolkit", [], ["ggml"]),
|
|
59
|
+
("ONNX Runtime", "toolkit", [], ["onnxruntime"]),
|
|
60
|
+
("PyTorch", "toolkit", [], ["pytorch"]),
|
|
61
|
+
("TensorFlow", "toolkit", [], ["tensorflow"]),
|
|
62
|
+
("Hugging Face CLI", "toolkit", [], ["huggingface-cli"]),
|
|
63
|
+
# AI assistants (cloud clients) and AI coding tools
|
|
64
|
+
("ChatGPT", "assistant", ["ChatGPT"], ["chatgpt"]),
|
|
65
|
+
("Claude", "assistant", ["Claude"], ["claude"]),
|
|
66
|
+
("Perplexity", "assistant", ["Perplexity"], ["perplexity"]),
|
|
67
|
+
("Cursor", "coding", ["Cursor"], ["cursor"]),
|
|
68
|
+
("Windsurf", "coding", ["Windsurf"], ["windsurf"]),
|
|
69
|
+
("Antigravity", "coding", ["Antigravity"], []),
|
|
70
|
+
# Codex ships as two distinct artifacts — keep them separate so their independent
|
|
71
|
+
# versions aren't compared as "drift" (the .app is com.openai.codex; the cask is the CLI).
|
|
72
|
+
("Codex", "coding", ["Codex"], []),
|
|
73
|
+
("Codex CLI", "coding", [], ["codex"]),
|
|
74
|
+
("CodexBar", "coding", ["CodexBar"], ["codexbar"]),
|
|
75
|
+
("Claude Code", "coding", [], ["claude-code"]),
|
|
76
|
+
("GitHub Copilot", "coding", ["GitHub Copilot"], ["copilot"]),
|
|
77
|
+
("Warp", "coding", ["Warp"], ["warp"]),
|
|
78
|
+
]
|
|
79
|
+
|
|
80
|
+
|
|
81
|
+
@dataclass(frozen=True)
class App:
    """One installed catalog entry, with how it was installed and which version is present."""

    label: str
    category: str
    in_app: bool        # True when a .app bundle was found in an Applications folder
    cask: bool          # True when installed as a Homebrew cask (the cask is what placed the .app)
    formula: bool       # True when installed as a Homebrew formula (CLI)
    version: str | None = None         # the version actually installed (.app plist for GUIs)
    brew_recorded: str | None = None   # Homebrew's receipt version, when brew manages this
    cask_token: str | None = None      # the brew cask token that matched (for drift remediation)
    installed_at: float | None = None  # epoch mtime/birthtime, for "recently installed"

    @property
    def homebrew(self) -> bool:
        """True when Homebrew manages this entry in any form (cask or formula)."""
        return self.cask or self.formula

    @property
    def drift(self) -> bool:
        """True when a cask-installed .app reports a different version than brew's receipt.

        All of these must hold: it is a cask, a .app is actually present, both versions are
        known, and they differ. A cask without an installed .app therefore never drifts.
        """
        if not (self.cask and self.in_app):
            return False
        if not self.brew_recorded or not self.version:
            return False
        return self.version != self.brew_recorded

    @property
    def duplicate(self) -> bool:
        """True when a formula coexists with a GUI install (a cask or a .app).

        A cask together with its own .app is not a duplicate: without a formula this is
        always falsy.
        """
        if not self.formula:
            return self.formula
        return self.cask or self.in_app

    @property
    def source(self) -> str:
        """Human-readable description of where this install came from."""
        if self.cask:
            return "Homebrew (cask + formula)" if self.formula else "Homebrew (cask)"
        if self.formula:
            return "Homebrew (formula) + separate app" if self.in_app else "Homebrew (formula)"
        return "app (not via Homebrew)"
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
_APP_DIRS = (Path("/Applications"), Path.home() / "Applications")
|
|
126
|
+
_BREW_PREFIX = Path("/opt/homebrew") if Path("/opt/homebrew").exists() else Path("/usr/local")
|
|
127
|
+
|
|
128
|
+
|
|
129
|
+
def _install_time(bundles: list[str], tokens: list[str], in_app: bool) -> float | None:
    """Best-effort install/update time (epoch seconds) for a catalog entry, or None.

    When *in_app* is true, the first ``<bundle>.app`` directory found under ``_APP_DIRS``
    wins and its ``max(mtime, birthtime)`` is returned (birthtime is not available on every
    platform, and some bundles report a bogus one). Otherwise — or if no bundle can be read —
    the first readable ``Caskroom``/``Cellar`` directory for any of *tokens* under
    ``_BREW_PREFIX`` is used: the newest mtime among its entries, or the directory's own
    mtime when it is empty.

    Filesystem errors never propagate: an unreadable candidate is skipped and the search
    continues with the next one, so a single bad entry cannot break the whole scan.
    """
    if in_app:
        for bundle in bundles:
            for base in _APP_DIRS:
                app = base / f"{bundle}.app"
                try:
                    if not app.is_dir():
                        continue
                    st = app.stat()
                except Exception:
                    # Vanished or unreadable between checks — try the next location.
                    continue
                return max(st.st_mtime, getattr(st, "st_birthtime", 0) or 0)
    for token in tokens:
        for sub in ("Caskroom", "Cellar"):
            brew_dir = _BREW_PREFIX / sub / token
            try:
                if not brew_dir.is_dir():
                    continue
                return max((p.stat().st_mtime for p in brew_dir.iterdir()),
                           default=brew_dir.stat().st_mtime)
            except Exception:
                # e.g. a dangling symlink inside the directory: fall through to the
                # remaining candidates instead of giving up on the entry entirely.
                continue
    return None
|
|
148
|
+
|
|
149
|
+
|
|
150
|
+
def scan() -> list[App]:
    """Return the catalog entries installed on this machine, sorted by category then label.

    Each ``CATALOG`` row is checked against three sources: a .app bundle
    (``versions.find_app``), the installed Homebrew casks and the installed Homebrew
    formulae. Rows that match none of them are omitted. Category order follows ``_ORDER``;
    labels sort case-insensitively within a category.
    """
    formulae = versions.brew_formulae()
    casks = versions.brew_casks()

    found: list[App] = []
    for label, category, bundles, tokens in CATALOG:
        in_app, app_ver = versions.find_app(bundles)
        cask_token = next((t for t in tokens if t in casks), None)
        cask = cask_token is not None
        formula = any(t in formulae for t in tokens)
        if not (in_app or cask or formula):
            continue

        # First token with a non-empty recorded version, per brew source.
        cask_ver = next((casks[t] for t in tokens if casks.get(t)), None)
        formula_ver = next((formulae[t] for t in tokens if formulae.get(t)), None)

        # Which version is reported depends on how the entry was installed:
        #  - cask: prefer the .app's own version, and keep brew's receipt alongside it so
        #    a difference between the two can be detected;
        #  - formula: brew's recorded version is the one reported;
        #  - neither: a hand-installed .app, so only the bundle's version is available.
        if cask:
            version = app_ver or cask_ver
            brew_recorded = cask_ver
        elif formula:
            version = formula_ver
            brew_recorded = None
        else:
            version = app_ver
            brew_recorded = None

        found.append(App(
            label, category,
            in_app=in_app, cask=cask, formula=formula,
            version=version, brew_recorded=brew_recorded, cask_token=cask_token,
            installed_at=_install_time(bundles, tokens, in_app),
        ))

    return sorted(found, key=lambda a: (_ORDER.index(a.category), a.label.lower()))
|
ara/backends/__init__.py
ADDED
ara/backends/ane.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2026 Will Sarg
|
|
3
|
+
"""Apple Neural Engine (ANE / CoreML) — STUB (no implementation yet).
|
|
4
|
+
|
|
5
|
+
Contract class: **graph-fit** (not a context ramp). The question here isn't "how far
|
|
6
|
+
can KV-cache grow" but "does this fixed, quantized graph map onto the accelerator and
|
|
7
|
+
its memory slice." A different assessment from apple.py, which targets the *GPU*
|
|
8
|
+
(MLX/Metal) on the same chip — a modern Mac carries both backends at once.
|
|
9
|
+
Wall source: unified memory shared with the system; programmed via CoreML.
|
|
10
|
+
"""
|
ara/backends/apple.py
ADDED
|
@@ -0,0 +1,178 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2026 Will Sarg
|
|
3
|
+
"""Apple-Silicon backend adapter — drives wmx-suite's MLX measurement out-of-process.
|
|
4
|
+
|
|
5
|
+
A lean device oracle, symmetric with backends/cuda.py: it reads the machine's memory wall and
|
|
6
|
+
runs wmx-suite's crash-safe calibration, but it owns **no persistence** — ARA stores and reuses
|
|
7
|
+
the calibration (see cli.render_profile). It never imports wmx in-process: every engine call
|
|
8
|
+
goes through the isolated ``apple`` env via :mod:`ara.engine_env`, so nothing MLX-shaped loads
|
|
9
|
+
in ARA's interpreter and the core stays engine-free at runtime, not just at lock time.
|
|
10
|
+
"""
|
|
11
|
+
from __future__ import annotations
|
|
12
|
+
|
|
13
|
+
# Core, engine-free helpers (no wmx) — safe to import at module load and patchable in tests.
|
|
14
|
+
from ara import calibration, db, engine_env
|
|
15
|
+
from ara.contracts import driver
|
|
16
|
+
|
|
17
|
+
# The wmx worker modules ARA drives in the isolated apple env (never imported in-process).
|
|
18
|
+
DEVICE_MODULE = "wmx_suite.device"
|
|
19
|
+
|
|
20
|
+
# Model ARA calibrates against — smallest SmolLM (MLX 4-bit). Calibration only measures
|
|
21
|
+
# fixed memory overhead, so a tiny instruct model is plenty.
|
|
22
|
+
CALIBRATION_MODEL = "mlx-community/SmolLM-135M-Instruct-4bit"
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
def safe_limits() -> dict:
    """Return this machine's memory limits as reported by the device worker's ``limits`` command.

    The worker runs out-of-process in the ``apple`` env. Its facts are returned with three
    keys forced to the uncalibrated state — ``overhead_gb=None``, ``calibrated=False``,
    ``calibrated_at=None`` — because stored calibration is ARA's responsibility and is
    layered on by the caller, not read here.
    """
    limits = dict(engine_env.run_worker("apple", ["-m", DEVICE_MODULE, "limits"]))
    # Always report "uncalibrated", overriding anything the worker sent for these keys.
    limits.update(overhead_gb=None, calibrated=False, calibrated_at=None)
    return limits
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def calibration_model_cached(model: str = CALIBRATION_MODEL) -> bool:
    """Return True if *model*'s ``config.json`` is already present in the HF cache.

    Uses ``try_to_load_from_cache`` and treats only a string result as a hit; any other
    result, or any exception from the lookup, counts as "not cached".
    """
    from huggingface_hub import try_to_load_from_cache

    try:
        cached = try_to_load_from_cache(model, "config.json")
    except Exception:
        return False
    return isinstance(cached, str)
|
|
48
|
+
|
|
49
|
+
|
|
50
|
+
def download_calibration_model(model: str = CALIBRATION_MODEL, *,
                               progress: bool = False) -> None:
    """Download the calibration model into the HF cache by delegating to ``ara.acquire.download``.

    ``progress`` is forwarded unchanged. The import happens at call time, inside the function.
    """
    from ara.acquire import download

    download(model, progress=progress)
|
|
56
|
+
|
|
57
|
+
|
|
58
|
+
def calibrate(model: str = CALIBRATION_MODEL) -> dict:
    """Run the device worker's calibration for *model* and return limits annotated with the outcome.

    Starts from ``safe_limits()`` and invokes ``<DEVICE_MODULE> calibrate <model>``
    out-of-process in the ``apple`` env.

    On success the returned dict has ``calibrated=True``, ``overhead_gb`` set to the
    effective overhead — ``max(default_overhead_gb, measured_overhead_gb)``, i.e. the
    measurement clamped to the worker-reported default — and the raw worker result under
    ``"calibration"``.

    The result is reported as uncalibrated (``calibrated=False``, ``overhead_gb=None``, plus
    a ``calibration_error`` message) when the worker call raises, when the worker returns an
    ``error`` entry, or when it returns no ``measured_overhead_gb``. The last case matters:
    without a measurement nothing was observed, so the result must not claim to be
    calibrated. Whenever a worker result exists it is still attached under ``"calibration"``.
    """
    limits = safe_limits()
    try:
        result = engine_env.run_worker("apple", ["-m", DEVICE_MODULE, "calibrate", model])
    except Exception as exc:
        # Worker boundary: any failure to run it degrades to "uncalibrated", never a crash.
        return _mark_uncalibrated(limits, model, exc)

    if result.get("error"):
        _mark_uncalibrated(limits, model, result["error"])
        limits["calibration"] = result
        return limits

    measured = result.get("measured_overhead_gb")
    if measured is None:
        # No measurement came back — reporting the default as "calibrated" would present
        # unobserved data as observed.
        _mark_uncalibrated(limits, model, "worker returned no measured overhead")
        limits["calibration"] = result
        return limits

    floor = result.get("default_overhead_gb")
    limits["overhead_gb"] = measured if floor is None else max(measured, floor)
    limits["calibrated"] = True
    limits["calibration"] = result
    return limits


def _mark_uncalibrated(limits: dict, model: str, detail: object) -> dict:
    """Stamp *limits* in place as uncalibrated, recording *detail* as the reason; return it."""
    limits["calibrated"] = False
    limits["overhead_gb"] = None
    limits["calibration_error"] = (
        f"calibration unavailable for {model!r}: {detail}"
    )
    return limits
|
|
97
|
+
|
|
98
|
+
|
|
99
|
+
# ARA-owned ramp policy (the engine only measures; ARA decides the schedule + safety margin).
|
|
100
|
+
WORKER_MODULE = "wmx_suite.measure_one"
|
|
101
|
+
RAMP_SCHEDULE = [2000, 4000, 8000, 16000, 32000, 65536, 131072]
|
|
102
|
+
DEFAULT_MARGIN_GB = 2.0 # safety cushion below the wall (ARA policy)
|
|
103
|
+
DEFAULT_OVERHEAD_GB = 1.0 # fallback cold-start overhead until calibrated
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def _budget_params() -> tuple[float, float]:
    """Return the ``(margin, overhead)`` pair, in GB, used to budget a worker run.

    The margin is always ``DEFAULT_MARGIN_GB``. The overhead is the ``fixed_overhead_gb``
    stored for the ``"wmx"`` engine in ARA's calibration store when one exists, and
    ``DEFAULT_OVERHEAD_GB`` otherwise.
    """
    stored = calibration.get_calibration(db.connect(), "wmx")
    if stored:
        calibrated_overhead = stored.get("fixed_overhead_gb")
        if calibrated_overhead is not None:
            return DEFAULT_MARGIN_GB, calibrated_overhead
    # No usable stored calibration: fall back to the default overhead.
    return DEFAULT_MARGIN_GB, DEFAULT_OVERHEAD_GB
|
|
114
|
+
|
|
115
|
+
|
|
116
|
+
# KV-cache quant lever (parity with the Vulkan lane). ARA's cross-engine `--kv-quant`
|
|
117
|
+
# {f16,q8_0,q4_0} maps to MLX's integer kv-bits (fp16 = no quant). The effective bytes/elem
|
|
118
|
+
# (8-bit/4-bit payload + an fp16 scale+bias per 64-elem group) feeds the KV-aware decode
|
|
119
|
+
# estimate so it reflects the cache actually in use — not always fp16.
|
|
120
|
+
_MLX_KV_BITS = {"f16": None, "q8_0": 8, "q4_0": 4}
|
|
121
|
+
_MLX_KV_BYTES = {"f16": 2.0, "q8_0": 8 / 8 + 2 * 2 / 64, "q4_0": 4 / 8 + 2 * 2 / 64}
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def _worker_argv(model: str, ctx: int, margin: float, overhead: float, *,
                 preflight: bool = False, kv_quant: str = "f16") -> list[str]:
    """Build the interpreter arguments that launch ``WORKER_MODULE`` for one probe.

    The result is ``-m <module> <model> <ctx> --margin <m> --overhead <o>``, followed by
    ``--preflight`` when requested and by ``--kv-bits <n>`` when *kv_quant* maps to an
    integer bit width in ``_MLX_KV_BITS`` (``"f16"`` maps to None and adds nothing).
    Raises ``KeyError`` for a *kv_quant* that is not in that table.
    """
    kv_bits = _MLX_KV_BITS[kv_quant]
    argv = [
        "-m", WORKER_MODULE, model, str(ctx),
        "--margin", str(margin),
        "--overhead", str(overhead),
    ]
    if preflight:
        argv.append("--preflight")
    if kv_bits is not None:
        argv.extend(["--kv-bits", str(kv_bits)])
    return argv
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
def characterize(model: str, *, progress: bool = False, kv_quant: str = "f16") -> dict:
    """Measure *model*'s safe context ceiling by handing Apple-specific hooks to the shared driver.

    The methodology lives in ``contracts.driver.characterize``; this adapter only supplies
    the pieces that are specific to this backend:

    * a preflight hook and a per-context measure hook, each running the ``measure_one``
      worker out-of-process in the ``apple`` env with the current budget params,
    * the ramp ``RAMP_SCHEDULE``,
    * the KV-cache bytes-per-element for *kv_quant*, so the driver's estimate reflects the
      cache type actually in use.

    Returns whatever the driver returns. ``progress`` is accepted for interface symmetry
    with other backends and is not used here.
    """
    margin, overhead = _budget_params()

    def _preflight(m):
        # Context 0 + --preflight: ask the worker to vet the model before any real rung.
        argv = _worker_argv(m, 0, margin, overhead, preflight=True, kv_quant=kv_quant)
        return engine_env.run_worker("apple", argv)

    def _measure(m, ctx):
        argv = _worker_argv(m, ctx, margin, overhead, kv_quant=kv_quant)
        return engine_env.run_worker("apple", argv)

    return driver.characterize(
        model,
        preflight=_preflight,
        measure=_measure,
        schedule=RAMP_SCHEDULE,
        kv_dtype_bytes=_MLX_KV_BYTES[kv_quant],
    )
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
DEFAULT_MAX_TOKENS = 256
|
|
162
|
+
|
|
163
|
+
|
|
164
|
+
def generate(model, prompt, *, max_context, max_tokens=DEFAULT_MAX_TOKENS,
             kv_quant: str = "f16") -> dict:
    """Run a one-shot completion through the ``wmx_suite.generate`` worker.

    The worker is launched out-of-process in the ``apple`` env with *max_context*, the
    current budget params (margin and overhead) and *max_tokens* on its command line, plus
    ``--kv-bits`` when *kv_quant* maps to an integer bit width. The *prompt* is delivered on
    the worker's stdin rather than as an argument. Returns the worker's result dict
    unchanged. Raises ``KeyError`` for an unknown *kv_quant*.
    """
    margin, overhead = _budget_params()
    kv_bits = _MLX_KV_BITS[kv_quant]
    argv = [
        "-m", "wmx_suite.generate", model, str(max_context),
        "--margin", str(margin),
        "--overhead", str(overhead),
        "--max-tokens", str(max_tokens),
    ]
    if kv_bits is not None:
        argv.extend(["--kv-bits", str(kv_bits)])
    return engine_env.run_worker("apple", argv, input=prompt)
|
ara/backends/coral.py
ADDED
|
@@ -0,0 +1,10 @@
|
|
|
1
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
2
|
+
# Copyright 2026 Will Sarg
|
|
3
|
+
"""Google Coral / Edge TPU — STUB (no implementation yet).
|
|
4
|
+
|
|
5
|
+
Contract class: **graph-fit** (not a context ramp). A fixed-function edge accelerator
|
|
6
|
+
(USB/PCIe/SoM) that runs only INT8 TFLite models compiled by the Edge TPU compiler;
|
|
7
|
+
assessment is "does this compiled graph fit the device's on-chip SRAM + its model
|
|
8
|
+
budget," with overflow spilling to host. Closest neighbour to the MCU/TinyML class.
|
|
9
|
+
Wall source: Edge TPU on-chip memory (+ host RAM for the runtime).
|
|
10
|
+
"""
|