npm - @tryhamster/gerbil - Versions diffs - 1.0.0-rc.9 → 1.0.0 - Mend

@tryhamster/gerbil 1.0.0-rc.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (179) hide show

package/LICENSE +1 -1
package/README.md +247 -84
package/dist/architectures-C1I5V3Dt.mjs +6070 -0
package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
package/dist/browser/index.d.ts +264 -588
package/dist/browser/index.d.ts.map +1 -1
package/dist/browser/index.js +585 -2334
package/dist/browser/index.js.map +1 -1
package/dist/cli.mjs +625 -1098
package/dist/cli.mjs.map +1 -1
package/dist/defaults-9komdrbY.mjs +24 -0
package/dist/defaults-9komdrbY.mjs.map +1 -0
package/dist/frameworks/express.d.mts +1 -3
package/dist/frameworks/express.d.mts.map +1 -1
package/dist/frameworks/express.mjs +7 -7
package/dist/frameworks/express.mjs.map +1 -1
package/dist/frameworks/fastify.d.mts +1 -1
package/dist/frameworks/fastify.d.mts.map +1 -1
package/dist/frameworks/fastify.mjs +3 -3
package/dist/frameworks/fastify.mjs.map +1 -1
package/dist/frameworks/hono.d.mts +1 -1
package/dist/frameworks/hono.d.mts.map +1 -1
package/dist/frameworks/hono.mjs +4 -4
package/dist/frameworks/hono.mjs.map +1 -1
package/dist/frameworks/next.d.mts +3 -2
package/dist/frameworks/next.d.mts.map +1 -1
package/dist/frameworks/next.mjs +4 -4
package/dist/frameworks/next.mjs.map +1 -1
package/dist/frameworks/react.d.mts +1 -1
package/dist/frameworks/trpc.d.mts +1 -1
package/dist/frameworks/trpc.d.mts.map +1 -1
package/dist/frameworks/trpc.mjs +4 -4
package/dist/frameworks/trpc.mjs.map +1 -1
package/dist/gerbil-BHrJJIa4.mjs +1656 -0
package/dist/gerbil-BHrJJIa4.mjs.map +1 -0
package/dist/gerbil-BT9fCydo.d.mts +488 -0
package/dist/gerbil-BT9fCydo.d.mts.map +1 -0
package/dist/gerbil-DomNfIr1.mjs +4 -0
package/dist/gpu/hooks.d.mts +520 -0
package/dist/gpu/hooks.d.mts.map +1 -0
package/dist/gpu/hooks.mjs +1188 -0
package/dist/gpu/hooks.mjs.map +1 -0
package/dist/gpu/index.d.mts +2 -0
package/dist/gpu/index.mjs +6 -0
package/dist/gpu-33qCAtHW.mjs +3615 -0
package/dist/gpu-33qCAtHW.mjs.map +1 -0
package/dist/index-Dgmb2kE3.d.mts +245 -0
package/dist/index-Dgmb2kE3.d.mts.map +1 -0
package/dist/index-jEAL2s-A.d.mts +2022 -0
package/dist/index-jEAL2s-A.d.mts.map +1 -0
package/dist/index.d.mts +22 -487
package/dist/index.d.mts.map +1 -1
package/dist/index.mjs +13 -8
package/dist/index.mjs.map +1 -1
package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
package/dist/integrations/ai-sdk.d.mts +75 -6
package/dist/integrations/ai-sdk.d.mts.map +1 -1
package/dist/integrations/ai-sdk.mjs +131 -15
package/dist/integrations/ai-sdk.mjs.map +1 -1
package/dist/integrations/langchain.d.mts +1 -1
package/dist/integrations/langchain.d.mts.map +1 -1
package/dist/integrations/langchain.mjs +5 -5
package/dist/integrations/langchain.mjs.map +1 -1
package/dist/integrations/llamaindex.d.mts +1 -1
package/dist/integrations/llamaindex.d.mts.map +1 -1
package/dist/integrations/llamaindex.mjs +5 -5
package/dist/integrations/llamaindex.mjs.map +1 -1
package/dist/integrations/mcp-client.mjs +3 -3
package/dist/integrations/mcp-client.mjs.map +1 -1
package/dist/integrations/mcp.d.mts +3 -2
package/dist/integrations/mcp.d.mts.map +1 -1
package/dist/integrations/mcp.mjs +5 -5
package/dist/{mcp-BvbriaBy.mjs → mcp-1DaMsaBc.mjs} +4 -4
package/dist/mcp-1DaMsaBc.mjs.map +1 -0
package/dist/memory/index.d.mts +3 -0
package/dist/memory/index.mjs +6 -0
package/dist/memory-D1P7Tmda.mjs +4 -0
package/dist/memory-DVN0MnIG.mjs +132 -0
package/dist/memory-DVN0MnIG.mjs.map +1 -0
package/dist/memory-Dj0J1v88.mjs +294 -0
package/dist/memory-Dj0J1v88.mjs.map +1 -0
package/dist/moonshine-stt-BLyVoRpB.mjs +4 -0
package/dist/moonshine-stt-v_P_Ci_m.mjs +11936 -0
package/dist/moonshine-stt-v_P_Ci_m.mjs.map +1 -0
package/dist/{one-liner-s-lD8rCC.mjs → one-liner-DnQn7HJK.mjs} +14 -16
package/dist/one-liner-DnQn7HJK.mjs.map +1 -0
package/dist/repl-jV5gcJFA.mjs +9 -0
package/dist/skills/index.d.mts +270 -320
package/dist/skills/index.d.mts.map +1 -1
package/dist/skills/index.mjs +5 -5
package/dist/{skills-CD3Orlex.mjs → skills-DX8D59UH.mjs} +187 -32
package/dist/skills-DX8D59UH.mjs.map +1 -0
package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
package/dist/tools-DQ1mPUw5.mjs.map +1 -0
package/dist/{types-CiTc7ez3.d.mts → types-D6FiR_oh.d.mts} +106 -12
package/dist/types-D6FiR_oh.d.mts.map +1 -0
package/dist/types-DQBe2lFo.d.mts +165 -0
package/dist/types-DQBe2lFo.d.mts.map +1 -0
package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
package/dist/vector-B0panuy6.mjs +95 -0
package/dist/vector-B0panuy6.mjs.map +1 -0
package/docs/PROJECT-STATE.md +321 -0
package/docs/adding-a-model-family.md +280 -0
package/docs/ai-sdk.md +70 -61
package/docs/architecture/overview.md +17 -7
package/docs/browser.md +203 -8
package/docs/embeddings.md +156 -0
package/docs/gerbil-site-native-migration.md +217 -0
package/docs/gpu-engine/architectures.md +398 -0
package/docs/gpu-engine/ir.md +372 -0
package/docs/gpu-engine/kernels.md +718 -0
package/docs/gpu-engine/paper.html +1759 -0
package/docs/gpu-engine/paper.md +2109 -0
package/docs/gpu-engine/safetensors.md +312 -0
package/docs/gpu-engine/tokenizer.md +302 -0
package/docs/memory-rag.md +91 -0
package/docs/metal-safari-intel.md +190 -0
package/docs/mobile-failure-diagnosis.md +124 -0
package/docs/mobile.md +99 -0
package/docs/observability.md +230 -0
package/docs/onnx-removal-plan.md +339 -0
package/docs/research/autoresearch-portable.md +904 -0
package/docs/research/dispatch-reduction-hivemind.md +84 -0
package/docs/research/ios-safari-model-caching.md +117 -0
package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
package/docs/research/native-stt-model-selection.md +49 -0
package/docs/research/native-tts-model-selection.md +90 -0
package/docs/research/native-vs-chromium-decision.md +152 -0
package/docs/research/nemotron-mamba2-inference.md +910 -0
package/docs/research/qwen35-multimodal.md +293 -0
package/docs/research/qwen36-gemma4-targets.md +337 -0
package/docs/research/sota-embedding-models.md +179 -0
package/docs/research/sota-mobile-models-2026.md +263 -0
package/docs/research/sota-modality-models.md +202 -0
package/docs/research/tps-baselines.md +71 -0
package/docs/research/webgpu-m4-reference.md +104 -0
package/docs/site-update-plan.md +155 -0
package/docs/structured-output.md +123 -0
package/docs/stt.md +63 -446
package/docs/tts.md +77 -499
package/docs/vision.md +100 -338
package/package.json +22 -7
package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
package/dist/gerbil-CJ3ifloF.mjs +0 -4
package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
package/dist/gerbil-qOTe1nl2.d.mts +0 -431
package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
package/dist/kokoro-BNTb6egA.mjs +0 -20210
package/dist/kokoro-BNTb6egA.mjs.map +0 -1
package/dist/kokoro-CMOGDSgT.js +0 -20212
package/dist/kokoro-CMOGDSgT.js.map +0 -1
package/dist/mcp-BvbriaBy.mjs.map +0 -1
package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
package/dist/repl-DveXw36T.mjs +0 -9
package/dist/skills-CD3Orlex.mjs.map +0 -1
package/dist/stt-Bu-E23Sc.js +0 -433
package/dist/stt-Bu-E23Sc.js.map +0 -1
package/dist/stt-CpLYbGFd.mjs +0 -433
package/dist/stt-CpLYbGFd.mjs.map +0 -1
package/dist/stt-DRPLEEHB.mjs +0 -3
package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
package/dist/transformers.web-DiD1gTwk.js +0 -44695
package/dist/transformers.web-DiD1gTwk.js.map +0 -1
package/dist/transformers.web-u34VxRFM.js +0 -3
package/dist/tts-CqroPaSK.js +0 -724
package/dist/tts-CqroPaSK.js.map +0 -1
package/dist/tts-DXgsKGCe.mjs +0 -3
package/dist/tts-DeGANMNV.mjs +0 -730
package/dist/tts-DeGANMNV.mjs.map +0 -1
package/dist/types-CiTc7ez3.d.mts.map +0 -1
/package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
/package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
/package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0

package/docs/research/native-vs-chromium-decision.md ADDED Viewed

@@ -0,0 +1,152 @@
+# Native WebGPU Engine vs. the Chromium Approach — Decision Analysis
+**Date:** 2026-06-12
+**Status:** Recommendation — deprecate `chrome-backend.ts` now; keep the transformers.js worker path indefinitely (it is not "the chromium approach")
+**Context:** The native engine (`src/gpu/`) now runs on desktop Node via node-dawn (145 tok/s INT4) and on mobile WebKit (iPad, 31–36 tok/s, byte-correct after the Metal coherence fixes in `38bc674`, `2f0cabc`, `d574cdb`).
+---
+## 1. What "the chromium approach" actually is in this codebase
+There are **two separate things** that get lumped together, and they have completely different removal calculus. Only one of them is actually Chromium.
+### 1a. `src/core/chrome-backend.ts` (`ChromeGPUBackend`) — the real chromium approach
+1,836 lines. **Node.js only.** What it does, concretely:
+- Finds an installed Chrome/Edge/Brave/Chromium binary on the host (`findChrome()`, `CHROME_PATHS`), or fails with "Chrome not found."
+- Launches it headless via **puppeteer-core** (a hard runtime dependency in `package.json`), with a persistent profile at `~/.gerbil/chrome-cache` and a saved WebSocket endpoint file for cross-process reconnection.
+- Starts a localhost HTTP server on fixed port **43724** that serves a generated HTML page (`getWorkerPageHTML()`). The page imports **transformers.js 4.0.0-next.7 from the jsdelivr CDN**, loads an ONNX model with `dtype: "q4f16", device: "webgpu"`, and exposes `window.gerbilGenerate` / `gerbilGenerateVision` / `gerbilInterrupt` / `gerbilReset`.
+- Streams tokens back to Node by **JSON-encoding them through `console.log` and parsing CDP `Runtime.consoleAPICalled` events**.
+- Carries a large operational surface to keep this stable: orphan-page cleanup, `SingletonLock` removal, WS-endpoint staleness handling, `MAX_CONCURRENT_PAGES` limits, JS-heap polling via CDP `Performance.getMetrics`, `checkMemoryAndCleanup`, `killAllBackends`, SIGINT/SIGTERM/exit kill handlers, and the `gerbil cleanup` CLI command (`src/cli/index.ts:1041–1130`) whose entire job is killing zombie Chrome processes with `pkill -f "gerbil/chrome-cache"`.
+- Side-jobs unrelated to inference: it owns `~/.gerbil/cached-models.json` bookkeeping (`getChromeCachedModels` / `trackCachedModel` / `refreshCachedModelSizes`), which `gerbil.ts:786` and `src/cli/repl/utils.ts` use to answer "is this model cached?" in the REPL/CLI.
+**When it is actually used today** (`src/core/gerbil.ts:382–505`, `592–629`):
+| Path | Condition | Backend chosen |
+|---|---|---|
+| Node text gen, `device: auto/webgpu` | default | **native engine** (`effectiveBackend = "webgpu-native"`) |
+| Node text gen, explicit `backend: "onnx"` + WebGPU | opt-in | **ChromeGPUBackend** |
+| Node **vision model** + WebGPU (e.g. `ministral-3b`) | automatic | **ChromeGPUBackend** (`loadVisionModel`, line 610) |
+| Native engine load fails (unsupported arch, GPU error) | automatic | falls back to **CPU transformers.js pipeline** — *not* Chrome |
+| Browser | any | never (puppeteer is dynamically imported, Node-only) |
+Key fact: **the chrome backend is already off the default text-generation path.** `device: "auto"` in Node routes to the native engine. The catch-block fallback for unsupported architectures goes to CPU, not Chrome. Chrome is only reachable via (a) explicit `backend: "onnx"` with GPU, or (b) vision models in Node.
+### 1b. The transformers.js worker path (`src/browser/worker.ts`, `worker-entry.ts`) — NOT chromium
+This runs in **the end user's actual browser**, whatever it is. `worker-entry.ts` is bundled into a self-contained classic-worker IIFE (`worker-code.generated.ts`, built by `scripts/build-worker.mjs`) so it works in iOS WKWebView where module workers from blob URLs fail. It uses transformers.js + onnxruntime-web with three execution tiers chosen by `backend-selector.ts` from the device profile:
+- **WebGPU + q4f16** (≥2 GB GPU est.), **WebGPU + q4** (1–2 GB),
+- **WASM + q4** (≥1.5 GB RAM, "5–10× slower" warning),
+- **WASM/CPU + q4i** (last resort, 256-token context).
+Plus an iOS-specific **main-thread** variant (`createIOSMainThreadWorker`) with crash breadcrumbs in localStorage, single-threaded WASM, 128-token caps, and a device-memory-tiered model **fallback chain** (qwen3-1.7b → … → smollm2-135m).
+Calling this "the chromium approach" is a category error — it's the *any-browser, any-hardware* compatibility layer. It happens to share transformers.js with the Chrome backend, which is the only family resemblance.
+### 1c. Where each feature actually lives
+- `useChat` / `useCompletion` (`use-chat.ts`) → `createGerbilWorker` → **transformers.js worker** (1b). Hard-gated on `isWebGPUSupported()` at the hook level (arguably a bug — the worker itself has WASM tiers, see §1b; relaxing the gate is proposed in §4).
+- `useEmbedding` (`use-embedding.ts`) → its own CDN-loaded transformers.js worker, `feature-extraction` pipeline, WebGPU-if-available else WASM. **No native path exists.**
+- `useSpeech` (`use-speech.ts`) → **kokoro-js** (Kokoro 82M) or transformers.js `text-to-speech` pipeline (Supertonic, WebGPU). **No native path.**
+- `useVoiceInput` / `useVoiceChat` (`use-voice-input.ts`) → `WhisperSTT` (`src/core/stt.ts`), transformers.js `automatic-speech-recognition`, CPU/WASM only. **No native path.**
+- `useNativeEngine` (`use-native-engine.ts`) → **native WebGPUEngine** directly on the main thread; deliberately *not* exported from `browser/index.ts` to keep GPU code out of the main bundle (import from `/gpu`).
+- Node `g.embed()` → transformers.js `feature-extraction` pipeline (CPU). Node STT/TTS → same ONNX story.
+- Tool calling / JSON mode → prompt-level (`core/tools.ts`), works on **any** text backend including native.
+---
+## 2. Capability matrix
+Backends: **N** = native engine (`src/gpu`), **T** = transformers.js (browser worker / Node pipeline), **C** = ChromeGPUBackend. ✅ works today, ⚠️ works with caveats, ❌ unavailable.
+| Environment × Feature | Text gen | Embeddings | STT (Whisper) | TTS | Vision (VLM) | Tool calling/JSON |
+|---|---|---|---|---|---|---|
+| **Node + GPU (desktop)** | **N** ✅ 145 tok/s (Qwen only) · C ⚠️ legacy opt-in (~100 tok/s, any ONNX) | T ✅ (CPU) | T ✅ (CPU) | T/kokoro ✅ (CPU) | **C ✅ (only GPU path)** · T ⚠️ CPU fallback | N/T/C ✅ (prompt-level) |
+| **Node, no GPU / headless CI** | T ✅ (CPU pipeline) | T ✅ | T ✅ | T ✅ | T ⚠️ slow | ✅ |
+| **Browser: Chrome/Edge (WebGPU)** | T ✅ q4f16 default · N ✅ opt-in via `useNativeEngine` | T ✅ (WebGPU) | T ✅ (WASM) | T/kokoro ✅ | T ✅ | ✅ |
+| **Browser: Safari/WebKit desktop** | **N ✅ (packed-f16, post-Metal-fixes)** · T ⚠️ ORT-WebGPU flaky on WebKit → selector mostly lands on WASM | T ✅ (WASM) | T ✅ (WASM) | T ⚠️ | T ⚠️ WASM, slow | ✅ |
+| **iPhone/iPad (WKWebView)** | **N ✅ 31–36 tok/s** · T ⚠️ main-thread WASM, 128-token cap, crash breadcrumbs | T ⚠️ WASM | T ⚠️ | T ⚠️ | ❌ practically (memory) | ✅ |
+| **Browser, no WebGPU at all** (old Android, Firefox ESR, locked-down enterprise) | T ✅ WASM q4/q4i **only option** · N ❌ (hard-requires WebGPU) | T ✅ | T ✅ | T ⚠️ | T ⚠️ | ✅ |
+### What breaks if each backend is removed
+**Remove ChromeGPUBackend (C):**
+- Node GPU **vision** inference is the only real loss — `ministral-3b` etc. drop to CPU transformers.js.
+- `backend: "onnx"` + GPU in Node stops working (callers get native-or-CPU).
+- Node GPU text gen for **non-Qwen ONNX models** loses its (opt-in) GPU path — but note `device:"auto"` already sends those to CPU today, because the native-engine failure falls back to CPU, not Chrome.
+- `gerbil cleanup` CLI command and `~/.gerbil/cached-models.json` tracking need extraction/replacement (small, self-contained).
+- **Gains:** delete 1,836 lines + zombie-process management; drop `puppeteer-core` from hard `dependencies`; remove the "you must have Chrome installed" runtime requirement; remove the CDN dependency at inference time; remove a whole bug class (the file is ~40% lifecycle defensive code).
+**Remove transformers.js worker path (T):**
+- Browser loses: vision, embeddings, STT, TTS, the entire ONNX model zoo, the WASM fallback for no-WebGPU/low-memory devices, and the device-tiered model fallback chain. `useChat`/`useCompletion`/`useEmbedding`/`useSpeech`/`useVoiceInput` all die.
+- Node loses: CPU inference entirely (the only no-GPU path), embeddings, STT, TTS.
+- **This is not removable.** It is the project's breadth layer, and it is not chromium.
+**Remove native engine (N):** Node GPU text gen reverts to headless Chrome; Safari/iPad on-device inference reverts to "barely works on WASM." Obviously not on the table.
+---
+## 3. Model coverage gap: native vs. the ONNX zoo
+`src/gpu/architectures/index.ts` registers exactly three architecture strings:
+```
+Qwen2ForCausalLM   → generateQwen2Graph
+Qwen3ForCausalLM   → generateQwen2Graph
+Qwen3_5ForConditionalGeneration → generateQwen3_5Graph
+```
+Quant/weight formats (`model-loader.ts`): f32 safetensors, on-the-fly INT4 (`dtype: "q4"`), **pre-quantized GPTQ** (auto-detected, repacked), **MLX 4-bit affine** (auto-detected, group_size-aware repack). Capabilities struct is `{ text: true, vision: false, moe: false }` — text only. `maxSeqLen` capped at 4096 (512 default on WebKit) vs. the 32k–262k contexts the ONNX path advertises.
+Against the built-in registry (`src/core/models.ts`):
+| Built-in | Architecture | Native? |
+|---|---|---|
+| qwen3-0.6b / 1.7b | Qwen3ForCausalLM | ✅ |
+| qwen3.5-0.8b / 2b | Qwen3_5ForConditionalGeneration | ✅ |
+| qwen2.5-0.5b / coder | Qwen2ForCausalLM | ✅ |
+| smollm2-135m / 360m | LlamaForCausalLM | ❌ (TODO stub in registry) |
+| phi-3-mini | Phi3ForCausalLM | ❌ |
+| lfm2.5-1.2b-thinking | LFM2 hybrid | ❌ |
+| ministral-3b (vision) | Mistral3/vision | ❌ |
+So **6 of 11 built-ins run native; 5 don't** — and the 5 include the smallest models (SmolLM2, the low-memory fallback chain backbone) and the only vision model. Beyond built-ins, users lose the `hf:onnx-community/*` long tail (anything transformers.js can load), Whisper, Kokoro/Supertonic, and every embedding model (MiniLM etc.). Notably, the SmolLM2 gap matters more for the *browser fallback chain* than for Node: `getModelFallbackChain` bottoms out at smollm2-135m on <3 GB devices, and those devices are exactly the ones using the WASM path anyway — so the practical native gap is: **Llama-family graph, Phi graph, vision, embeddings, audio.**
+The Llama graph is the highest-leverage addition: one graph generator (`llama.ts`, already stubbed in the registry comments) unlocks SmolLM2, Llama 3.x, Mistral 7B-class, and most of the open-weights world, because the Qwen2 generator is already ~90% of a Llama generator (RoPE + GQA + SwiGLU + RMSNorm).
+---
+## 4. Recommendation
+**Short answer to "can the native engine replace the chromium approach?": yes for text, no overall — but the thing you can kill is `chrome-backend.ts`, and you should kill it now.** The transformers.js worker path is not "the chromium approach" and must stay; it's what makes the "anywhere JavaScript runs" promise true on the 40% of surfaces the native engine can't touch (no-WebGPU browsers, embeddings, audio, vision, non-Qwen models).
+The chrome backend, by contrast, is now a redundant middleman for its main job: it was built to give Node a WebGPU path before node-dawn worked, and the native engine is now *faster* (145 vs ~100 tok/s), dependency-free at runtime (no Chrome install, no CDN fetch, no puppeteer), and already the default. What remains load-bearing is narrow: Node+GPU **vision**, and opt-in Node+GPU **ONNX** for non-Qwen models. Neither justifies 1,836 lines, a hard puppeteer-core dependency, a localhost server, and a zombie-process janitor in every install.
+### Phased plan
+**Phase 1 — now (next rc):** Deprecate, demote, and stop auto-routing.
+1. Mark `ChromeGPUBackend` and `backend: "onnx"` + GPU-in-Node as deprecated (console.warn + README). Keep them working.
+2. Change `loadVisionModel` in Node to **CPU transformers.js by default**, with Chrome behind explicit `backend: "chrome"` opt-in. Vision-on-Node-GPU users are rare; the ones that exist can opt in for two more releases.
+3. Move `puppeteer-core` from `dependencies` to `optionalDependencies`/peer with a clear error message ("install puppeteer-core and Chrome to use backend:'chrome'"). This alone removes the heaviest install cost for everyone else.
+4. Extract `cached-models.json` bookkeeping (`getChromeCachedModels`, `trackCachedModel`, `refreshCachedModelSizes`) out of `chrome-backend.ts` into `src/core/model-cache.ts` — it's used by the REPL/CLI and `gerbil.ts:786` and has nothing to do with Chrome. Also fix the bookkeeping to cover native-engine downloads (it currently only tracks Chrome-IndexedDB models).
+5. While in there: declare `@huggingface/transformers` properly. It's statically imported by `src/core/gerbil.ts` but listed only in `devDependencies` and externalized by tsdown for Node builds — that's a latent packaging bug independent of this decision.
+**Phase 2 — gate for full removal (target: 2 releases):** the native engine must gain, in priority order:
+1. **Llama-family graph** (`LlamaForCausalLM`, `MistralForCausalLM`; SmolLM2 already uses `LlamaForCausalLM`) — unlocks SmolLM2 built-ins + most of HF. This is the cheapest, highest-value gap closure.
+2. **Phi graph** (phi-3-mini built-in) or drop phi-3-mini from built-ins (it's the weakest entry in the registry anyway — 4k context).
+3. A decision on vision: either (a) accept CPU vision on Node and document it, or (b) implement the vision encoder natively (large effort — not worth blocking removal on; choose (a)).
+**Phase 3 — delete:** remove `chrome-backend.ts`, the `gerbil cleanup` Chrome plumbing, `puppeteer-core`, and the `backend: "chrome"` option. Net: −1,836 lines core, −CLI cruft, one fewer process manager to debug, no host-Chrome requirement, no inference-time CDN dependency.
+**Explicit non-goals:** do not deprecate the browser worker path, the WASM tiers, the iOS main-thread path, or the embedding/STT/TTS workers. If anything, invest the freed maintenance budget there: route `useChat` to the native engine *when the device profile says WebKit-with-WebGPU* (where ORT-WebGPU is flaky but native now shines), keep transformers.js as the chooser's other arm, and relax the `isWebGPUSupported()` hard-gate in `use-chat.ts` (the worker has a perfectly good WASM tier the hook currently refuses to use).
+### Decision summary
+| Component | Verdict |
+|---|---|
+| `chrome-backend.ts` (headless Chrome via puppeteer) | **Deprecate now, delete in 2 releases.** Gate: Llama graph in native engine; vision goes to CPU on Node. |
+| transformers.js browser worker (+ WASM tiers, iOS path) | **Keep indefinitely.** It's the breadth layer, not the chromium approach. |
+| transformers.js Node CPU pipeline | **Keep.** Only no-GPU Node path; also embeddings/STT/TTS. |
+| Native engine | **Default everywhere it can run; grow Llama graph next.** |