npm - @tryhamster/gerbil - Versions diffs - 1.0.0-rc.9 → 1.0.0 - Mend

@tryhamster/gerbil 1.0.0-rc.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (179)

package/LICENSE +1 -1
package/README.md +247 -84
package/dist/architectures-C1I5V3Dt.mjs +6070 -0
package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
package/dist/browser/index.d.ts +264 -588
package/dist/browser/index.d.ts.map +1 -1
package/dist/browser/index.js +585 -2334
package/dist/browser/index.js.map +1 -1
package/dist/cli.mjs +625 -1098
package/dist/cli.mjs.map +1 -1
package/dist/defaults-9komdrbY.mjs +24 -0
package/dist/defaults-9komdrbY.mjs.map +1 -0
package/dist/frameworks/express.d.mts +1 -3
package/dist/frameworks/express.d.mts.map +1 -1
package/dist/frameworks/express.mjs +7 -7
package/dist/frameworks/express.mjs.map +1 -1
package/dist/frameworks/fastify.d.mts +1 -1
package/dist/frameworks/fastify.d.mts.map +1 -1
package/dist/frameworks/fastify.mjs +3 -3
package/dist/frameworks/fastify.mjs.map +1 -1
package/dist/frameworks/hono.d.mts +1 -1
package/dist/frameworks/hono.d.mts.map +1 -1
package/dist/frameworks/hono.mjs +4 -4
package/dist/frameworks/hono.mjs.map +1 -1
package/dist/frameworks/next.d.mts +3 -2
package/dist/frameworks/next.d.mts.map +1 -1
package/dist/frameworks/next.mjs +4 -4
package/dist/frameworks/next.mjs.map +1 -1
package/dist/frameworks/react.d.mts +1 -1
package/dist/frameworks/trpc.d.mts +1 -1
package/dist/frameworks/trpc.d.mts.map +1 -1
package/dist/frameworks/trpc.mjs +4 -4
package/dist/frameworks/trpc.mjs.map +1 -1
package/dist/gerbil-BHrJJIa4.mjs +1656 -0
package/dist/gerbil-BHrJJIa4.mjs.map +1 -0
package/dist/gerbil-BT9fCydo.d.mts +488 -0
package/dist/gerbil-BT9fCydo.d.mts.map +1 -0
package/dist/gerbil-DomNfIr1.mjs +4 -0
package/dist/gpu/hooks.d.mts +520 -0
package/dist/gpu/hooks.d.mts.map +1 -0
package/dist/gpu/hooks.mjs +1188 -0
package/dist/gpu/hooks.mjs.map +1 -0
package/dist/gpu/index.d.mts +2 -0
package/dist/gpu/index.mjs +6 -0
package/dist/gpu-33qCAtHW.mjs +3615 -0
package/dist/gpu-33qCAtHW.mjs.map +1 -0
package/dist/index-Dgmb2kE3.d.mts +245 -0
package/dist/index-Dgmb2kE3.d.mts.map +1 -0
package/dist/index-jEAL2s-A.d.mts +2022 -0
package/dist/index-jEAL2s-A.d.mts.map +1 -0
package/dist/index.d.mts +22 -487
package/dist/index.d.mts.map +1 -1
package/dist/index.mjs +13 -8
package/dist/index.mjs.map +1 -1
package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
package/dist/integrations/ai-sdk.d.mts +75 -6
package/dist/integrations/ai-sdk.d.mts.map +1 -1
package/dist/integrations/ai-sdk.mjs +131 -15
package/dist/integrations/ai-sdk.mjs.map +1 -1
package/dist/integrations/langchain.d.mts +1 -1
package/dist/integrations/langchain.d.mts.map +1 -1
package/dist/integrations/langchain.mjs +5 -5
package/dist/integrations/langchain.mjs.map +1 -1
package/dist/integrations/llamaindex.d.mts +1 -1
package/dist/integrations/llamaindex.d.mts.map +1 -1
package/dist/integrations/llamaindex.mjs +5 -5
package/dist/integrations/llamaindex.mjs.map +1 -1
package/dist/integrations/mcp-client.mjs +3 -3
package/dist/integrations/mcp-client.mjs.map +1 -1
package/dist/integrations/mcp.d.mts +3 -2
package/dist/integrations/mcp.d.mts.map +1 -1
package/dist/integrations/mcp.mjs +5 -5
package/dist/{mcp-BvbriaBy.mjs → mcp-1DaMsaBc.mjs} +4 -4
package/dist/mcp-1DaMsaBc.mjs.map +1 -0
package/dist/memory/index.d.mts +3 -0
package/dist/memory/index.mjs +6 -0
package/dist/memory-D1P7Tmda.mjs +4 -0
package/dist/memory-DVN0MnIG.mjs +132 -0
package/dist/memory-DVN0MnIG.mjs.map +1 -0
package/dist/memory-Dj0J1v88.mjs +294 -0
package/dist/memory-Dj0J1v88.mjs.map +1 -0
package/dist/moonshine-stt-BLyVoRpB.mjs +4 -0
package/dist/moonshine-stt-v_P_Ci_m.mjs +11936 -0
package/dist/moonshine-stt-v_P_Ci_m.mjs.map +1 -0
package/dist/{one-liner-s-lD8rCC.mjs → one-liner-DnQn7HJK.mjs} +14 -16
package/dist/one-liner-DnQn7HJK.mjs.map +1 -0
package/dist/repl-jV5gcJFA.mjs +9 -0
package/dist/skills/index.d.mts +270 -320
package/dist/skills/index.d.mts.map +1 -1
package/dist/skills/index.mjs +5 -5
package/dist/{skills-CD3Orlex.mjs → skills-DX8D59UH.mjs} +187 -32
package/dist/skills-DX8D59UH.mjs.map +1 -0
package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
package/dist/tools-DQ1mPUw5.mjs.map +1 -0
package/dist/{types-CiTc7ez3.d.mts → types-D6FiR_oh.d.mts} +106 -12
package/dist/types-D6FiR_oh.d.mts.map +1 -0
package/dist/types-DQBe2lFo.d.mts +165 -0
package/dist/types-DQBe2lFo.d.mts.map +1 -0
package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
package/dist/vector-B0panuy6.mjs +95 -0
package/dist/vector-B0panuy6.mjs.map +1 -0
package/docs/PROJECT-STATE.md +321 -0
package/docs/adding-a-model-family.md +280 -0
package/docs/ai-sdk.md +70 -61
package/docs/architecture/overview.md +17 -7
package/docs/browser.md +203 -8
package/docs/embeddings.md +156 -0
package/docs/gerbil-site-native-migration.md +217 -0
package/docs/gpu-engine/architectures.md +398 -0
package/docs/gpu-engine/ir.md +372 -0
package/docs/gpu-engine/kernels.md +718 -0
package/docs/gpu-engine/paper.html +1759 -0
package/docs/gpu-engine/paper.md +2109 -0
package/docs/gpu-engine/safetensors.md +312 -0
package/docs/gpu-engine/tokenizer.md +302 -0
package/docs/memory-rag.md +91 -0
package/docs/metal-safari-intel.md +190 -0
package/docs/mobile-failure-diagnosis.md +124 -0
package/docs/mobile.md +99 -0
package/docs/observability.md +230 -0
package/docs/onnx-removal-plan.md +339 -0
package/docs/research/autoresearch-portable.md +904 -0
package/docs/research/dispatch-reduction-hivemind.md +84 -0
package/docs/research/ios-safari-model-caching.md +117 -0
package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
package/docs/research/native-stt-model-selection.md +49 -0
package/docs/research/native-tts-model-selection.md +90 -0
package/docs/research/native-vs-chromium-decision.md +152 -0
package/docs/research/nemotron-mamba2-inference.md +910 -0
package/docs/research/qwen35-multimodal.md +293 -0
package/docs/research/qwen36-gemma4-targets.md +337 -0
package/docs/research/sota-embedding-models.md +179 -0
package/docs/research/sota-mobile-models-2026.md +263 -0
package/docs/research/sota-modality-models.md +202 -0
package/docs/research/tps-baselines.md +71 -0
package/docs/research/webgpu-m4-reference.md +104 -0
package/docs/site-update-plan.md +155 -0
package/docs/structured-output.md +123 -0
package/docs/stt.md +63 -446
package/docs/tts.md +77 -499
package/docs/vision.md +100 -338
package/package.json +22 -7
package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
package/dist/gerbil-CJ3ifloF.mjs +0 -4
package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
package/dist/gerbil-qOTe1nl2.d.mts +0 -431
package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
package/dist/kokoro-BNTb6egA.mjs +0 -20210
package/dist/kokoro-BNTb6egA.mjs.map +0 -1
package/dist/kokoro-CMOGDSgT.js +0 -20212
package/dist/kokoro-CMOGDSgT.js.map +0 -1
package/dist/mcp-BvbriaBy.mjs.map +0 -1
package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
package/dist/repl-DveXw36T.mjs +0 -9
package/dist/skills-CD3Orlex.mjs.map +0 -1
package/dist/stt-Bu-E23Sc.js +0 -433
package/dist/stt-Bu-E23Sc.js.map +0 -1
package/dist/stt-CpLYbGFd.mjs +0 -433
package/dist/stt-CpLYbGFd.mjs.map +0 -1
package/dist/stt-DRPLEEHB.mjs +0 -3
package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
package/dist/transformers.web-DiD1gTwk.js +0 -44695
package/dist/transformers.web-DiD1gTwk.js.map +0 -1
package/dist/transformers.web-u34VxRFM.js +0 -3
package/dist/tts-CqroPaSK.js +0 -724
package/dist/tts-CqroPaSK.js.map +0 -1
package/dist/tts-DXgsKGCe.mjs +0 -3
package/dist/tts-DeGANMNV.mjs +0 -730
package/dist/tts-DeGANMNV.mjs.map +0 -1
package/dist/types-CiTc7ez3.d.mts.map +0 -1
/package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
/package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
/package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0

package/docs/adding-a-model-family.md ADDED Viewed

@@ -0,0 +1,280 @@
+# Adding a Model Family to the Gerbil WebGPU Engine
+This is the repeatable process for teaching the native WebGPU engine a new model
+architecture. The engine is **not** Qwen-specific — it's a registry of graph
+generators over a family-agnostic IR and kernel library. Adding a family is
+usually "write one generator + register it," and only occasionally "write a new
+kernel."
+> TL;DR effort tiers:
+> - **Tier 1 (hours):** the model is Llama-like (standard transformer). Reuse existing ops; write a config→IR generator. Llama, Mistral, Gemma-text, Phi, Qwen all live here.
+> - **Tier 2 (days):** the model has one novel op (a new norm, sliding-window attention, a gate) OR new per-node params on existing kernels. Write the generator + 1–2 WGSL kernels.
+> - **Tier 3 (weeks):** the model has a fundamentally new computation (SSM/Mamba, MoE routing, dual-graph encoder-decoder, codec decoder). New kernels + executor support. Qwen3.5's Mamba-2 path was Tier 3.
+> **Repeatable SOP:** for the step-by-step loop use the `add-model-family` skill
+> (`.claude/skills/add-model-family/SKILL.md`). This doc is the reference; the
+> skill is the procedure. Both encode the lessons in "Lessons from production"
+> below — read that section first.
+---
+## Lessons from production (read first)
+A wave of families was added against this framework — Qwen3.5 (Mamba-2 SSM + ViT),
+LFM2.5 (hybrid conv/attn), EmbeddingGemma (bidirectional Gemma3 encoder), Moonshine
+STT (CrossAttention + dual-graph), Kani-TTS (LFM2 codec-LM + NanoCodec decoder),
+Gemma 4 E2B (PLE CPU-streaming + proportional RoPE + double-wide MLP + value-norm +
+head_dim-512). What actually mattered:
+| Family | Arch string | Tier | What was new |
+|---|---|---|---|
+| Qwen2/3 | `Qwen2/3ForCausalLM` | 1 | baseline standard transformer (the template) |
+| Qwen3-Embedding | `Qwen3ForCausalLM` (embedding flag) | 1 | `SliceLastRow`→`L2Norm` pooling tail, no new kernels |
+| Qwen3.5 | `Qwen3_5ForConditionalGeneration` | 3 | Mamba-2 SSM kernels + ViT encoder (`VisionExecutor`) |
+| LFM2.5 | `Lfm2ForCausalLM` | 2 | short-conv/attention hybrid; `out_proj`→`o_proj` rename |
+| EmbeddingGemma | `Gemma3TextModel`/`Gemma3Model` | 2 | bidirectional encoder (`causal:false`) + mean-pool + dense tail |
+| Moonshine STT | `MoonshineForConditionalGeneration` | 3 | new `CrossAttention` kernel; dual-graph (encoder+decoder) executor |
+| Kani-TTS | `KaniTTS2ForCausalLM` | 3 | LFM2 codec-LM backbone + NanoCodec decoder (FSQ + causal HiFi-GAN) |
+| Gemma 4 E2B | `Gemma4ForConditionalGeneration` | 2 | PLE CPU-streaming, proportional/partial RoPE, value-norm, head_dim-512, double-wide MLP, per-layer scalar — all via per-node attrs + loader hooks |
+1. **VERIFY FIRST against the live checkpoint.** Fetch the live `config.json` and
+   range-read the safetensors header (keys/dtypes/shapes) BEFORE writing anything.
+   Confirm the arch string, dims, quant format, and classify every op. Do NOT
+   trust assumptions from sibling models: Gemma4 had no MatFormer though Gemma3n
+   did; OmniVoice's codec is diffusion not AR; MLX repos ship the ViT under a
+   different key prefix (`vision_tower.*`) than HF bf16 (`model.visual.*`), and the
+   LM under `language_model.model.*` (MLX) vs `model.language_model.*` (GPTQ/HF).
+   Watch for **nested config** — Qwen3.5/Gemma4 put the text tower under
+   `text_config`.
+2. **Prefer per-node params over new kernels.** Most "novel" behavior is a new
+   parameter on an existing kernel. Add it to the WGSL `Params` struct and read it
+   in `buildParams` with a **default that reproduces the old behavior
+   byte-identically** (`attn_scale ?? 1/sqrt(head_dim)`), so every existing caller
+   is unchanged. Gemma4's attn_scale=1.0, partial RoPE
+   (`rope_dim/rope_half/rope_denom/rope_active_pairs`), and per-layer `Scale`
+   (loader-patched from `layer_scalar_key`) all landed this way — zero new attention
+   kernels.
+3. **Validate with a LOAD-ONCE reference harness — the make-or-break lesson.** Dump
+   the HF/MLX reference activations to disk ONCE
+   (`{tokens, per_layer_last[L][hidden], argmax, logits_top}`), then iterate
+   engine-only in SECONDS via a single `engine.create()` + `rawForward()` +
+   per-layer `debugReadBuffer` cosine loop (`test-gemma4-perlayer.mjs`). Reloading
+   the 2.5GB model every cycle made Gemma4 cost an hour per iteration. Localize
+   divergence to the exact layer/op. Set `GERBIL_NO_ACT_POOL=1` when reading early
+   activations (the pool aliases buffers). Gate on coherent generation — bit-exact
+   is not the bar for lossy quants.
+4. **No-regression suite is mandatory for shared-kernel changes.** Keep all
+   existing models bit-exact: `test-q4-generate`, `test-vision-e2e`,
+   `test-crossattention`, `test-nanocodec-decode`, the embedding test, and the fast
+   `test-gemma4-perlayer`. The byte-identical-default pattern (lesson 2) is what
+   keeps these green.
+5. **Big tensors: shard or stream.** Per-buffer caps are ~256MB
+   (`maxBufferSize`) / ~128MB (`maxStorageBufferBindingSize`) on iPad. Shard the
+   embedding if vocab×hidden exceeds the cap; for vocab-scale auxiliary tables keep
+   them CPU-resident and stream per-layer slices. Gemma4's ~1.17GB PLE table has
+   **0 MB GPU residency** — the loader builds a `PleSource`, diverts it from the
+   weights map, and the executor uploads only the needed `[T, width]` rows per step.
+6. **Merge additively, never commit node_modules.** Registry/IR/index conflicts are
+   always additive — keep ALL families/ops from both sides. Use targeted `git add`,
+   never `git add -A`.
+---
+## The architecture
+When a model loads, `model-loader.ts` reads `config.json` → `architectures[0]`
+(e.g. `"LlamaForCausalLM"`) and looks it up in the registry:
+```
+src/gpu/architectures/index.ts   →  ARCHITECTURES: Record<string, GraphGenerator>
+```
+A `GraphGenerator` turns the raw HF config into a `ModelGraph` (the IR):
+```ts
+type GraphGenerator = (
+  config: Record<string, unknown>,
+  dtype?: "f32" | "q4",      // q4 = on-the-fly INT4 quantization
+  groupSize?: number,        // INT4 group size (default 128)
+  kvDtype?: "f32" | "f16",   // KV cache precision
+) => ModelGraph;
+```
+The `ModelGraph` is `{ tensors, nodes, executionOrder, inputs, outputs, config, capabilities, architecture }`.
+The executor (`executor.ts`) consumes it generically: it allocates buffers
+(liveness-pooled), compiles one pipeline per node from `KERNEL_REGISTRY`, and
+dispatches in `executionOrder`. **The generator never touches WebGPU** — it only
+describes the computation.
+---
+## Step-by-step
+### 1. Identify the family and check op coverage
+Read the model's `config.json` and its modeling code on HuggingFace. Classify
+every layer's ops and check them against what the engine already has
+(`KERNEL_REGISTRY` in `src/gpu/kernels/registry.ts`):
+**Implemented ops (reuse freely):** `Embedding`, `EmbeddingInt4`, `MatMul`,
+`MatMulInt4`, `Add`, `Mul`, `Scale` (per-element, loader-patchable scalar),
+`RMSNorm`, `LayerNorm`, `RoPE` (with optional partial-rotary attrs
+`rope_dim/rope_half/rope_denom/rope_active_pairs`), `Attention` (causal *and*
+bidirectional via the `causal` flag — set `causal: false` for an encoder/ViT;
+takes a per-node `attn_scale` defaulting to `1/sqrt(head_dim)`, plus
+`sliding_window`), `CrossAttention` (encoder-decoder), `Softmax`, `SiLU`,
+`SwiGLU`, `GELU` (tanh approx), `GeluErf` (exact erf), `AddBias` (row-broadcast
+bias), `ApplyRotaryEmb` (precomputed-cos/sin `rotate_half`), `SliceCols`
+(column-range extract, e.g. split a fused QKV), `L2Norm` (row-wise, embedding
+tail), `ResidualRMSNorm`, `KVCacheAppend`, `MambaSSM`, `CausalConv1d`,
+`SigmoidGate`, `ConvStateUpdate`, `SliceLastRow`, and codec ops
+(`Conv1d`/`ConvTranspose1d`/`Snake1d`, FSQ) for audio decoders.
+If the model only uses these → **Tier 1**. If it needs something new → note it;
+you'll write a kernel in Step 6.
+**Embedding models** are Tier 1: an embedding model (e.g. Qwen3-Embedding-0.6B,
+which is `Qwen3ForCausalLM`) is the causal-LM forward pass with a different tail —
+pass the `embedding` flag to `generateQwen2Graph` to swap `lm_head` for the
+`SliceLastRow` (last-token EOS pool) → `L2Norm` tail. No new kernels (paper §21).
+**Vision-capable checkpoints** (e.g. Qwen3.5) ship a ViT in the same weights; the
+encoder is a separate graph (`qwen3_5_vision.ts`) run by `VisionExecutor`, already
+built and bit-exact vs HF (paper §22). Patches arrive pre-flattened, so the
+patch-embed is a plain `MatMul` + `AddBias` — no Conv3d kernel.
+**Exotic features that need new work (flag early):**
+- Sliding-window / local attention beyond what the `Attention` op's `sliding_window` attr covers (needs an attention kernel variant)
+- Mixture-of-Experts routing (`MoERouter`/`ExpertMatMul` are stubbed, not built)
+- Novel normalization (e.g. QK-norm, per-head norm) — new kernel
+- MatFormer / elastic parameters (Gemma 3n) — needs slicing logic + a loader story
+- Attention logit soft-capping (Gemma 2/3) — small kernel tweak
+- Non-RoPE position encodings
+### 2. Map config → dimensions
+Every generator starts by pulling dimensions from the raw config. Copy the block
+from `src/gpu/architectures/qwen2.ts` and adjust names to the new model's config:
+```ts
+const hidden_size = rawConfig.hidden_size as number;
+const num_layers = rawConfig.num_hidden_layers as number;
+const num_heads = rawConfig.num_attention_heads as number;
+const num_kv_heads = (rawConfig.num_key_value_heads as number) ?? num_heads;  // GQA
+const intermediate_size = rawConfig.intermediate_size as number;
+const vocab_size = rawConfig.vocab_size as number;
+const context_length = (rawConfig.max_position_embeddings as number) ?? 32768;
+const rms_norm_eps = (rawConfig.rms_norm_eps as number) ?? 1e-6;
+const rope_base = (rawConfig.rope_theta as number) ?? 1_000_000.0;
+const head_dim = (rawConfig.head_dim as number) ?? Math.floor(hidden_size / num_heads);
+const tieWordEmbeddings = (rawConfig.tie_word_embeddings as boolean) ?? false;
+```
+Watch for family differences: **head_dim is often NOT hidden_size/num_heads**
+(Gemma sets it explicitly); GQA means `num_kv_heads < num_heads`; some families
+have **QKV projection bias** (Qwen2 does, Llama/Gemma don't).
+### 3. Write the generator
+Create `src/gpu/architectures/<family>.ts` exporting
+`generate<Family>Graph(...)`. Use the local `addTensor`/`addNode` helpers (they
+also push to `executionOrder`). The per-layer skeleton, following qwen2.ts:
+1. **Embedding** — `Embedding`/`EmbeddingInt4` reading `input_ids` + `CANONICAL_KEYS.EMBED`.
+2. **Per layer** (loop `num_layers`):
+   - input RMSNorm → Q/K/V projections (`MatMulInt4`) → RoPE → `KVCacheAppend` → `Attention` → output projection → residual `Add`
+   - post-attn RMSNorm → gate/up projections → `SwiGLU` → down projection → residual `Add`
+   - (use `ResidualRMSNorm` to fuse the residual+norm where the family allows)
+3. **Final norm** (`RMSNorm`).
+4. **`SliceLastRow`** on the final hidden state → **lm_head** (`MatMulInt4`) → `logits`.
+   The `[1, vocab]` logits + SliceLastRow are mandatory (saves ~485MB at long
+   context and skips the full-vocab prefill matmul — see paper §18). Honor
+   `tieWordEmbeddings` (reuse embed weights for lm_head).
+Declare intermediate tensors with `storage: "activation"` — the executor pools
+them automatically; don't manage buffers yourself. Set `capabilities`
+(`text`, `vision`, `moe`) and return the graph.
+### 4. Register it
+In `src/gpu/architectures/index.ts`:
+```ts
+import { generateLlamaGraph } from "./llama.js";
+export const ARCHITECTURES = {
+  // ...existing...
+  LlamaForCausalLM: generateLlamaGraph,
+  MistralForCausalLM: generateLlamaGraph,   // Mistral == Llama arch
+};
+```
+One generator can serve a whole family — Llama, Mistral, and many fine-tunes
+all map to the same `generateLlamaGraph`.
+### 5. Weight key mapping
+The loader maps HF safetensors keys → canonical names via a `HFKeyMapper`
+(`createDefaultHFKeyMapper` strips the `model.` prefix). If the new family names
+weights differently, supply a mapper or set `safetensorsKey` on each `addTensor`.
+`CANONICAL_KEYS` has helpers (`qProj(i)`, `layerInputNorm(i)`, `EMBED`,
+`LM_HEAD`, …) — reuse them so the loader and generator agree.
+### 6. Write any new kernel (only if Step 1 flagged one)
+Add to `src/gpu/kernels/registry.ts`: a WGSL string + a `KernelSpec`
+(`shaderCode`, `entryPoint`, `bindings`, `getDispatchSize`, `buildParams`), then
+register it in `KERNEL_REGISTRY` under your new `OpType` (add the op to the union
+in `ir.ts`). **Mobile constraints (WebKit/iPad):** workgroup memory ≤ 16 KB
+(`maxComputeWorkgroupStorageSize`); no `enable f16` (use packed `pack2x16float`);
+clamp `exp()` for Metal fast-math; avoid `select()` (use if/else). See the
+attention kernels for the safe patterns and the two-phase reduction.
+### 7. Validate correctness (do not skip)
+1. **Reference activations:** adapt `scripts/engine/test-reference.py` to dump the
+   new model's logits + key intermediate activations from HF transformers.
+2. **Compare:** run `engine.integrityCheck()` (weights + isolated dispatches +
+   pipeline probes) and diff the engine's logits against the reference. Match the
+   argmax and check sums within tolerance.
+3. **Coherence:** generate from a fixed prompt at temperature 0 and confirm
+   sensible text (the `scripts/engine/test-q4-generate.mjs` pattern).
+4. **Cross-platform:** the desktop (Dawn) and mobile (WebKit) outputs must be
+   byte-identical — queue an iPad run via the harness (`scripts/engine/ipad-queue.json`).
+### 8. Mobile / download budget checks
+- **Download size at q4** is the headline mobile metric — report it.
+- **Per-buffer limit:** iPad `maxBufferSize` defaults to 256MB and
+  `maxStorageBufferBindingSize` to 128MB. A single weight tensor (often the
+  embedding) must fit — shard it if the model's vocab×hidden exceeds the cap.
+- **maxSeqLen policy:** the engine clamps iOS to 512 by default; large contexts
+  multiply KV-cache memory.
+---
+## Worked example: the smallest possible new family (Llama)
+Llama is Qwen2 minus QKV bias. The fastest path:
+1. Copy `qwen2.ts` → `llama.ts`, rename the export.
+2. Remove the QKV-bias tensors/handling (Llama has none).
+3. Confirm head_dim, GQA (`num_key_value_heads`), and `tie_word_embeddings` from config.
+4. Register `LlamaForCausalLM`/`MistralForCausalLM` → `generateLlamaGraph`.
+5. Validate against a reference (Step 7). No new kernels needed — Tier 1.
+This single generator unlocks Llama, Mistral, and every model that ships as
+`LlamaForCausalLM` on HuggingFace.
+---
+## Checklist
+- [ ] Classified every op; new ops listed (or confirmed none)
+- [ ] `generate<Family>Graph` written, dimensions mapped from config
+- [ ] GQA / head_dim / QKV-bias / tied-embeddings handled
+- [ ] `SliceLastRow` + `[1, vocab]` logits in place
+- [ ] Registered in `ARCHITECTURES`
+- [ ] Key mapping verified (loader finds every weight)
+- [ ] New kernels (if any) respect the 16 KB / no-`enable f16` / clamped-`exp` mobile rules
+- [ ] Validated vs HF reference (logits argmax + coherence)
+- [ ] Desktop and iPad outputs byte-identical
+- [ ] Download size at q4 reported; largest weight fits the 256MB/128MB buffer caps

package/docs/ai-sdk.md CHANGED Viewed

@@ -1,6 +1,14 @@
 # Gerbil + AI SDK
-Gerbil works as a [Vercel AI SDK v5](https://sdk.vercel.ai/) provider, supporting text generation, speech synthesis (TTS), and transcription (STT).
+Gerbil works as a [Vercel AI SDK v5](https://sdk.vercel.ai/) provider, supporting text generation, embeddings, speech synthesis (TTS), and transcription (STT).
+> **Pre-1.0 note.** The AI SDK provider routes through the `Gerbil` class, which now runs on
+> the native WebGPU engine (no ONNX / transformers.js). TTS uses Kani-TTS-2, STT uses
+> Moonshine, and embeddings use EmbeddingGemma regardless of the model-id string you pass —
+> legacy ids like `kokoro-82m` / `whisper-tiny.en` are vestigial labels and the device must
+> have WebGPU (there is no CPU/WASM fallback). The first-class surface for the engine is
+> `WebGPUEngine` / `useEngine` (see the [README](../README.md), [TTS](./tts.md),
+> [STT](./stt.md), [Embeddings](./embeddings.md) docs).
 ## Setup
@@ -56,52 +64,64 @@ const { text } = await generateText({
 });
 ```
-## Speech Generation (TTS)
+## Embeddings
+> **Native.** `gerbil.embedding()` runs native EmbeddingGemma-300M on the WebGPU engine
+> (768-dim; the `all-MiniLM-L6-v2` default id is a vestigial label — the old ONNX
+> MiniLM/BGE/GTE lane has been removed). For direct control use `engine.embed()` — see
+> [Embeddings docs](./embeddings.md). Requires WebGPU.
-Generate speech from text using Kokoro TTS:
+Generate text embeddings for semantic search, similarity, and RAG:
 ```typescript
-import { experimental_generateSpeech as generateSpeech } from "ai";
+import { embed, embedMany } from "ai";
 import { gerbil } from "@tryhamster/gerbil/ai";
-const result = await generateSpeech({
-  model: gerbil.speech(),        // kokoro-82m by default
-  text: "Hello, welcome to Gerbil!",
-  voice: "af_heart",             // Female American voice
+// Single embedding (768-dim EmbeddingGemma vector)
+const { embedding } = await embed({
+  model: gerbil.embedding(),
+  value: "Hello world",
 });
-// result.audio is a Uint8Array in WAV format
-await writeFile("output.wav", result.audio);
+// Multiple embeddings
+const { embeddings } = await embedMany({
+  model: gerbil.embedding(),
+  values: ["Hello", "World", "How are you?"],
+});
 ```
-### Available Voices
+## Speech Generation (TTS)
-```typescript
-const voices = gerbil.listVoices();
-// Returns: [{ id, name, gender, language }, ...]
-// Example voices:
-// - af_heart (Female, American)
-// - bf_emma (Female, British)
-// - am_fenrir (Male, American)
-// - bm_daniel (Male, British)
-```
+> **Native.** `gerbil.speech()` runs Kani-TTS-2 on the native engine (the `kokoro-82m`
+> default id is a vestigial label). For direct control use `engine.speak()` — see
+> [TTS docs](./tts.md). Requires WebGPU.
-### Speech Options
+Generate speech from text:
 ```typescript
+import { experimental_generateSpeech as generateSpeech } from "ai";
+import { gerbil } from "@tryhamster/gerbil/ai";
 const result = await generateSpeech({
-  model: gerbil.speech("kokoro-82m", {
-    voice: "bf_emma",   // Default voice
-    speed: 1.2,         // Speed multiplier
-  }),
-  text: "Speak faster!",
+  model: gerbil.speech(),        // native Kani-TTS-2
+  text: "Hello, welcome to Gerbil!",
 });
+// result.audio is the synthesized PCM clip
+await writeFile("output.wav", result.audio);
 ```
+> Voice/speed selection from the old Kokoro lane no longer applies — Kani-TTS-2 uses its own
+> default voice. For full control over the native speech path, use `engine.speak()` directly
+> (see [TTS docs](./tts.md)).
 ## Transcription (STT)
-Transcribe audio to text using Whisper:
+> **Native.** `gerbil.transcription()` runs Moonshine on the native engine (the
+> `whisper-tiny.en` default id is a vestigial label; English only). For direct control use
+> `MoonshineSTT` — see [STT docs](./stt.md). Requires WebGPU.
+Transcribe audio to text:
 ```typescript
 import { experimental_transcribe as transcribe } from "ai";
@@ -109,46 +129,17 @@ import { gerbil } from "@tryhamster/gerbil/ai";
 import { readFile } from "fs/promises";
 const result = await transcribe({
-  model: gerbil.transcription(),   // whisper-tiny.en by default
+  model: gerbil.transcription(),   // native Moonshine (English)
   audio: await readFile("audio.wav"),
 });
 console.log(result.text);              // "Hello world"
 console.log(result.language);          // "en"
-console.log(result.durationInSeconds); // 2.5
-console.log(result.segments);          // Timestamped segments
-```
-### Available Models
-```typescript
-const models = gerbil.listTranscriptionModels();
-// Models (smallest to largest):
-// - whisper-tiny.en (39M, English only, fastest)
-// - whisper-tiny (39M, multilingual)
-// - whisper-base.en (74M, English only)
-// - whisper-base (74M, multilingual)
-// - whisper-small.en (244M, English only)
-// - whisper-small (244M, multilingual)
-// - whisper-large-v3-turbo (809M, 80+ languages, best quality)
 ```
-### Larger Models
-```typescript
-// Use a larger model for better accuracy
-const result = await transcribe({
-  model: gerbil.transcription("whisper-base"),
-  audio: audioBuffer,
-});
-// Use multilingual model with language hint
-const result = await transcribe({
-  model: gerbil.transcription("whisper-small", { language: "es" }),
-  audio: spanishAudio,
-});
-```
+> The Whisper model family (multilingual variants, timestamped segments) has been removed.
+> The native path is Moonshine, English-only, and does not produce timestamps. For full
+> control use `MoonshineSTT` directly (see [STT docs](./stt.md)).
 ## Custom Provider
@@ -179,6 +170,24 @@ const transcript = await transcribe({
 });
 ```
+## Model Preloading
+Download models ahead of time via the provider:
+```typescript
+import { gerbil } from "@tryhamster/gerbil/ai";
+// Check if cached
+if (!(await gerbil.isCached("qwen3-0.6b"))) {
+  // Preload during app init
+  await gerbil.preload("qwen3-0.6b", {
+    onProgress: (p) => console.log(p.status, p.progress),
+  });
+}
+// Later: generateText loads instantly from cache
+```
 ## Specification
 Gerbil implements the following AI SDK v5 interfaces:

package/docs/architecture/overview.md CHANGED Viewed

@@ -20,6 +20,8 @@ Responsibilities:
 - Generation orchestration
 - Streaming coordination
 - Session statistics
+- Request queue (concurrency control)
+- Telemetry hooks for observability
 ### 2. Model Registry (`src/core/models.ts`)
@@ -57,21 +59,29 @@ Enables WebGPU in Node.js by using headless Chrome as a GPU accelerator:
                             └───────────┘
 ```
-### 4. Browser Worker (`src/browser/index.ts`)
+### 4. Browser Module (`src/browser/index.ts`)
-Provides `createGerbilWorker()` for browser applications:
+Provides React hooks and workers for browser applications:
 ```typescript
+// LLM worker
 const gerbil = await createGerbilWorker({
   modelId: "qwen3-0.6b",
   onToken: (token) => console.log(token.text),
 });
+// React hooks
+const { messages, handleSubmit } = useChat();
+const { speak, isSpeaking } = useSpeech();
+const { startRecording, transcript } = useVoiceInput();
 ```
-Uses an inline Web Worker to:
-- Load models without blocking the UI
-- Stream tokens in real-time
-- Manage GPU memory separately from main thread
+Uses inline Web Workers for:
+- **LLM**: Model loading, token streaming, GPU memory management
+- **TTS**: Kokoro/Supertonic speech synthesis
+- **STT**: Whisper transcription
+All workers load dependencies from CDN to avoid bundler issues with onnxruntime-web.
 ## Execution Paths
@@ -166,7 +176,7 @@ src/
 │   ├── tools.ts          # Tool calling system
 │   └── chrome-backend.ts # Node.js WebGPU via Chrome
 ├── browser/
-│   └── index.ts          # createGerbilWorker + utilities
+│   └── index.ts          # React hooks + LLM/TTS/STT workers
 ├── skills/
 │   └── ...               # Built-in skills (commit, summarize, etc.)
 ├── integrations/