npm - @tryhamster/gerbil - Versions diffs - 1.0.0-rc.9 → 1.0.0 - Mend

@tryhamster/gerbil 1.0.0-rc.9 → 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (179) hide show

package/LICENSE +1 -1
package/README.md +247 -84
package/dist/architectures-C1I5V3Dt.mjs +6070 -0
package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
package/dist/browser/index.d.ts +264 -588
package/dist/browser/index.d.ts.map +1 -1
package/dist/browser/index.js +585 -2334
package/dist/browser/index.js.map +1 -1
package/dist/cli.mjs +625 -1098
package/dist/cli.mjs.map +1 -1
package/dist/defaults-9komdrbY.mjs +24 -0
package/dist/defaults-9komdrbY.mjs.map +1 -0
package/dist/frameworks/express.d.mts +1 -3
package/dist/frameworks/express.d.mts.map +1 -1
package/dist/frameworks/express.mjs +7 -7
package/dist/frameworks/express.mjs.map +1 -1
package/dist/frameworks/fastify.d.mts +1 -1
package/dist/frameworks/fastify.d.mts.map +1 -1
package/dist/frameworks/fastify.mjs +3 -3
package/dist/frameworks/fastify.mjs.map +1 -1
package/dist/frameworks/hono.d.mts +1 -1
package/dist/frameworks/hono.d.mts.map +1 -1
package/dist/frameworks/hono.mjs +4 -4
package/dist/frameworks/hono.mjs.map +1 -1
package/dist/frameworks/next.d.mts +3 -2
package/dist/frameworks/next.d.mts.map +1 -1
package/dist/frameworks/next.mjs +4 -4
package/dist/frameworks/next.mjs.map +1 -1
package/dist/frameworks/react.d.mts +1 -1
package/dist/frameworks/trpc.d.mts +1 -1
package/dist/frameworks/trpc.d.mts.map +1 -1
package/dist/frameworks/trpc.mjs +4 -4
package/dist/frameworks/trpc.mjs.map +1 -1
package/dist/gerbil-BHrJJIa4.mjs +1656 -0
package/dist/gerbil-BHrJJIa4.mjs.map +1 -0
package/dist/gerbil-BT9fCydo.d.mts +488 -0
package/dist/gerbil-BT9fCydo.d.mts.map +1 -0
package/dist/gerbil-DomNfIr1.mjs +4 -0
package/dist/gpu/hooks.d.mts +520 -0
package/dist/gpu/hooks.d.mts.map +1 -0
package/dist/gpu/hooks.mjs +1188 -0
package/dist/gpu/hooks.mjs.map +1 -0
package/dist/gpu/index.d.mts +2 -0
package/dist/gpu/index.mjs +6 -0
package/dist/gpu-33qCAtHW.mjs +3615 -0
package/dist/gpu-33qCAtHW.mjs.map +1 -0
package/dist/index-Dgmb2kE3.d.mts +245 -0
package/dist/index-Dgmb2kE3.d.mts.map +1 -0
package/dist/index-jEAL2s-A.d.mts +2022 -0
package/dist/index-jEAL2s-A.d.mts.map +1 -0
package/dist/index.d.mts +22 -487
package/dist/index.d.mts.map +1 -1
package/dist/index.mjs +13 -8
package/dist/index.mjs.map +1 -1
package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
package/dist/integrations/ai-sdk.d.mts +75 -6
package/dist/integrations/ai-sdk.d.mts.map +1 -1
package/dist/integrations/ai-sdk.mjs +131 -15
package/dist/integrations/ai-sdk.mjs.map +1 -1
package/dist/integrations/langchain.d.mts +1 -1
package/dist/integrations/langchain.d.mts.map +1 -1
package/dist/integrations/langchain.mjs +5 -5
package/dist/integrations/langchain.mjs.map +1 -1
package/dist/integrations/llamaindex.d.mts +1 -1
package/dist/integrations/llamaindex.d.mts.map +1 -1
package/dist/integrations/llamaindex.mjs +5 -5
package/dist/integrations/llamaindex.mjs.map +1 -1
package/dist/integrations/mcp-client.mjs +3 -3
package/dist/integrations/mcp-client.mjs.map +1 -1
package/dist/integrations/mcp.d.mts +3 -2
package/dist/integrations/mcp.d.mts.map +1 -1
package/dist/integrations/mcp.mjs +5 -5
package/dist/{mcp-BvbriaBy.mjs → mcp-1DaMsaBc.mjs} +4 -4
package/dist/mcp-1DaMsaBc.mjs.map +1 -0
package/dist/memory/index.d.mts +3 -0
package/dist/memory/index.mjs +6 -0
package/dist/memory-D1P7Tmda.mjs +4 -0
package/dist/memory-DVN0MnIG.mjs +132 -0
package/dist/memory-DVN0MnIG.mjs.map +1 -0
package/dist/memory-Dj0J1v88.mjs +294 -0
package/dist/memory-Dj0J1v88.mjs.map +1 -0
package/dist/moonshine-stt-BLyVoRpB.mjs +4 -0
package/dist/moonshine-stt-v_P_Ci_m.mjs +11936 -0
package/dist/moonshine-stt-v_P_Ci_m.mjs.map +1 -0
package/dist/{one-liner-s-lD8rCC.mjs → one-liner-DnQn7HJK.mjs} +14 -16
package/dist/one-liner-DnQn7HJK.mjs.map +1 -0
package/dist/repl-jV5gcJFA.mjs +9 -0
package/dist/skills/index.d.mts +270 -320
package/dist/skills/index.d.mts.map +1 -1
package/dist/skills/index.mjs +5 -5
package/dist/{skills-CD3Orlex.mjs → skills-DX8D59UH.mjs} +187 -32
package/dist/skills-DX8D59UH.mjs.map +1 -0
package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
package/dist/tools-DQ1mPUw5.mjs.map +1 -0
package/dist/{types-CiTc7ez3.d.mts → types-D6FiR_oh.d.mts} +106 -12
package/dist/types-D6FiR_oh.d.mts.map +1 -0
package/dist/types-DQBe2lFo.d.mts +165 -0
package/dist/types-DQBe2lFo.d.mts.map +1 -0
package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
package/dist/vector-B0panuy6.mjs +95 -0
package/dist/vector-B0panuy6.mjs.map +1 -0
package/docs/PROJECT-STATE.md +321 -0
package/docs/adding-a-model-family.md +280 -0
package/docs/ai-sdk.md +70 -61
package/docs/architecture/overview.md +17 -7
package/docs/browser.md +203 -8
package/docs/embeddings.md +156 -0
package/docs/gerbil-site-native-migration.md +217 -0
package/docs/gpu-engine/architectures.md +398 -0
package/docs/gpu-engine/ir.md +372 -0
package/docs/gpu-engine/kernels.md +718 -0
package/docs/gpu-engine/paper.html +1759 -0
package/docs/gpu-engine/paper.md +2109 -0
package/docs/gpu-engine/safetensors.md +312 -0
package/docs/gpu-engine/tokenizer.md +302 -0
package/docs/memory-rag.md +91 -0
package/docs/metal-safari-intel.md +190 -0
package/docs/mobile-failure-diagnosis.md +124 -0
package/docs/mobile.md +99 -0
package/docs/observability.md +230 -0
package/docs/onnx-removal-plan.md +339 -0
package/docs/research/autoresearch-portable.md +904 -0
package/docs/research/dispatch-reduction-hivemind.md +84 -0
package/docs/research/ios-safari-model-caching.md +117 -0
package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
package/docs/research/native-stt-model-selection.md +49 -0
package/docs/research/native-tts-model-selection.md +90 -0
package/docs/research/native-vs-chromium-decision.md +152 -0
package/docs/research/nemotron-mamba2-inference.md +910 -0
package/docs/research/qwen35-multimodal.md +293 -0
package/docs/research/qwen36-gemma4-targets.md +337 -0
package/docs/research/sota-embedding-models.md +179 -0
package/docs/research/sota-mobile-models-2026.md +263 -0
package/docs/research/sota-modality-models.md +202 -0
package/docs/research/tps-baselines.md +71 -0
package/docs/research/webgpu-m4-reference.md +104 -0
package/docs/site-update-plan.md +155 -0
package/docs/structured-output.md +123 -0
package/docs/stt.md +63 -446
package/docs/tts.md +77 -499
package/docs/vision.md +100 -338
package/package.json +22 -7
package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
package/dist/gerbil-CJ3ifloF.mjs +0 -4
package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
package/dist/gerbil-qOTe1nl2.d.mts +0 -431
package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
package/dist/kokoro-BNTb6egA.mjs +0 -20210
package/dist/kokoro-BNTb6egA.mjs.map +0 -1
package/dist/kokoro-CMOGDSgT.js +0 -20212
package/dist/kokoro-CMOGDSgT.js.map +0 -1
package/dist/mcp-BvbriaBy.mjs.map +0 -1
package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
package/dist/repl-DveXw36T.mjs +0 -9
package/dist/skills-CD3Orlex.mjs.map +0 -1
package/dist/stt-Bu-E23Sc.js +0 -433
package/dist/stt-Bu-E23Sc.js.map +0 -1
package/dist/stt-CpLYbGFd.mjs +0 -433
package/dist/stt-CpLYbGFd.mjs.map +0 -1
package/dist/stt-DRPLEEHB.mjs +0 -3
package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
package/dist/transformers.web-DiD1gTwk.js +0 -44695
package/dist/transformers.web-DiD1gTwk.js.map +0 -1
package/dist/transformers.web-u34VxRFM.js +0 -3
package/dist/tts-CqroPaSK.js +0 -724
package/dist/tts-CqroPaSK.js.map +0 -1
package/dist/tts-DXgsKGCe.mjs +0 -3
package/dist/tts-DeGANMNV.mjs +0 -730
package/dist/tts-DeGANMNV.mjs.map +0 -1
package/dist/types-CiTc7ez3.d.mts.map +0 -1
/package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
/package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
/package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0

package/docs/metal-safari-intel.md ADDED Viewed

@@ -0,0 +1,190 @@
+# Metal/Safari WebGPU Intel
+Ongoing engineering log for Safari/Metal WebGPU compute shader debugging. Records findings, hypotheses, test results, and external references so we never retry failed approaches.
+---
+## Confirmed Facts (Hardware/Platform)
+- **iPad jetsam limit**: ~1.5-2GB for Safari web content processes. Model weights (Qwen3.5-0.8B INT4) are ~404MB, leaving ~1-1.5GB headroom.
+- **Desktop Chrome/Dawn**: All dispatch patterns work correctly (single pass, multi-pass, multi-submit). F32 and INT4 inference at 70-125 tok/s.
+- **Safari WebGPU**: Available in Safari 26+ (iOS 26+). Uses Metal backend.
+## The Core Problem
+Full forward pass (~400 dispatches per token) produces **all-zero logits from dispatch entry 2 onward** on Safari/Metal, while working perfectly on Chrome/Dawn.
+### Forward pass structure
+```
+Entry 0: EmbeddingInt4 — reads input_ids + quantized weights → writes embed_out        ✅
+Entry 1: RMSNorm       — reads embed_out + norm weights    → writes layer0_norm1_out   ✅
+Entry 2: MatVecInt4     — reads layer0_norm1_out + q weights → writes layer0_qkv        ❌ ZEROS
+Entry 3+: ALL ZEROS
+```
+Individual dispatches work correctly in isolation (separate queue.submit per test dispatch).
+---
+## Root Cause: Metal On-Chip Cache Coherence
+### WWDC25 Session 236 (Apple, "Unlock GPU computing with WebGPU")
+> "Command buffer boundaries require synchronization between high-speed on-chip memory and unified on-device memory."
+> "Unlike command buffers, **passes don't require synchronization with unified memory**."
+**Translation**: On Metal/Safari:
+- **Compute pass boundaries** (beginComputePass/end within one encoder) do NOT flush on-chip cache to unified memory
+- **Command buffer boundaries** (separate queue.submit calls) DO flush on-chip cache
+- Chrome/Dawn inserts barriers automatically between dispatches; Safari/Metal does NOT
+This is NOT a bug — it's Metal's design for performance. Chrome/Dawn is more conservative.
+### Why Dispatches 0-1 Work But 2+ Don't
+Likely a timing/size coincidence. Entry 0 (Embedding, large) and Entry 1 (RMSNorm, small) happen to have their writes flushed before Entry 2 reads them. Alternatively, their outputs may overlap in on-chip cache lines that get evicted naturally.
+---
+## Approaches Tried & Results
+### ❌ FAILED: Separate compute passes, one encoder, one submit
+```
+encoder = createCommandEncoder()
+for each dispatch:
+  pass = encoder.beginComputePass()
+  pass.setPipeline(...)
+  pass.setBindGroup(...)
+  pass.dispatchWorkgroups(...)
+  pass.end()  // ← Does NOT flush on-chip cache (per WWDC25)
+queue.submit([encoder.finish()])
+```
+**Result**: Same zeros. Pass boundaries don't provide Metal memory coherence.
+### ❌ FAILED: Atomic barrier passes between dispatches
+Inserted atomic operations on separate buffers between dispatches to force synchronization.
+**Result**: Same zeros. Atomic operations don't force cross-dispatch cache flushes on Metal.
+### ❌ FAILED: Multiple encoders, single queue.submit([~400 CBs])
+```
+commandBuffers = []
+for each dispatch:
+  encoder = createCommandEncoder()
+  // ... dispatch ...
+  commandBuffers.push(encoder.finish())
+queue.submit(commandBuffers)  // single submit with all CBs
+```
+**Result**: Same zeros. All command buffers are in ONE submit, so Metal may process them without full barriers.
+### ❌ FAILED: Multiple encoders, one submit per encoder, DRAIN_EVERY=5
+```
+for each dispatch:
+  encoder = createCommandEncoder()
+  // ... dispatch ...
+  queue.submit([encoder.finish()])
+  if (i % 5 === 0) await queue.onSubmittedWorkDone()
+```
+**Result**: OOM on iPad (tab crash/reload).
+### ❌ FAILED: Per-dispatch submit with await onSubmittedWorkDone (DRAIN_EVERY=1)
+```
+for each dispatch:
+  encoder = createCommandEncoder()
+  // ... dispatch ...
+  queue.submit([encoder.finish()])
+  await queue.onSubmittedWorkDone()  // Full GPU drain after EVERY dispatch
+```
+**Result**: OOM on iPad. The 400 JS↔GPU roundtrips + Promise overhead + command buffer allocations exceed memory.
+### ❌ DISPROVED: Shader variant alternation (argument buffer caching hypothesis)
+Hypothesis: Metal caches argument buffers per compiled shader function. Alternating between variant 0/1 of each shader would force fresh argument buffers.
+**Test Q**: Dispatched same shader with different bind groups and different output buffers in one compute pass. Both outputs were correct.
+**Result**: Hypothesis wrong. Argument buffer caching is NOT the issue.
+### ❌ FAILED: Converting var<uniform> to var<storage, read>
+Changed all uniform bindings to storage bindings to rule out uniform buffer caching.
+**Result**: Same zeros.
+### ❌ FAILED: Using writeBuffer instead of mappedAtCreation
+Changed all buffer initialization to writeBuffer.
+**Result**: Same zeros.
+### ✅ WORKS (but OOMs): Per-dispatch submit without batching
+Separate queue.submit() per dispatch does produce correct results (confirmed by probe tests), but causes OOM on iPad due to memory overhead.
+---
+## Untried Approaches
+### Per-dispatch submit WITHOUT await (fire-and-forget)
+```
+for each dispatch:
+  encoder = createCommandEncoder()
+  // ... dispatch ...
+  queue.submit([encoder.finish()])
+  // NO await — let GPU pipeline command buffers
+// Only await final readback
+```
+**Rationale**: The WebGPU spec guarantees queue.submit() ordering, and each submit provides a command buffer boundary (Metal cache flush). Without await, the GPU can pipeline command buffers while JS continues, which avoids the JS↔GPU roundtrip overhead. WebLLM uses this pattern.
+### Grouped submits by dependency level
+Instead of 400 individual submits, group independent dispatches (e.g., all dispatches in one transformer layer) into one encoder, submit after each group.
+**Rationale**: Reduces total submit count from ~400 to ~24-48 while maintaining barriers at dependency boundaries.
+---
+## Non-Metal Bugs Found
+### Double download (React 18 Strict Mode) — FIXED 2026-03-15
+`useNativeEngine` hook used `isLoading` (React state) as the double-call guard in `load()`. React state updates are batched/async, so in strict mode (dev server), the `useEffect` fires twice and BOTH calls pass the guard before `setIsLoading(true)` is processed. Two concurrent 1.75GB downloads run simultaneously, fighting over the progress bar (progress goes backwards) and doubling memory usage (crashes Chrome).
+**Fix**: Added `loadingRef` (a `useRef`) as a synchronous guard. Refs update immediately, blocking the second call.
+---
+## External References
+- **WWDC25 Session 236**: "Unlock GPU computing with WebGPU" — Key source for Metal/WebGPU coherence model
+- **WebLLM Issue #386**: "Models output is scrambled in Safari Technology Preview" — Same root cause, fixed with batched submits per operation. https://github.com/mlc-ai/web-llm/issues/386
+- **wgpu Issue #9221**: Metal lacks full `coherent(device)` until Metal 3.2 — WebKit may under-sync storage buffers
+- **gpuweb Issue #1842**: Discussion on writable storage buffer binding aliasing
+- **gpuweb Discussion #4755**: WebGPU limitations discussion including Metal coherence
+## Diagnostic Tests (All Run on iPad Safari)
+| Test | Description | Result |
+|------|-------------|--------|
+| Test J | Multi-dispatch, DIFFERENT pipelines, one compute pass | PASS (3,6,9,12) |
+| Test P | 300 dispatches, SAME pipeline, one compute pass | PASS |
+| Test N | Real RMSNorm kernel | PASS |
+| Test O | Real MatVec kernel | PASS |
+| Test Q | Same shader, different bind groups, different outputs | PASS |
+| Probe[0] | Entry 0 (Embedding) output | Correct |
+| Probe[1] | Entry 1 (RMSNorm) output | Correct |
+| Probe[2+] | Entry 2+ outputs | ALL ZEROS |
+---
+## Key Nuances
+1. **Test J passes but forward fails**: Test J uses ~4 dispatches with tiny data. The forward pass uses ~400 dispatches with large buffers. On-chip cache behavior depends on buffer sizes and access patterns.
+2. **Same-shader dispatches work (Test P)**: 300 dispatches of the SAME shader in one pass work because Metal can optimize memory access patterns for repeated use of the same compiled function. Different shaders break this optimization.
+3. **Selectivity matters**: Only dispatches with DIFFERENT pipelines that share storage buffers exhibit the zero-output bug. Independent dispatches (non-overlapping buffers) or same-pipeline dispatches work fine.
+4. **`select()` and `exp()` bugs**: Separate from the coherence issue. Safari/Metal has two correctness bugs in WGSL built-ins: `select()` is buggy (workaround: use if/else), and `exp(-1e30)` returns NaN instead of 0 (workaround: clamp the argument to max(x, -80.0)). Both are already fixed in our attention kernel.
+---
+## 2026-06-12: Multi-agent investigation — diagnosis + fixes (see docs/mobile-failure-diagnosis.md)
+Full 18-agent investigation report: `docs/mobile-failure-diagnosis.md`. Headlines:
+**The "one mobile bug" is FOUR bugs:**
+1. **jetsam-crash = local memory bug, NOT Metal.** Activation buffers were pre-allocated per-tensor at full maxSeqLen (~2.3GB at T=512, incl. a 485MB [T, vocab] logits buffer). Both DRAIN_EVERY=1 and =5 "OOMs" were buffer footprint, not roundtrip overhead — the log's "400 roundtrips + Promise overhead" explanation was wrong (a drained loop bounds in-flight CBs to ~1). FIXED: liveness-pooled activations (431 tensors → 20 buffers, 37MB at T=256) + SliceLastRow makes logits [1, vocab]. INT4@512 now ≈ 0.6-0.7GB total.
+2. **zero-logits = genuine threshold-dependent WebKit within-submission visibility bug.** NOT stale params (refuted: zeroed params produce *no* writes via the col<N guard, yet probes show zeros *overwriting* pre-seeded data), NOT silent alloc failure (refuted: failure is submit-strategy-dependent, allocation is not). Every passing diagnostic was tiny (≤2 pipelines, 16-byte buffers); production is ~441 dispatches/15+ pipelines/MB buffers. The "Metal has no cross-dispatch coherence by design" theory is contradicted by the WebGPU spec, by Tests J/P, and by llama.cpp running ~64 passes/CB on iOS 26.4 (WebKit bug 311598). Reduce + file upstream.
+3. **The recorded gibberish was zero-logits in disguise** (~276 tok/s = zero-work dispatches; paper.md:752 root-caused it). backlog.md's "unknown WGSL→MSL compiler bug" is stale. A REAL data race existed in all three attention variants' Q·K reduction (leader 0 reads smem[1..15] while leaders 1..15 write them, no barrier) — FIXED with two-phase reduction.
+4. **Desktop 161→19.7 tok/s regression**: isMetalBackend matched Dawn-on-Metal (vendor "apple"). FIXED: detection is now UA-based isWebKitWebGPU. Every desktop perf number recorded between 2f0cabc and this fix is poisoned.
+**Also fixed:** variant alternation deleted (disproven by own Test Q yet still active, ~850-900 pipelines); fire-and-forget per-dispatch submit (unbounded in-flight CBs, the documented WebKit anti-pattern) replaced by grouped submits with ONE CB in flight, sweepable via `?group=N` (default 1 = proven-correct floor); error scopes + onuncapturederror added (was 100% silent); diagnostic page default model was BF16 (~6GB F32 graph!) → now mlx-community 4bit; iOS maxSeqLen default 512 (was 4096 = 18.9GB request); Cache API writes >64MB skipped on WebKit (slice(0) doubled the shard).
+**Next on-device runs (in order):** B1 memory isolation (q4 + maxseq=64/512), B2 `?group=N` sweep {1,8,32,64,128}, B3 Test R bisect (pipelines/bindings/buffer-size axes) → minimal WebKit repro for upstream filing. Record results to .tmp-ipad-results.jsonl WITH location.href, iOS version, device.limits.

package/docs/mobile-failure-diagnosis.md ADDED Viewed

@@ -0,0 +1,124 @@
+# Gerbil WebKit/Metal Inference Failure — Final Diagnosis Report
+Date: 2026-06-12. Scope: Qwen3.5-0.8B INT4 on WebKit WebGPU (iPad/iPhone Safari 26+); collateral desktop regression on Dawn-on-Metal. HEAD = 38bc674 + uncommitted working tree. **Critical caveat applying to everything below: `scripts/engine/.tmp-ipad-results.jsonl` is 0 bytes — the current fire-and-forget strategy has never produced a recorded on-device result, and no past iPad run recorded its model URL, device limits, or params readback.**
+---
+## 1. Diagnosis
+The four failure modes have four **different** root causes. Do not conflate them.
+### 1.1 jetsam-crash — ROOT CAUSE: memory budget violated at every tested configuration (H1, SUPPORTED, high confidence)
+This is not a Metal bug. The footprint math, recomputed directly from `generateGraph()` with the real cached configs, exceeds the ~1.5–2GB iOS web-content budget (docs/metal-safari-intel.md:9) in every configuration ever run:
+| Config | GPU footprint |
+|---|---|
+| INT4 @ maxSeqLen=512 (the diagnostic setting, examples/ipad-diagnostic.html:299) | **2.77GB** (2.30GB activations across ~430 buffers + 0.44GB weights) |
+| — of which the logits buffer alone | **485MB** (`["T", vocab_size]` f32, src/gpu/architectures/qwen3_5.ts:932-937; only the last row ever read — src/gpu/executor.ts:386-393) |
+| INT4 @ 256 | 1.61GB |
+| Engine/React-hook default maxSeqLen=4096 (src/gpu/index.ts:168-172, src/browser/use-native-engine.ts:84) | 18.91GB |
+| Diagnostic page **default URL** (no `?model=` → BF16 `Qwen/Qwen3.5-0.8B`, no dtype q4 — ipad-diagnostic.html:291, model-loader.ts:519-523) | **6.08GB** — any default-URL iPad run was memory-killed regardless of GPU correctness, while the page label said "INT4" |
+Allocation is one buffer per activation tensor at full maxSeqLen with zero reuse (src/gpu/executor.ts:838-850; `"T"`/`"L_max"` → maxSeqLen at :841-845). On top: JS load transients — Cache-API `slice(0)` copy per range (model-loader.ts:127-128) and MLX zero-copy views pinning the whole download ArrayBuffer (mlx-adapter.ts:50).
+**This fully explains why both DRAIN_EVERY=5 and DRAIN_EVERY=1 "OOM'd" identically** (intel.md:85, :87-95): drain frequency was never the dominant cost; the buffer footprint was. The log's "400 roundtrips + Promise overhead" explanation (intel.md:95) is wrong — a fully drained loop bounds in-flight CBs to ~1.
+Secondary contributor (H2/H7): the current fire-and-forget path adds ~401 unbounded in-flight Metal command buffers per token (executor.ts:373-381, 489-498) — the exact anti-pattern Apple's Mike Wyrzykowski flags as "GPU resource exhaustion" in WebKit bug 311598 — and the still-active variant-alternation machinery compiles ~850-900 per-node shader modules/pipelines vs ~25 on Dawn (executor.ts:258-261, :1026-1073; device.ts:286-291).
+### 1.2 zero-logits — ROOT CAUSE: genuine, threshold-dependent WebKit within-submission visibility bug (H9, SUPPORTED, medium confidence) — NOT stale params, NOT silent allocation failure
+The decisive evidence triangulation:
+- **Strategy-dependence kills all local-bug theories.** CPU-side state (input_ids write, all ~400 params writes) is byte-identical between batched and per-dispatch paths — the writes all happen before the `needsMultiEncoder` branch (executor.ts:328-365 then :367). Only submit granularity differs, and that alone flips correct↔zeros (intel.md:49-75 vs :110-111). A stale-params bug (H3) or invalid-resource bug (H4) would corrupt both modes equally.
+- **The zeros are computed, not residual.** The diagnostic ladder runs isolated entry 2 (which works, intel.md:25) immediately before the batched forward, pre-seeding `layer0_qkv` with correct nonzero data; `reset()` doesn't clear activations (executor.ts:539-553). Probe[2] then reads all zeros (intel.md:21, :160-162) — entry 2 **executed and overwrote correct data with zeros computed from a stale view of its input**. The MatVecInt4 write-guard (`col < params.N`, registry.ts:515-521) means zeroed params would have written nothing and left the seed visible. This is a read-side visibility failure, not a params-write failure.
+- **The pass-table cannot refute a scale threshold.** Every passing single-submission diagnostic is tiny: Test J = 16-byte buffers, 2 pipelines, 3 dispatches (device.ts:980-1009); Test P = 300 dispatches but ONE pipeline, 16 bytes (device.ts:1445-1457); Test Q = 2 dispatches, one pipeline (device.ts:1495-1517). Production = ~441 dispatches, ~15+ distinct pipelines, 6-9 bindings, MB-scale buffers, workgroup_size(256). No diagnostic ever combined >2 pipelines, >2 bindings, or >64KB buffers in one submission. The intel doc's Key Nuances (:168 vs :172) are mutually inconsistent without a scale variable — Test J *is* different-pipelines-sharing-a-buffer and passes.
+- **The "Metal-by-design no-coherence" theory (intel.md:31-46) is wrong.** The WebGPU spec guarantees cross-dispatch storage visibility within a submission (gpuweb #4433/#4434); Tests J and P pass (dependent chains in one submit); and llama.cpp's WebGPU backend computes correctly at ~64 passes/CB on iOS 26.4 (WebKit bug 311598). The WWDC25 quote at executor.ts:368-372 inverts the session's intent: it explains why CB boundaries are *expensive*, not why they're needed for coherence.
+This is a filable WebKit bug once the threshold (pipeline count / binding count / resident-set size) is bisected — no public report covers zeros-within-one-submit. The bisect result also directly sets the grouped-submit CB size (H2 fix).
+### 1.3 gibberish — TWO components
+**(a) The recorded 2026-03-11 gibberish was zero-logits in disguise (H6 REFUTED for the recorded event).** paper.md:752 root-causes it: writeBuffer during an active pass → stale zero params → all threads early-exit → degenerate output at ~276 tok/s. That throughput — 2-4x faster than desktop M4 Max — is the signature of zero-work dispatches, not miscompiled math. backlog.md:23's "unknown Safari WGSL→MSL compiler bug" is a stale snapshot predating diagnostics H-Q and d574cdb's two-phase fix. The packed-f16/loop-bound miscompile theory is additionally contradicted by Tests N/O (real RMSNorm/MatVec kernels with storage-params loop bounds) passing on iPad (intel.md:157-158), and the packed-f16 kernels only ever saw already-zero inputs in failing runs (zeros start at entry 2, before any KV/attention dispatch).
+**(b) A real, confirmed WGSL data race remains as the live candidate for any FUTURE gibberish once coherence is fixed (H5 SUPPORTED, medium confidence).** All three attention variants contain identical spec-level UB in the Q·K score reduction: after the publish barrier, leader threads read `smem[tid..tid+15]` and write `smem[pos_in_tile]` (0..15) with no barrier between cross-simdgroup reads and writes — leader 0's read range is exactly the write target of leaders 1..15 (registry.ts:942-954 verified above; identically at :1884-1895 and :2144-2155). Every kvMode ships it (selection at executor.ts:185). No diagnostic exercises the production attention kernel at occupancy (the kernel sits at exactly 16384 bytes smem, the spec minimum, registry.ts:860-863). It stays benign on Tint/M-series; WebKit's independent WGSL→MSL compiler + A-series scheduling may not be so forgiving. It corrupts only the first key position of each 16-position tile → plausible-but-wrong tokens. Cheap to fix (one extra barrier), so fix it unconditionally.
+### 1.4 desktop-regression — ROOT CAUSE: predicate conflates Apple GPU with WebKit implementation (H8, SUPPORTED, high confidence, verified live)
+`isMetalBackend = vendor === "apple" || vendor.includes("apple") || arch.startsWith("common")` (src/gpu/device.ts:136) matches dawn-node on the M4 Max (live probe: vendor "apple", arch "metal-3", device "apple-m4-max"). The UA fallback (device.ts:143-152) never runs in node because adapter.info exists. Consequence, verified by execution: node-dawn takes the full workaround path (`needsMultiEncoder = ctx.isMetalBackend`, executor.ts:133) including variant alternation ("reassigned 441/441 prefill pipelines" printed on desktop), producing the recorded 161.8 → 19.7 tok/s collapse at HEAD (results.jsonl, gitHash 24d444a vs final baseline entry) and a residual 169 → 125 tok/s regression in the working tree. Every desktop *performance* number since 2f0cabc is corrupted; post-commit "Chrome on Mac works" observations never tested the single-encoder path Safari fails on (correctness contrasts survive — the workaround path passes correctness on Dawn).
+---
+## 2. Conclusively ruled out — do not retry
+1. **H3 — stale/dropped params writes as the cause of zero-logits (REFUTED).** The staging-buffer "mitigation" in the class comment (executor.ts:95-103) is dead code — allocated (:307-312), destroyed (:820-823), never copied from, in any commit including d574cdb. The `lastParamsBytes` cache (:338-362) would make a drop sticky. All true — but the mechanism is refuted: zeroed params produce *no writes* (write-guard registry.ts:515-521), yet probe[2] shows zeros overwriting pre-seeded nonzero data; the "early writes dropped" hazard predicts entry 0 failing first, the observed pattern is the inverse; and the identical 400-write burst precedes the known-correct per-dispatch runs. Stop chasing params delivery. (Keep the dead-code cleanup and the on-device params readback as hygiene — §5.)
+2. **H4 — silent allocation/validation failure as the cause of zero-logits (REFUTED).** Invalid buffer → invalid bind group → skipped dispatch is submit-strategy-invariant; the observed failure is strategy-dependent with bind groups created once (executor.ts:166-215). Correct per-dispatch logits at maxSeqLen=512 prove the 485MB logits buffer was created and bound successfully on the test iPad, and the default-limits fallback (device.ts:105-115) did not fire there. The missing error scopes remain a real observability gap (zero `pushErrorScope` anywhere in src/gpu) — fix it, but it isn't the cause.
+3. **H6 — packed-f16 / dynamic-loop-bound miscompilation as the cause of the recorded gibberish (REFUTED).** See §1.3(a). Residual risk only for future multi-token decode at S>1 — closed by one cheap A/B (`?kvf32=1`), §5.
+4. **Shader variant alternation (2f0cabc).** Already disproven by the project's own Test Q (intel.md:97-100) and 38bc674's commit message — yet still active (executor.ts:258-261). Delete it; never reintroduce.
+5. **Atomic-barrier passes, uniform→storage conversion, writeBuffer-vs-mappedAtCreation, separate-passes/separate-encoders-one-submit** — all recorded failed (intel.md:49-108).
+6. **Drained per-dispatch loops (DRAIN_EVERY=1/5) as tested** — they failed for memory reasons (H1), not because draining is wrong; don't re-run them at maxSeqLen=512 expecting different results.
+7. **"Metal provides no cross-dispatch coherence by design, not a bug" (intel.md:31-46)** — contradicted by spec, by Tests J/P, and by llama.cpp on iOS 26.4. Stop designing around it as expected behavior; reduce and file it.
+---
+## 3. Fix plan
+### A. High-confidence fixes to make NOW (no iPad needed)
+**A1. Fix the detection predicate (H8).** src/gpu/device.ts:123-152: drop the adapter.info vendor/arch test for `isMetalBackend`; compute it solely from WebKit-implementation detection (the existing fallback at :144-150 is nearly correct — promote it: `AppleWebKit && !Chrome/` or iOS UA; node-dawn with no userAgent → false). Rename to `isWebKitWebGPU`. Expected: node-dawn back to ~160-170 tok/s; verify with `node scripts/engine/benchmark.mjs` — no "[executor] Safari/Metal detected" line. **Do this first: it un-poisons every subsequent desktop measurement.**
+**A2. Delete variant alternation (H7).** Remove executor.ts:258-261 and `assignMetalVariants` (:1026-1073); drop the per-node `uniqueKey` from the fused-SwiGLU pipeline call (executor.ts:941). Collapses ~850-900 Metal pipeline states to ~25, shortens 'engine:compiling-shaders', removes the decode/prefill shared-entry overwrite incoherence (executor.ts:245 aliasing).
+**A3. Fix the attention race (H5).** Two-phase the leader reduction at registry.ts:947-953, :1888-1894, :2148-2154: hoist the sum into a local, `workgroupBarrier()` in *uniform* control flow, then a second guarded block writes `smem[pos_in_tile]`. (The barrier cannot go inside the existing `if` — non-uniform barrier is a WGSL validation error.) One extra barrier per KV tile; benchmark on desktop to confirm negligible cost.
+**A4. Shrink logits to [1, vocab] (H1).** qwen3_5.ts:932-937 → shape `[1, vocab_size]`; slice the last row of `final_norm_out` (small copy or kernel) and run lm_head with M=1; readback offset → 0 in executor.ts:382-393 and :406-417. Saves 485MB at T=512 **and** removes the dominant prefill compute (full-vocab matmul over all T rows). Pure win on desktop too.
+**A5. Activation buffer aliasing (H1; backlog.md:47-51 already calls for it).** In allocateActivationBuffers (executor.ts:838-850): last-use liveness over `graph.executionOrder`, pool by size class. ~430 buffers → ~a dozen live (~50-150MB at T=512). Combined with A4: INT4@512 total ≈ 0.44GB weights + ~0.15GB activations + ~0.03GB ssm/kv ≈ **0.6-0.7GB — inside the iPad budget**.
+**A6. Fix the diagnostic default-model trap.** ipad-diagnostic.html:291: fallback repo → `mlx-community/Qwen3.5-0.8B-4bit`, or refuse non-quantized configs on iOS. The current default silently builds a 6.08GB F32 graph under an "INT4" label.
+**A7. Add error scopes + limit logging.** Wrap allocateActivationBuffers / uploadWeights / initBindGroups in `pushErrorScope('validation')`+`('out-of-memory')`; register `device.onuncapturederror`; emit `device.limits.maxBufferSize`/`maxStorageBufferBindingSize` through the diagnostic stream. Currently every iPad observation is collected blind.
+**A8. Make forwardArgmax consistent.** Split the argmax dispatch and the 4-byte readback copy (executor.ts:500-508) into two submits, mirroring forward()'s :382-395 — or keep one CB *deliberately* as a coherence-theory probe. Pick one intentionally; today the code contradicts itself.
+**A9. Trim load transients.** model-loader.ts:127-128: skip the `slice(0)` cache copy for weight shards on iOS; mlx-adapter.ts:50: copy packed arrays out of the download buffer per-tensor so the multi-hundred-MB ArrayBuffer can be GC'd before upload.
+### B. Experiments requiring iPad validation (in order; all stream results into `.tmp-ipad-results.jsonl` including `location.href`, limits, and crash-phase — none of this exists today)
+**B1. Memory isolation (H1 confirmation).** Landing A1-A9 first isn't even required for this: run `?model=mlx-community/Qwen3.5-0.8B-4bit&maxseq=64` (footprint ≈0.8GB pre-fix) → expect survival; `&maxseq=512` → expect kill mid-run; default URL → expect kill during upload. Three runs with no code changes finally separate memory from correctness.
+**B2. Grouped-submit sweep (H2/H9 — the decisive correctness experiment).** Replace the per-dispatch loops (executor.ts:367-395, :487-509) with a parameterized grouped loop: N dispatches per CB, one compute pass per dispatch, `await onSubmittedWorkDone()` per group (exactly one CB in flight — Levine's stable iOS 26.4 recipe, WebKit bug 311598). Sweep N ∈ {1, 8, 32, 64, 128, 401} at q4 + maxSeqLen≤256, variants deleted, ~30 tokens each. Readouts: groupSize=1 is the correctness floor (known-correct per intel.md:110-111); if 32-64 is correct and stable → ship it and both zero-logits and crash modes close; if any N>1 zeros at the group's first MatVecInt4 → the WebKit bug bites below CB granularity on this OS version and N=1-with-await ships as the floor. Note the tension honestly: Gerbil's own log shows >1-dispatch-per-CB zeroing (intel.md:49-75), while llama.cpp's 64-passes/CB works — the sweep adjudicates.
+**B3. Test R bisect (H9 → filable WebKit bug).** Add to runGPUDiagnostics (insert before the return at device.ts:~1545): single-submission dependent chains, sweeping one axis at a time from the Test-J baseline: buffer size 4KB→64MB, distinct pipelines 2→32, chain length 3→441, bindings 2→6, workgroup_size(256)+`var<workgroup>`. Each config is run both batched AND per-dispatch (ground truth). No model load, <700MB. First failing tuple = the minimal repro for bugs.webkit.org and the sizing constant for B2. If nothing fails up to production shape, the residual is local — revisit with the H3 discriminating probe.
+**B4. H3/coherence discriminator (cheap, piggyback on B2).** Force the batched path once, pre-write sentinel 7777.0 into `layer0_qkv` via debugWriteBuffer (executor.ts:773-777), forward, then debugPipelineProbe(1) (executor.ts:686-771) recording `uniformParams` for entries 2-5. params correct + qkv=0.0 → coherence/visibility confirmed on-device (expected); params zeroed + qkv still 7777.0 → reopen H3.
+**B5. Gibberish closure (H5/H6 residual).** After B2 lands a correct config: (i) 200x loop of the production ATTENTION_PACKED_F16 kernel at S=64 vs JS reference — nondeterminism confined to key positions 0/16/32/48 confirms the race manifests on WebKit; (ii) A/B the same >16-token generation pre/post the A3 barrier fix, and packed-f16 vs `?kvf32=1` (index.ts:123-124), diffing token IDs against node-dawn references. Also file the select()/exp(-1e30) miscompiles upstream (intel.md:174 — unreported; WebKit fixes this class in weeks).
+---
+## 4. Memory/crash mitigation (jetsam survival checklist)
+1. **Logits [1, vocab]** (A4): −485MB at T=512.
+2. **Activation aliasing** (A5): −~2.15GB at T=512.
+3. **Honor a hard iOS cap**: backend-selector's 512-token cap (src/browser/backend-selector.ts:43-53) is right; also clamp the engine/React defaults (index.ts:168-172, use-native-engine.ts:84) — 4096 must never reach an iOS device (18.9GB request).
+4. **Never load BF16 on iOS** (A6): refuse non-quantized configs or force q4.
+5. **Bound in-flight CBs**: grouped submits with one CB in flight (B2). Fire-and-forget's ~401 unbounded CBs/token is the documented WebKit resource-exhaustion pattern even after buffer fixes.
+6. **Keep the uncommitted JS-weight-free-after-upload** (executor.ts:147-157) and add the transient trims (A9).
+7. **Post-fix target**: ~0.6-0.7GB total GPU at T=512 — comfortable headroom under the ~1.5-2GB budget for the first time in the project's history.
+---
+## 5. Open questions (on-device only, minimal test each)
+1. **Does grouped submission (N=32-64/CB, one in flight) produce correct logits on this WebKit version?** — B2 sweep. This is the single highest-value unknown; it decides whether the WebKit bug is per-CB or sub-CB granularity.
+2. **What is the zeros threshold (pipelines/bindings/resident set)?** — B3 Test R. Output doubles as the WebKit bug filing. Note: the dependency-carrying buffer at the failing edge (norm1_out, decode T=1) is KB-scale, so sweep *shape*, not just size.
+3. **Do params arrive intact in batched mode on-device?** — B4 sentinel probe (closes the last sliver of H3; never recorded).
+4. **Does the attention race manifest on A-series WebKit?** — B5(i) 200x determinism loop.
+5. **Does packed-f16 KV survive realistic S on-device?** — B5(ii) `?kvf32=1` A/B (never recorded; grep finds kvf32 only at index.ts:123 and paper.md:507).
+6. **Which iOS version is the test iPad on?** — bug 311598 reports 26.4 markedly more stable than 26.3.1; log it in every result row. Also log `device.limits` (256MB-993MB maxBufferSize range across devices determines whether the 0.44GB weights blob needs sharding on smaller phones).
+7. **Is any crash actually JSC/WASM-side, not GPU?** — onnxruntime #26827 phenotype (WebKit 26 OMG/B3 recompilation loop, 400% CPU, runaway RAM). One run with WebGPU idle after tokenizer init, watching CPU/memory, rules it in or out.
+### Bottom line
+Per failure mode: **jetsam-crash** = local memory bug (H1), fully fixable in src/gpu without touching dispatch strategy; **zero-logits** = genuine WebKit within-submission visibility bug at scale (H9), workaround granularity to be set by the B2/B3 sweeps (floor: 1 dispatch/CB with await, proven correct; target: 32-64/CB per llama.cpp's existence proof), and worth filing upstream; **gibberish** = historically a zeros artifact (refuted H6), prospectively the confirmed attention race (H5) — fix unconditionally; **desktop-regression** = local predicate bug (H8), one-line-class fix. The committed HEAD is known-broken on the target device, the working tree implements the documented WebKit anti-pattern, and nothing currently in the repo has a recorded iPad result — land A1-A9, then run B1/B2 before writing another line of workaround code.

package/docs/mobile.md ADDED Viewed

@@ -0,0 +1,99 @@
+# Mobile (iOS / Android / Safari)
+Gerbil runs entirely on-device via WebGPU, so it works on modern phones and
+tablets — but mobile browsers impose two hard ceilings that desktop doesn't.
+Understanding them (and one setup step for your users) is the difference between
+a great mobile experience and a frustrating one.
+## The two ceilings
+On-device models are large — the default Qwen3.5-0.8B 4-bit is ~404 MB; vision and
+larger models are GBs. Mobile browsers, **iOS Safari especially**, wall a web
+origin off from the device with two independent limits:
+| Ceiling | What it limits | iOS Safari (uninstalled) | The fix |
+|---|---|---|---|
+| **Storage quota** | Disk for the model cache | ~1 GB, **evictable**, regardless of free disk | Install to Home Screen → persistent storage |
+| **Tab memory** | RAM during load + inference | ~hundreds of MB | Smaller model / fewer dispatches |
+A device with 40 GB free disk still only grants an *uninstalled* Safari tab ~1 GB,
+and that 1 GB can be evicted between visits — so without the step below, a large
+model **re-downloads every visit**.
+## Recommend "Add to Home Screen" (PWA) for the best mobile UX
+The storage ceiling lifts when your site is **installed to the Home Screen** as a
+PWA. Installed, iOS grants **persistent storage** tied to real disk (tens of GB,
+never evicted), so models download **once** and stay cached forever. This is the
+single biggest mobile UX win you can offer your users.
+iOS Safari has **no programmatic install prompt** — installation is manual (Share →
+Add to Home Screen) — so the right pattern is to *detect* the situation and nudge
+the user before a large download.
+## The storage helpers
+`@tryhamster/gerbil/browser` exposes helpers so you can surface this to users:
+```ts
+import {
+  getStorageStatus,
+  canCacheModel,
+  requestPersistentStorage,
+  getInstallGuidance,
+  isStandalone,
+} from "@tryhamster/gerbil/browser";
+// Before downloading a large model on mobile, check whether it will cache.
+const fit = await canCacheModel(404); // model size in MB
+if (fit.recommendInstall) {
+  // Not installed (or won't fit the current quota) — nudge the user.
+  const { manual, steps } = getInstallGuidance();
+  // manual === true on iOS (show the Share → Add to Home Screen steps);
+  // on Android/Chrome you can capture `beforeinstallprompt` for a one-tap button.
+  showBanner(steps);
+}
+// Ask for eviction-exempt storage (best-effort; on iOS effectively granted only
+// once installed to the Home Screen).
+await requestPersistentStorage();
+// Full snapshot for diagnostics / UI.
+const s = await getStorageStatus();
+// { quotaMB, usageMB, availableMB, persisted, installed, ios }
+```
+| Helper | Returns |
+|---|---|
+| `getStorageStatus()` | `{ quotaMB, usageMB, availableMB, persisted, installed, ios }` |
+| `canCacheModel(sizeMB)` | `{ fits, availableMB, recommendInstall }` |
+| `requestPersistentStorage()` | `boolean` — whether storage is now persistent |
+| `getInstallGuidance()` | `{ installed, manual, steps }` — platform-aware install steps |
+| `isStandalone()` / `isIOS()` | `boolean` |
+## What Gerbil handles automatically
+- **Per-device speed tuning.** On WebKit, the safe baseline is to submit and drain
+  each GPU dispatch separately, which leaves decode bound by CPU↔GPU round-trips.
+  To recover speed, Gerbil batches dispatches per command buffer and
+  **auto-calibrates the safe batch size per device** (a crash-surviving probe
+  persisted in `localStorage`): tablets/desktop Safari jump to a fast group
+  automatically, phones stay at the safe floor. On a modern iPad this is ~3–4×
+  faster than the unbatched floor. You can override the group size with `?group=N`.
+- **Durable, self-healing model cache.** Weights are cached per-tensor (OPFS where
+  available, else CacheStorage), and superseded cache namespaces are auto-evicted
+  on load so orphaned data can't fill the quota and block new caching.
+- **`dtype: "auto"`** picks INT4 on mobile to keep both download size and memory
+  in budget.
+## Practical recommendations
+1. Default to a small 4-bit model on mobile (the ~404 MB Qwen3.5-0.8B is a good
+   baseline; avoid GB-scale models unless the user has installed the PWA).
+2. Make your site **installable** (web app manifest + service worker for the app
+   shell — *not* the model; Gerbil caches the model itself) and nudge mobile users
+   to install before a large download using the helpers above.
+3. Call `requestPersistentStorage()` early.
+4. Expect the first load to download the model once; subsequent loads read from
+   cache (instant) — provided the model fits the quota (on iOS, that effectively
+   means installing to the Home Screen for anything approaching 1 GB).

package/docs/observability.md ADDED Viewed

@@ -0,0 +1,230 @@
+# Production Observability
+Gerbil includes built-in support for production observability through telemetry hooks and request queuing.
+## Telemetry Hooks
+Configure telemetry hooks to integrate with Sentry, DataDog, or any monitoring system:
+```typescript
+import { Gerbil } from "@tryhamster/gerbil";
+import * as Sentry from "@sentry/node";
+const g = new Gerbil({
+  telemetry: {
+    // Called on any error (model load, generation, etc.)
+    onError: (error, context) => {
+      Sentry.captureException(error, {
+        extra: context,
+        tags: { operation: context.operation },
+      });
+    },
+    // Called after successful generation
+    onGenerate: (event) => {
+      console.log(`Generated ${event.result.tokensGenerated} tokens`);
+      // Track in your metrics system
+      metrics.histogram("gerbil.tokens_generated", event.result.tokensGenerated);
+      metrics.histogram("gerbil.tokens_per_second", event.result.tokensPerSecond);
+    },
+    // Called after model loading (success or failure)
+    onModelLoad: (event) => {
+      if (event.success) {
+        console.log(`Loaded ${event.modelId} in ${event.loadTimeMs}ms on ${event.device}`);
+      } else {
+        console.error(`Failed to load ${event.modelId}: ${event.error}`);
+      }
+    },
+    // Called when requests wait in queue (>100ms)
+    onQueueWait: (waitTimeMs) => {
+      metrics.histogram("gerbil.queue_wait_ms", waitTimeMs);
+    },
+  },
+});
+```
+### Telemetry Events
+#### `onError(error, context)`
+Called whenever an error occurs during Gerbil operations.
+```typescript
+type ErrorContext = {
+  operation: "generate" | "load" | "embed" | "speak" | "transcribe" | "json";
+  modelId?: string;
+  extra?: Record<string, unknown>;
+};
+```
+#### `onGenerate(event)`
+Called after successful text generation.
+```typescript
+type GenerateEvent = {
+  modelId: string;
+  result: GenerateResult;
+  cached: boolean;
+  queueTimeMs?: number; // Only if waited >100ms
+};
+```
+#### `onModelLoad(event)`
+Called after model loading completes (success or failure).
+```typescript
+type ModelLoadEvent = {
+  modelId: string;
+  loadTimeMs: number;
+  fromCache: boolean;
+  device: "webgpu" | "cpu" | "wasm";
+  success: boolean;
+  error?: string;
+};
+```
+#### `onQueueWait(waitTimeMs)`
+Called when a request waits in the queue for more than 100ms. Useful for detecting congestion.
+## Request Queue
+Gerbil uses a request queue to prevent GPU OOM errors under concurrent load. LLM inference can only run one request at a time on the GPU.
+### Default Behavior
+- **Concurrency**: 1 (single request at a time)
+- **Timeout**: 5 minutes (300,000ms)
+- Requests are processed in FIFO order
+- A timeout error is thrown if a request exceeds the timeout
+### Custom Configuration
+```typescript
+const g = new Gerbil({
+  concurrency: {
+    maxConcurrent: 1,       // Max parallel requests (default: 1)
+    timeout: 300_000,       // Request timeout in ms (default: 5 min)
+  },
+});
+```
+### Why Queue?
+LLM inference on GPU is:
+1. **Memory-hungry**: Models consume most of the GPU's VRAM
+2. **Non-concurrent**: Running multiple inferences simultaneously causes OOM
+3. **Variable duration**: Generation time depends on output length
+The queue ensures:
+- Predictable memory usage
+- No OOM crashes under load
+- Fair request ordering
+## Rate Limiting
+Gerbil does **not** include rate limiting. This is intentional—rate limiting is best handled at the application layer using middleware specific to your framework:
+```typescript
+// Express
+import rateLimit from "express-rate-limit";
+import { gerbil } from "@tryhamster/gerbil/express";
+app.use("/ai", rateLimit({ windowMs: 60000, max: 10 }));
+app.use("/ai", gerbil());
+// Next.js
+import { Ratelimit } from "@upstash/ratelimit";
+import { Redis } from "@upstash/redis";
+const ratelimit = new Ratelimit({
+  redis: Redis.fromEnv(),
+  limiter: Ratelimit.slidingWindow(10, "60s"),
+});
+export async function POST(req: Request) {
+  const ip = req.headers.get("x-forwarded-for") ?? "anonymous";
+  const { success } = await ratelimit.limit(ip);
+  if (!success) return Response.json({ error: "Rate limited" }, { status: 429 });
+  // Continue with Gerbil...
+}
+```
+## Example: Full Production Setup
+```typescript
+import { Gerbil } from "@tryhamster/gerbil";
+import * as Sentry from "@sentry/node";
+Sentry.init({ dsn: process.env.SENTRY_DSN });
+const g = new Gerbil({
+  model: "qwen3-0.6b",
+  telemetry: {
+    onError: (error, context) => {
+      Sentry.captureException(error, { extra: context });
+    },
+    onGenerate: ({ result, queueTimeMs }) => {
+      // Log slow generations
+      if (result.totalTime > 10000) {
+        console.warn(`Slow generation: ${result.totalTime}ms`);
+      }
+      // Track queue congestion
+      if (queueTimeMs && queueTimeMs > 5000) {
+        Sentry.captureMessage("High queue wait time", {
+          level: "warning",
+          extra: { queueTimeMs },
+        });
+      }
+    },
+    onModelLoad: (event) => {
+      if (!event.success) {
+        Sentry.captureMessage(`Model load failed: ${event.error}`, {
+          level: "error",
+          extra: event,
+        });
+      }
+    },
+  },
+  concurrency: {
+    maxConcurrent: 1,
+    timeout: 120_000, // 2 minute timeout for your use case
+  },
+});
+// Preload model on startup
+await g.loadModel();
+console.log("Gerbil ready for production");
+```
+## Health Checks
+For production deployments, implement a health check endpoint:
+```typescript
+// Express
+app.get("/health", async (req, res) => {
+  try {
+    const info = g.getInfo();
+    res.json({
+      status: "ok",
+      model: info.model?.id,
+      device: info.device.backend,
+      ready: info.device.status === "ready",
+    });
+  } catch (error) {
+    res.status(503).json({ status: "error", message: String(error) });
+  }
+});
+```