npm - @tryhamster/gerbil - Versions diffs - 1.0.0-rc.9 → 1.0.1 - Mend

@tryhamster/gerbil 1.0.0-rc.9 → 1.0.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (179) hide show

package/LICENSE +1 -1
package/README.md +318 -104
package/dist/architectures-C1I5V3Dt.mjs +6070 -0
package/dist/architectures-C1I5V3Dt.mjs.map +1 -0
package/dist/browser/index.d.ts +276 -590
package/dist/browser/index.d.ts.map +1 -1
package/dist/browser/index.js +592 -2334
package/dist/browser/index.js.map +1 -1
package/dist/cli.mjs +625 -1098
package/dist/cli.mjs.map +1 -1
package/dist/defaults-9komdrbY.mjs +24 -0
package/dist/defaults-9komdrbY.mjs.map +1 -0
package/dist/frameworks/express.d.mts +1 -3
package/dist/frameworks/express.d.mts.map +1 -1
package/dist/frameworks/express.mjs +7 -7
package/dist/frameworks/express.mjs.map +1 -1
package/dist/frameworks/fastify.d.mts +1 -1
package/dist/frameworks/fastify.d.mts.map +1 -1
package/dist/frameworks/fastify.mjs +3 -3
package/dist/frameworks/fastify.mjs.map +1 -1
package/dist/frameworks/hono.d.mts +1 -1
package/dist/frameworks/hono.d.mts.map +1 -1
package/dist/frameworks/hono.mjs +4 -4
package/dist/frameworks/hono.mjs.map +1 -1
package/dist/frameworks/next.d.mts +3 -2
package/dist/frameworks/next.d.mts.map +1 -1
package/dist/frameworks/next.mjs +4 -4
package/dist/frameworks/next.mjs.map +1 -1
package/dist/frameworks/react.d.mts +1 -1
package/dist/frameworks/trpc.d.mts +1 -1
package/dist/frameworks/trpc.d.mts.map +1 -1
package/dist/frameworks/trpc.mjs +4 -4
package/dist/frameworks/trpc.mjs.map +1 -1
package/dist/gerbil-BetB5xb0.d.mts +488 -0
package/dist/gerbil-BetB5xb0.d.mts.map +1 -0
package/dist/gerbil-CTZUa8EZ.mjs +4 -0
package/dist/gerbil-DNniplr4.mjs +1656 -0
package/dist/gerbil-DNniplr4.mjs.map +1 -0
package/dist/gpu/hooks.d.mts +640 -0
package/dist/gpu/hooks.d.mts.map +1 -0
package/dist/gpu/hooks.mjs +1369 -0
package/dist/gpu/hooks.mjs.map +1 -0
package/dist/gpu/index.d.mts +2 -0
package/dist/gpu/index.mjs +6 -0
package/dist/gpu-DFuglcEx.mjs +3790 -0
package/dist/gpu-DFuglcEx.mjs.map +1 -0
package/dist/index-Dgmb2kE3.d.mts +245 -0
package/dist/index-Dgmb2kE3.d.mts.map +1 -0
package/dist/index-DukkJRMj.d.mts +2114 -0
package/dist/index-DukkJRMj.d.mts.map +1 -0
package/dist/index.d.mts +22 -487
package/dist/index.d.mts.map +1 -1
package/dist/index.mjs +13 -8
package/dist/index.mjs.map +1 -1
package/dist/indexeddb-store-BWIMtxxH.mjs +103 -0
package/dist/indexeddb-store-BWIMtxxH.mjs.map +1 -0
package/dist/indexeddb-store-ClH12Xnl.mjs +4 -0
package/dist/integrations/ai-sdk.d.mts +75 -6
package/dist/integrations/ai-sdk.d.mts.map +1 -1
package/dist/integrations/ai-sdk.mjs +131 -15
package/dist/integrations/ai-sdk.mjs.map +1 -1
package/dist/integrations/langchain.d.mts +1 -1
package/dist/integrations/langchain.d.mts.map +1 -1
package/dist/integrations/langchain.mjs +5 -5
package/dist/integrations/langchain.mjs.map +1 -1
package/dist/integrations/llamaindex.d.mts +1 -1
package/dist/integrations/llamaindex.d.mts.map +1 -1
package/dist/integrations/llamaindex.mjs +5 -5
package/dist/integrations/llamaindex.mjs.map +1 -1
package/dist/integrations/mcp-client.mjs +3 -3
package/dist/integrations/mcp-client.mjs.map +1 -1
package/dist/integrations/mcp.d.mts +3 -2
package/dist/integrations/mcp.d.mts.map +1 -1
package/dist/integrations/mcp.mjs +5 -5
package/dist/{mcp-BvbriaBy.mjs → mcp-D2vvH1Xc.mjs} +4 -4
package/dist/mcp-D2vvH1Xc.mjs.map +1 -0
package/dist/memory/index.d.mts +3 -0
package/dist/memory/index.mjs +6 -0
package/dist/memory-D1P7Tmda.mjs +4 -0
package/dist/memory-DVN0MnIG.mjs +132 -0
package/dist/memory-DVN0MnIG.mjs.map +1 -0
package/dist/memory-Dj0J1v88.mjs +294 -0
package/dist/memory-Dj0J1v88.mjs.map +1 -0
package/dist/moonshine-stt-17dpP1kr.mjs +4 -0
package/dist/moonshine-stt-4ojLtMq7.mjs +11962 -0
package/dist/moonshine-stt-4ojLtMq7.mjs.map +1 -0
package/dist/{one-liner-s-lD8rCC.mjs → one-liner-JhdIPxzF.mjs} +14 -16
package/dist/one-liner-JhdIPxzF.mjs.map +1 -0
package/dist/repl-BDRkwPGX.mjs +9 -0
package/dist/skills/index.d.mts +270 -320
package/dist/skills/index.d.mts.map +1 -1
package/dist/skills/index.mjs +5 -5
package/dist/{skills-CD3Orlex.mjs → skills-CU694Dc8.mjs} +187 -32
package/dist/skills-CU694Dc8.mjs.map +1 -0
package/dist/{tools-Bi1P7Xoy.mjs → tools-DQ1mPUw5.mjs} +34 -22
package/dist/tools-DQ1mPUw5.mjs.map +1 -0
package/dist/types-DQBe2lFo.d.mts +165 -0
package/dist/types-DQBe2lFo.d.mts.map +1 -0
package/dist/{types-CiTc7ez3.d.mts → types-LlyYILII.d.mts} +112 -14
package/dist/types-LlyYILII.d.mts.map +1 -0
package/dist/{utils-CZBZ8dgR.mjs → utils-DKO55ZmZ.mjs} +1 -1
package/dist/{utils-CZBZ8dgR.mjs.map → utils-DKO55ZmZ.mjs.map} +1 -1
package/dist/vector-B0panuy6.mjs +95 -0
package/dist/vector-B0panuy6.mjs.map +1 -0
package/docs/PROJECT-STATE.md +321 -0
package/docs/adding-a-model-family.md +280 -0
package/docs/ai-sdk.md +70 -61
package/docs/architecture/overview.md +17 -7
package/docs/browser.md +203 -8
package/docs/embeddings.md +156 -0
package/docs/gerbil-site-native-migration.md +217 -0
package/docs/gpu-engine/architectures.md +398 -0
package/docs/gpu-engine/ir.md +372 -0
package/docs/gpu-engine/kernels.md +718 -0
package/docs/gpu-engine/paper.html +1759 -0
package/docs/gpu-engine/paper.md +2109 -0
package/docs/gpu-engine/safetensors.md +312 -0
package/docs/gpu-engine/tokenizer.md +302 -0
package/docs/memory-rag.md +91 -0
package/docs/metal-safari-intel.md +190 -0
package/docs/mobile-failure-diagnosis.md +124 -0
package/docs/mobile.md +99 -0
package/docs/observability.md +230 -0
package/docs/onnx-removal-plan.md +339 -0
package/docs/research/autoresearch-portable.md +904 -0
package/docs/research/dispatch-reduction-hivemind.md +84 -0
package/docs/research/ios-safari-model-caching.md +117 -0
package/docs/research/mobile-webgpu-speed-fusion.md +135 -0
package/docs/research/native-stt-model-selection.md +49 -0
package/docs/research/native-tts-model-selection.md +90 -0
package/docs/research/native-vs-chromium-decision.md +152 -0
package/docs/research/nemotron-mamba2-inference.md +910 -0
package/docs/research/qwen35-multimodal.md +293 -0
package/docs/research/qwen36-gemma4-targets.md +337 -0
package/docs/research/sota-embedding-models.md +179 -0
package/docs/research/sota-mobile-models-2026.md +263 -0
package/docs/research/sota-modality-models.md +202 -0
package/docs/research/tps-baselines.md +71 -0
package/docs/research/webgpu-m4-reference.md +104 -0
package/docs/site-update-plan.md +155 -0
package/docs/structured-output.md +123 -0
package/docs/stt.md +63 -446
package/docs/tts.md +77 -499
package/docs/vision.md +100 -338
package/package.json +22 -7
package/dist/chrome-backend-CORwaIyC.mjs +0 -1212
package/dist/chrome-backend-CORwaIyC.mjs.map +0 -1
package/dist/chrome-backend-DIKYoWj-.mjs +0 -3
package/dist/gerbil-CJ3ifloF.mjs +0 -4
package/dist/gerbil-Dw4Qj77e.mjs +0 -1631
package/dist/gerbil-Dw4Qj77e.mjs.map +0 -1
package/dist/gerbil-qOTe1nl2.d.mts +0 -431
package/dist/gerbil-qOTe1nl2.d.mts.map +0 -1
package/dist/kokoro-BNTb6egA.mjs +0 -20210
package/dist/kokoro-BNTb6egA.mjs.map +0 -1
package/dist/kokoro-CMOGDSgT.js +0 -20212
package/dist/kokoro-CMOGDSgT.js.map +0 -1
package/dist/mcp-BvbriaBy.mjs.map +0 -1
package/dist/one-liner-s-lD8rCC.mjs.map +0 -1
package/dist/repl-DveXw36T.mjs +0 -9
package/dist/skills-CD3Orlex.mjs.map +0 -1
package/dist/stt-Bu-E23Sc.js +0 -433
package/dist/stt-Bu-E23Sc.js.map +0 -1
package/dist/stt-CpLYbGFd.mjs +0 -433
package/dist/stt-CpLYbGFd.mjs.map +0 -1
package/dist/stt-DRPLEEHB.mjs +0 -3
package/dist/tools-Bi1P7Xoy.mjs.map +0 -1
package/dist/transformers.web-DiD1gTwk.js +0 -44695
package/dist/transformers.web-DiD1gTwk.js.map +0 -1
package/dist/transformers.web-u34VxRFM.js +0 -3
package/dist/tts-CqroPaSK.js +0 -724
package/dist/tts-CqroPaSK.js.map +0 -1
package/dist/tts-DXgsKGCe.mjs +0 -3
package/dist/tts-DeGANMNV.mjs +0 -730
package/dist/tts-DeGANMNV.mjs.map +0 -1
package/dist/types-CiTc7ez3.d.mts.map +0 -1
/package/dist/{auto-update-S9s5-g0C.mjs → auto-update-BVaLXcDE.mjs} +0 -0
/package/dist/{chunk-CkXuGtQK.mjs → chunk-B9cbKln6.mjs} +0 -0
/package/dist/{microphone-DaMZFRuR.mjs → microphone-Bqmoz9_K.mjs} +0 -0

package/docs/gpu-engine/kernels.md ADDED Viewed

@@ -0,0 +1,718 @@
+# WGSL Kernel Reference
+Complete reference for all 12 WGSL compute shaders in `src/gpu/kernels/wgsl/`. Each section covers the algorithm, binding layout, uniform struct, dispatch formula, example dispatches, and optimization opportunities.
+---
+## Table of Contents
+1. [Embedding](#1-embedding)
+2. [MatMul (f32)](#2-matmul-f32)
+3. [MatMulInt4](#3-matmulint4)
+4. [RMSNorm](#4-rmsnorm)
+5. [LayerNorm](#5-layernorm)
+6. [RoPE](#6-rope)
+7. [Attention](#7-attention)
+8. [Softmax](#8-softmax)
+9. [SiLU](#9-silu)
+10. [GELU](#10-gelu)
+11. [Add](#11-add)
+12. [Mul](#12-mul)
+13. [Micro-Benchmarking Approach](#13-micro-benchmarking-approach)
+---
+## 1. Embedding
+**File:** `embedding.wgsl`
+**Computes:** `output[t, d] = weight[input_ids[t], d]` for all positions `t` and dimensions `d`.
+**Algorithm:** Flat parallel gather. Each thread handles one element of the output matrix. Thread `idx` maps to position `(idx / hidden_size, idx % hidden_size)`, looks up the token ID at that position, and copies one element from the corresponding row of the embedding weight matrix.
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `input_ids` | `array<u32>` | read | `[T]` |
+| 1 | `weight` | `array<f32>` | read | `[vocab_size, hidden_size]` (row-major) |
+| 2 | `output` | `array<f32>` | read_write | `[T, hidden_size]` |
+| 3 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  seq_len: u32,       // T
+  hidden_size: u32,   // D
+}
+```
+### Dispatch Formula
+```
+workgroups_x = ceil(T * hidden_size / 256)
+workgroups_y = 1
+workgroups_z = 1
+```
+### Example Dispatch
+| T | hidden_size | Total elements | Workgroups |
+|---|-------------|---------------|------------|
+| 1 | 896 | 896 | (4, 1, 1) |
+| 50 | 896 | 44,800 | (175, 1, 1) |
+| 2048 | 896 | 1,835,008 | (7,168, 1, 1) |
+### Optimization Notes
+- Could use `vec4<f32>` loads to process 4 elements per thread (4x fewer threads)
+- Weight matrix could be stored as f16 and converted on read for 2x bandwidth savings
+- For very large vocabularies (150K+), the weight matrix itself is the bottleneck; consider caching hot rows in shared memory
+---
+## 2. MatMul (f32)
+**File:** `matmul.wgsl`
+**Computes:** `C[M, N] = A[M, K] * B[K, N]` (row-major matrices)
+**Algorithm:** Classic 16x16 tiled matrix multiply with shared memory. Each workgroup computes a 16x16 tile of the output. For each tile of the K dimension, threads cooperatively load a 16x16 tile of A and a 16x16 tile of B into shared memory, synchronize, then compute partial dot products.
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `A` | `array<f32>` | read | `[M, K]` |
+| 1 | `B` | `array<f32>` | read | `[K, N]` |
+| 2 | `C` | `array<f32>` | read_write | `[M, N]` |
+| 3 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  M: u32,
+  K: u32,
+  N: u32,
+}
+```
+### Dispatch Formula
+```
+workgroups_x = ceil(N / 16)
+workgroups_y = ceil(M / 16)
+workgroups_z = 1
+```
+### Example Dispatch
+| Operation | M | K | N | Workgroups |
+|-----------|---|---|---|------------|
+| Q projection (T=1) | 1 | 896 | 896 | (56, 1, 1) |
+| Q projection (T=50) | 50 | 896 | 896 | (56, 4, 1) |
+| Gate proj (T=1) | 1 | 896 | 4864 | (304, 1, 1) |
+| Down proj (T=1) | 1 | 4864 | 896 | (56, 1, 1) |
+| LM head (T=1) | 1 | 896 | 151936 | (9,496, 1, 1) |
+### Optimization Notes
+- **Register blocking**: Process 4x4 or 8x8 output elements per thread to improve arithmetic intensity
+- **Vectorized loads**: Use `vec4<f32>` for 128-bit loads from shared memory
+- **Double buffering**: Overlap tile loading with computation by using two shared memory tiles
+- **f16 accumulation**: When `shader-f16` is available, load weights as f16 and accumulate in f32 for 2x bandwidth
+- **Batched matmul**: Fuse multiple small matmuls (Q, K, V projections) into one kernel
+---
+## 3. MatMulInt4
+**File:** `matmul_int4.wgsl`
+**Computes:** `C[M, N] = A[M, K] * dequant(B_q[K, N])` where B_q is packed INT4 with per-group scales and zeros.
+**Algorithm:** Each thread computes one element of C. For each element, it iterates over the K dimension, dequantizing B on-the-fly: extract a 4-bit nibble from the packed u32, apply `(nibble - zero) * scale` using per-group parameters, then multiply-accumulate with the corresponding A element.
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `A` | `array<f32>` | read | `[M, K]` (activations) |
+| 1 | `B_q` | `array<u32>` | read | Packed INT4 `[K, N]`, 8 values per u32 |
+| 2 | `scales` | `array<f32>` | read | One per group |
+| 3 | `zeros` | `array<f32>` | read | One per group |
+| 4 | `C` | `array<f32>` | read_write | `[M, N]` |
+| 5 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  M: u32,
+  K: u32,
+  N: u32,
+  group_size: u32,    // typically 32 or 128
+}
+```
+### INT4 Packing
+Each `u32` in `B_q` stores 8 INT4 values. The `extract_nibble` function extracts a single 4-bit value:
+```wgsl
+fn extract_nibble(packed: u32, pos: u32) -> f32 {
+  let shift = pos * 4u;
+  let nibble = (packed >> shift) & 0xFu;
+  return f32(nibble);
+}
+```
+### Dequantization
+For element at flat index `flat_idx` in B:
+```
+group_idx = flat_idx / group_size
+value = (nibble - zeros[group_idx]) * scales[group_idx]
+```
+### Dispatch Formula
+```
+workgroups_x = ceil(N / 16)
+workgroups_y = ceil(M / 16)
+workgroups_z = 1
+```
+### Optimization Notes
+- **This kernel is not tiled**: Each thread iterates over the full K dimension, resulting in poor data reuse. A tiled version with shared memory (similar to `matmul.wgsl`) would dramatically improve performance.
+- **Vectorized nibble extraction**: Process 8 nibbles from one u32 load simultaneously
+- **Pre-dequantize tiles**: Load a tile of B_q into shared memory, dequantize in parallel, then do the tiled multiply
+- This kernel is defined but **not yet wired** to graph generators. It will be used when GPTQ/AWQ quantized models are supported.
+---
+## 4. RMSNorm
+**File:** `rmsnorm.wgsl`
+**Computes:** `output[t, d] = (input[t, d] / rms(input[t, :])) * weight[d]` where `rms(x) = sqrt(mean(x^2) + eps)`
+**Algorithm:** One workgroup per row (token position). Each thread accumulates squared values for a subset of the hidden dimension using a strided pattern (thread `tid` processes elements `tid, tid+256, tid+512, ...`). A tree reduction in shared memory computes the total sum of squares. Then each thread normalizes and scales its assigned elements.
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `input` | `array<f32>` | read | `[T, hidden_size]` |
+| 1 | `weight` | `array<f32>` | read | `[hidden_size]` |
+| 2 | `output` | `array<f32>` | read_write | `[T, hidden_size]` |
+| 3 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  seq_len: u32,       // T
+  hidden_size: u32,   // D
+  eps_bits: u32,      // f32 epsilon reinterpreted as u32 (for uniform alignment)
+  _pad: u32,          // padding to 16-byte alignment
+}
+```
+Note: `eps` is passed as `bitcast<u32>(eps_f32)` because WebGPU uniform buffers require 16-byte alignment and mixing f32/u32 in a struct can cause alignment issues. The shader uses `bitcast<f32>(params.eps_bits)` to recover the float value.
+### Dispatch Formula
+```
+workgroups_x = T     // one workgroup per row
+workgroups_y = 1
+workgroups_z = 1
+```
+### Example Dispatch
+| T | hidden_size | Workgroups |
+|---|-------------|------------|
+| 1 | 896 | (1, 1, 1) |
+| 50 | 896 | (50, 1, 1) |
+| 2048 | 896 | (2048, 1, 1) |
+### Optimization Notes
+- For hidden_size <= 256, the tree reduction is oversized (many idle threads)
+- Could use `vec4<f32>` accumulation for 4x fewer loop iterations
+- Fusing RMSNorm with the following MatMul would save one global memory round-trip
+---
+## 5. LayerNorm
+**File:** `layernorm.wgsl`
+**Computes:** `output[t, d] = ((input[t, d] - mean) / sqrt(variance + eps)) * weight[d] + bias[d]`
+**Algorithm:** Two-pass reduction per row. First pass: tree reduction to compute mean. Second pass: tree reduction to compute variance (sum of squared differences from mean). Then each thread normalizes, scales, and shifts its assigned elements.
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `input` | `array<f32>` | read | `[T, hidden_size]` |
+| 1 | `weight` | `array<f32>` | read | `[hidden_size]` |
+| 2 | `bias` | `array<f32>` | read | `[hidden_size]` |
+| 3 | `output` | `array<f32>` | read_write | `[T, hidden_size]` |
+| 4 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  seq_len: u32,
+  hidden_size: u32,
+  eps_bits: u32,
+  _pad: u32,
+}
+```
+### Dispatch Formula
+```
+workgroups_x = T
+workgroups_y = 1
+workgroups_z = 1
+```
+### Optimization Notes
+- Uses two separate shared memory arrays (`shared_sum` and `shared_sq_sum`); could use a single-pass Welford algorithm to compute both mean and variance simultaneously
+- Same vectorization and fusion opportunities as RMSNorm
+---
+## 6. RoPE
+**File:** `rope.wgsl`
+**Computes:** For each head, rotates pairs of dimensions by position-dependent angles:
+```
+q_out[2i]   = q[2i] * cos(theta) - q[2i+1] * sin(theta)
+q_out[2i+1] = q[2i] * sin(theta) + q[2i+1] * cos(theta)
+```
+where `theta = pos * base^(-2i/dim)`.
+**Algorithm:** Each thread handles one pair of dimensions for one head at one position. Handles Q and K separately since they may have different head counts (GQA). The kernel supports a `position_offset` for decode steps where the query is at a later position in the sequence.
+**Note:** This kernel operates **in-place** -- Q and K are both input and output buffers (`read_write`).
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `q` | `array<f32>` | read_write | `[T, num_q_heads * head_dim]` |
+| 1 | `k` | `array<f32>` | read_write | `[T, num_kv_heads * head_dim]` |
+| 2 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  seq_len: u32,
+  num_q_heads: u32,
+  num_kv_heads: u32,
+  head_dim: u32,
+  rope_base_bits: u32,     // f32 base reinterpreted as u32
+  position_offset: u32,    // starting position for decode step
+}
+```
+### Dispatch Formula
+```
+total_pairs = T * max(num_q_heads, num_kv_heads) * (head_dim / 2)
+workgroups_x = ceil(total_pairs / 256)
+workgroups_y = 1
+workgroups_z = 1
+```
+### Example Dispatch
+| T | num_q_heads | num_kv_heads | head_dim | Total pairs | Workgroups |
+|---|-------------|-------------|----------|-------------|------------|
+| 1 | 14 | 2 | 64 | 448 | (2, 1, 1) |
+| 50 | 14 | 2 | 64 | 22,400 | (88, 1, 1) |
+### Optimization Notes
+- Q and K are processed by the same dispatch; with GQA (num_q_heads >> num_kv_heads), the K processing finishes much earlier while Q threads are still running
+- Frequency computation (`pow(base, ...)`) could be pre-computed into a lookup table in shared memory
+- The sin/cos computation could use `sincos()` if available, though WGSL doesn't have it natively
+---
+## 7. Attention
+**File:** `attention.wgsl`
+**Computes:** Scaled dot-product attention with causal masking and GQA:
+```
+scores = (Q @ K^T) / sqrt(head_dim)
+scores = causal_mask(scores)
+weights = softmax(scores)
+output = weights @ V
+```
+**Algorithm:** Each workgroup handles one (query_position, q_head) pair. **Correctness-first implementation**: thread 0 performs the full computation (dot products, max-finding, softmax, V accumulation) while other threads are idle. This is explicitly marked as a TODO for optimization.
+GQA is handled by mapping each Q head to its corresponding KV head: `kv_head = q_head / (num_q_heads / num_kv_heads)`.
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `Q` | `array<f32>` | read | `[T, num_q_heads * head_dim]` |
+| 1 | `K` | `array<f32>` | read | `[S, num_kv_heads * head_dim]` |
+| 2 | `V` | `array<f32>` | read | `[S, num_kv_heads * head_dim]` |
+| 3 | `output` | `array<f32>` | read_write | `[T, num_q_heads * head_dim]` |
+| 4 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  T: u32,               // query seq len (prompt during prefill, 1 during decode)
+  S: u32,               // key/value seq len (total including cache)
+  num_q_heads: u32,
+  num_kv_heads: u32,
+  head_dim: u32,
+  position_offset: u32, // start position of Q in full sequence
+}
+```
+### Dispatch Formula
+```
+workgroups_x = T              // one per query position
+workgroups_y = num_q_heads    // one per Q head
+workgroups_z = 1
+```
+### Example Dispatch
+| T | S | num_q_heads | Workgroups |
+|---|---|-------------|------------|
+| 1 (decode) | 100 | 14 | (1, 14, 1) |
+| 50 (prefill) | 50 | 14 | (50, 14, 1) |
+### Optimization Notes
+This is the **highest priority kernel for optimization**:
+- **Current**: Only thread 0 works; 255 threads per workgroup are wasted
+- **Parallel dot products**: Each thread computes one dimension of the Q@K dot product, followed by a shared memory reduction
+- **Tiled attention**: Process S positions in tiles, with online softmax (FlashAttention-style) to avoid materializing the full attention matrix
+- **Shared memory K/V**: Load K/V tiles into shared memory for data reuse across dot products
+- **Multi-query batching**: During prefill, process multiple query positions per workgroup
+---
+## 8. Softmax
+**File:** `softmax.wgsl`
+**Computes:** Row-wise softmax: `output[i] = exp(input[i] - max) / sum(exp(input[:] - max))`
+**Algorithm:** Three-pass parallel reduction, one workgroup per row:
+1. **Pass 1 (max)**: Each thread finds the max of its assigned elements; tree reduction to get row max
+2. **Pass 2 (sum)**: Each thread computes `sum(exp(x - max))` for its elements; tree reduction to get total sum
+3. **Pass 3 (normalize)**: Each thread computes `exp(x - max) / total_sum` for its elements
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `input` | `array<f32>` | read | `[num_rows, row_size]` |
+| 1 | `output` | `array<f32>` | read_write | `[num_rows, row_size]` |
+| 2 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  num_rows: u32,
+  row_size: u32,
+}
+```
+### Dispatch Formula
+```
+workgroups_x = num_rows
+workgroups_y = 1
+workgroups_z = 1
+```
+### Optimization Notes
+- Pass 3 redundantly recomputes `exp(x - max)` -- could store intermediate exp values in a buffer or fuse passes 2 and 3 using online normalization
+- For small row sizes (< 256), many threads are idle
+- Currently a standalone kernel; in practice, softmax is most commonly needed inside the attention kernel, so fusing them eliminates a global memory round-trip
+---
+## 9. SiLU
+**File:** `silu.wgsl`
+**Computes:** `output[i] = input[i] / (1 + exp(-input[i]))` (equivalent to `x * sigmoid(x)`)
+**Algorithm:** Pure element-wise. Each thread processes one element.
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `input` | `array<f32>` | read | `[count]` |
+| 1 | `output` | `array<f32>` | read_write | `[count]` |
+| 2 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  count: u32,
+}
+```
+### Dispatch Formula
+```
+workgroups_x = ceil(count / 256)
+workgroups_y = 1
+workgroups_z = 1
+```
+### Example Dispatch
+| T | intermediate_size | count | Workgroups |
+|---|-------------------|-------|------------|
+| 1 | 4864 | 4,864 | (19, 1, 1) |
+| 50 | 4864 | 243,200 | (950, 1, 1) |
+### Optimization Notes
+- Could process `vec4<f32>` per thread for 4x fewer threads
+- Could be fused with the following Mul (SwiGLU combine): `output = silu(gate) * up`
+---
+## 10. GELU
+**File:** `gelu.wgsl`
+**Computes:** Approximate GELU: `output = 0.5 * x * (1 + tanh(sqrt(2/pi) * (x + 0.044715 * x^3)))`
+**Algorithm:** Pure element-wise. Uses the tanh approximation, which is the standard for transformer models.
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `input` | `array<f32>` | read | `[count]` |
+| 1 | `output` | `array<f32>` | read_write | `[count]` |
+| 2 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  count: u32,
+}
+```
+### Dispatch Formula
+```
+workgroups_x = ceil(count / 256)
+```
+### Constants
+```wgsl
+const SQRT_2_OVER_PI: f32 = 0.7978845608;
+const GELU_COEFF: f32 = 0.044715;
+```
+### Optimization Notes
+- Same vectorization opportunity as SiLU
+- Used by Phi and GPT-2 style models (not Qwen/LLaMA which use SiLU)
+---
+## 11. Add
+**File:** `add.wgsl`
+**Computes:** `output[i] = a[i] + b[i]`
+**Algorithm:** Pure element-wise. Used for residual connections.
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `a` | `array<f32>` | read | `[count]` |
+| 1 | `b` | `array<f32>` | read | `[count]` |
+| 2 | `output` | `array<f32>` | read_write | `[count]` |
+| 3 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  count: u32,
+}
+```
+### Dispatch Formula
+```
+workgroups_x = ceil(count / 256)
+```
+### Optimization Notes
+- Could use `vec4<f32>` for 128-bit operations
+- Residual add could be fused with the preceding kernel (e.g., output projection writes directly to `resid + output`)
+---
+## 12. Mul
+**File:** `mul.wgsl`
+**Computes:** `output[i] = a[i] * b[i]`
+**Algorithm:** Pure element-wise. Used for the SwiGLU combine step (`silu(gate) * up`).
+### Binding Layout
+| Binding | Name | Type | Access | Shape |
+|---------|------|------|--------|-------|
+| 0 | `a` | `array<f32>` | read | `[count]` |
+| 1 | `b` | `array<f32>` | read | `[count]` |
+| 2 | `output` | `array<f32>` | read_write | `[count]` |
+| 3 | `params` | uniform | - | `Params` struct |
+### Uniform Struct
+```wgsl
+struct Params {
+  count: u32,
+}
+```
+### Dispatch Formula
+```
+workgroups_x = ceil(count / 256)
+```
+### Optimization Notes
+- Same vectorization and fusion opportunities as Add
+- Could be fused with preceding SiLU: single kernel computes `silu(a[i]) * b[i]`
+---
+## 13. Micro-Benchmarking Approach
+To measure individual kernel performance, use this pattern:
+### Test Harness
+```typescript
+import { initGPU, createStorageBuffer, createUniformBuffer, getOrCreatePipeline, createBindGroup } from "./device.js";
+async function benchmarkMatmul(M: number, K: number, N: number, iterations: number = 100) {
+  const ctx = await initGPU();
+  // Allocate buffers with random data
+  const A = createStorageBuffer(ctx, "A", M * K * 4, randomF32(M * K));
+  const B = createStorageBuffer(ctx, "B", K * N * 4, randomF32(K * N));
+  const C = createStorageBuffer(ctx, "C", M * N * 4);
+  const params = createUniformBuffer(ctx, "params",
+    new Uint32Array([M, K, N]).buffer);
+  const pipeline = getOrCreatePipeline(ctx, "matmul", matmulWGSL, "main");
+  const bindGroup = createBindGroup(ctx, pipeline,
+    [{ buffer: A }, { buffer: B }, { buffer: C }, { buffer: params }]);
+  const wgX = Math.ceil(N / 16);
+  const wgY = Math.ceil(M / 16);
+  // Warmup
+  for (let i = 0; i < 10; i++) {
+    const enc = ctx.device.createCommandEncoder();
+    const pass = enc.beginComputePass();
+    pass.setPipeline(pipeline);
+    pass.setBindGroup(0, bindGroup);
+    pass.dispatchWorkgroups(wgX, wgY, 1);
+    pass.end();
+    ctx.device.queue.submit([enc.finish()]);
+  }
+  await ctx.device.queue.onSubmittedWorkDone();
+  // Benchmark
+  const start = performance.now();
+  for (let i = 0; i < iterations; i++) {
+    const enc = ctx.device.createCommandEncoder();
+    const pass = enc.beginComputePass();
+    pass.setPipeline(pipeline);
+    pass.setBindGroup(0, bindGroup);
+    pass.dispatchWorkgroups(wgX, wgY, 1);
+    pass.end();
+    ctx.device.queue.submit([enc.finish()]);
+  }
+  await ctx.device.queue.onSubmittedWorkDone();
+  const elapsed = performance.now() - start;
+  // Compute metrics
+  const flops = 2 * M * K * N; // multiply-add = 2 ops
+  const totalFlops = flops * iterations;
+  const gflops = totalFlops / (elapsed * 1e6); // elapsed is in ms
+  console.log(`MatMul ${M}x${K}x${N}:`);
+  console.log(`  ${iterations} iterations in ${elapsed.toFixed(1)} ms`);
+  console.log(`  ${(elapsed / iterations).toFixed(3)} ms/iter`);
+  console.log(`  ${gflops.toFixed(1)} GFLOPS`);
+  A.destroy(); B.destroy(); C.destroy(); params.destroy();
+}
+```
+### Expected Performance Targets
+For reference, approximate theoretical peak f32 throughput (in TFLOPS) for common GPUs:
+| GPU | f32 TFLOPS | Expected matmul efficiency |
+|-----|-----------|---------------------------|
+| Apple M1 (8-core) | ~2.6 | 30-50% with 16x16 tiling |
+| Apple M2 (10-core) | ~3.6 | 30-50% |
+| NVIDIA RTX 3060 | ~12.7 | 40-60% |
+| Intel Arc A770 | ~17.2 | 30-50% |
+The 16x16 tiling strategy without register blocking typically achieves 30-50% of peak throughput. Larger tiles (32x32, 64x64) with register blocking can push this to 70-80%.
+### Key Metrics Per Kernel
+| Kernel | Metric | Formula |
+|--------|--------|---------|
+| MatMul | GFLOPS | `2 * M * K * N / (time_ms * 1e6)` |
+| Embedding | GB/s | `T * hidden_size * 4 / (time_ms * 1e6)` |
+| RMSNorm | GB/s | `T * hidden_size * 4 * 3 / (time_ms * 1e6)` (read input + weight + write output) |
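+| Attention | GFLOPS | `4 * T * S * head_dim * num_heads / (time_ms * 1e6)` (counts both `Q @ K^T` and `weights @ V`; `num_heads` is the number of Q heads; ignores causal-mask savings) |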
+| Element-wise | GB/s | `count * 4 * (num_inputs + 1) / (time_ms * 1e6)` |