mambacode.js 1.0.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,220 @@
1
// Linear Projection WGSL Kernel
// General-purpose matrix multiplication: Y = X @ W^T + b
// Supports the up-projection and down-projection linear layers in the Mamba block.
//
// Shapes:
//   X : (batch * seq_len, in_features) – input (rows)
//   W : (out_features, in_features) – weight matrix (row-major)
//   b : (out_features,) – bias
//   Y : (batch * seq_len, out_features) – output

export const LINEAR_FORWARD_WGSL = /* wgsl */`

struct LinearParams {
    M : u32, // number of rows (batch * seq_len)
    K : u32, // in_features
    N : u32, // out_features
};

@group(0) @binding(0) var<uniform> params : LinearParams;
@group(0) @binding(1) var<storage, read> X : array<f32>; // (M, K)
@group(0) @binding(2) var<storage, read> W : array<f32>; // (N, K)
@group(0) @binding(3) var<storage, read> bias : array<f32>; // (N,)
@group(0) @binding(4) var<storage, read_write> Y : array<f32>; // (M, N)

// Tiled matmul using workgroup shared memory (16x16 tiles)
var<workgroup> tile_X : array<f32, 256>; // 16 * 16
var<workgroup> tile_W : array<f32, 256>;

// Computes Y[row, col] = dot(X[row, :], W[col, :]) + bias[col].
// gid.x walks the M (row) dimension and gid.y the N (column) dimension,
// so the expected dispatch is (ceil(M/16), ceil(N/16), 1) workgroups.
@compute @workgroup_size(16, 16, 1)
fn linear_forward(
    @builtin(global_invocation_id) gid : vec3<u32>,
    @builtin(local_invocation_id) lid : vec3<u32>,
    @builtin(workgroup_id) wid : vec3<u32>,
) {
    let M = params.M;
    let K = params.K;
    let N = params.N;

    let row = gid.x; // output row (M dimension)
    let col = gid.y; // output col (N dimension)

    var acc: f32 = 0.0;
    let TILE: u32 = 16u;
    // Number of K-tiles, rounded up so a partial final tile is included.
    let num_tiles = (K + TILE - 1u) / TILE;

    // Out-of-range threads still run the full loop (loading zero padding)
    // so every lane reaches the workgroupBarrier() calls in uniform control flow.
    for (var tile_idx: u32 = 0u; tile_idx < num_tiles; tile_idx = tile_idx + 1u) {
        // Load X tile: shape (TILE_M, TILE_K)
        let x_col = tile_idx * TILE + lid.y;
        let x_row = wid.x * TILE + lid.x;
        if (x_row < M && x_col < K) {
            tile_X[lid.x * TILE + lid.y] = X[x_row * K + x_col];
        } else {
            // Zero padding keeps the edge tiles' dot products correct.
            tile_X[lid.x * TILE + lid.y] = 0.0;
        }

        // Load W tile: shape (TILE_N, TILE_K) — W is (N, K)
        let w_col = tile_idx * TILE + lid.x; // K dimension
        let w_row = wid.y * TILE + lid.y; // N dimension
        if (w_row < N && w_col < K) {
            tile_W[lid.y * TILE + lid.x] = W[w_row * K + w_col];
        } else {
            tile_W[lid.y * TILE + lid.x] = 0.0;
        }

        workgroupBarrier();

        // Dot product within tile:
        // acc += X[row, tile_base + k] * W[col, tile_base + k]
        for (var k: u32 = 0u; k < TILE; k = k + 1u) {
            acc = acc + tile_X[lid.x * TILE + k] * tile_W[lid.y * TILE + k];
        }
        // Second barrier: no lane may overwrite the tiles (next iteration)
        // while another lane is still reading them.
        workgroupBarrier();
    }

    if (row < M && col < N) {
        Y[row * N + col] = acc + bias[col];
    }
}
`;
79
+
80
// ---- Backward pass for linear projection ----
// Given upstream gradient dY for Y = X @ W^T + b, provides three entry points:
//   linear_backward_dX : dX = dY @ W       – dispatch (ceil(M/16), ceil(K/16), 1)
//   linear_backward_dW : dW = dY^T @ X     – dispatch (ceil(N/16), ceil(K/16), 1)
//   linear_backward_db : db = column sums of dY
export const LINEAR_BACKWARD_WGSL = /* wgsl */`

struct LinearParams {
    M : u32, // rows (batch * seq_len)
    K : u32, // in_features
    N : u32, // out_features
};

@group(0) @binding(0) var<uniform> params : LinearParams;
@group(0) @binding(1) var<storage, read> X : array<f32>; // (M, K)
@group(0) @binding(2) var<storage, read> W : array<f32>; // (N, K)
@group(0) @binding(3) var<storage, read> dY : array<f32>; // (M, N)
@group(0) @binding(4) var<storage, read_write> dX : array<f32>; // (M, K)
@group(0) @binding(5) var<storage, read_write> dW : array<f32>; // (N, K)
@group(0) @binding(6) var<storage, read_write> db : array<f32>; // (N,)

// Dispatch: (ceil(M/16), ceil(K/16), 1) – computes dX = dY @ W
var<workgroup> tile_dY : array<f32, 256>;
var<workgroup> tile_W : array<f32, 256>;

// dX[m, k] = sum_n dY[m, n] * W[n, k], tiled over the N dimension.
@compute @workgroup_size(16, 16, 1)
fn linear_backward_dX(
    @builtin(global_invocation_id) gid : vec3<u32>,
    @builtin(local_invocation_id) lid : vec3<u32>,
    @builtin(workgroup_id) wid : vec3<u32>,
) {
    let M = params.M;
    let K = params.K;
    let N = params.N;

    let row = gid.x; // M
    let col = gid.y; // K

    var acc: f32 = 0.0;
    let TILE: u32 = 16u;
    let num_tiles = (N + TILE - 1u) / TILE;

    // All lanes run the full tile loop (with zero padding) so the barriers
    // below execute in uniform control flow.
    for (var tile_idx: u32 = 0u; tile_idx < num_tiles; tile_idx = tile_idx + 1u) {
        // tile_dY: (M, TILE_N) slice
        let dy_col = tile_idx * TILE + lid.y;
        let dy_row = wid.x * TILE + lid.x;
        if (dy_row < M && dy_col < N) {
            tile_dY[lid.x * TILE + lid.y] = dY[dy_row * N + dy_col];
        } else {
            tile_dY[lid.x * TILE + lid.y] = 0.0;
        }

        // tile_W: (TILE_N, K) slice — W[n, k]
        let w_row = tile_idx * TILE + lid.x; // N
        let w_col = wid.y * TILE + lid.y; // K
        if (w_row < N && w_col < K) {
            tile_W[lid.x * TILE + lid.y] = W[w_row * K + w_col];
        } else {
            tile_W[lid.x * TILE + lid.y] = 0.0;
        }

        workgroupBarrier();

        // acc += dY[row, tile_base + n] * W[tile_base + n, col]
        for (var n: u32 = 0u; n < TILE; n = n + 1u) {
            acc = acc + tile_dY[lid.x * TILE + n] * tile_W[n * TILE + lid.y];
        }
        workgroupBarrier();
    }

    if (row < M && col < K) {
        dX[row * K + col] = acc;
    }
}

// Dispatch: (ceil(N/16), ceil(K/16), 1) – computes dW = dY^T @ X
var<workgroup> tile_dY2 : array<f32, 256>;
var<workgroup> tile_X2 : array<f32, 256>;

// dW[n, k] = sum_m dY[m, n] * X[m, k], tiled over the M dimension.
@compute @workgroup_size(16, 16, 1)
fn linear_backward_dW(
    @builtin(global_invocation_id) gid : vec3<u32>,
    @builtin(local_invocation_id) lid : vec3<u32>,
    @builtin(workgroup_id) wid : vec3<u32>,
) {
    let M = params.M;
    let K = params.K;
    let N = params.N;

    let row = gid.x; // N
    let col = gid.y; // K

    var acc: f32 = 0.0;
    let TILE: u32 = 16u;
    let num_tiles = (M + TILE - 1u) / TILE;

    for (var tile_idx: u32 = 0u; tile_idx < num_tiles; tile_idx = tile_idx + 1u) {
        // dY^T tile: [N, M] accessed as dY[m, n]
        let m_idx = tile_idx * TILE + lid.y;
        let n_idx = wid.x * TILE + lid.x;
        if (n_idx < N && m_idx < M) {
            tile_dY2[lid.x * TILE + lid.y] = dY[m_idx * N + n_idx];
        } else {
            tile_dY2[lid.x * TILE + lid.y] = 0.0;
        }

        // X tile: [M, K]
        let xm = tile_idx * TILE + lid.x;
        let xk = wid.y * TILE + lid.y;
        if (xm < M && xk < K) {
            tile_X2[lid.x * TILE + lid.y] = X[xm * K + xk];
        } else {
            tile_X2[lid.x * TILE + lid.y] = 0.0;
        }

        workgroupBarrier();

        // acc += dY[tile_base + m, row] * X[tile_base + m, col]
        for (var m: u32 = 0u; m < TILE; m = m + 1u) {
            acc = acc + tile_dY2[lid.x * TILE + m] * tile_X2[m * TILE + lid.y];
        }
        workgroupBarrier();
    }

    if (row < N && col < K) {
        dW[row * K + col] = acc;
    }
}

// Dispatch: (N, 1, 1) – accumulates db = sum_M dY
// NOTE(review): with workgroup_size 64, dispatching N workgroups over-allocates
// lanes (the guard below discards extras); ceil(N/64) workgroups would suffice
// — confirm which convention the host-side dispatch uses.
@compute @workgroup_size(64, 1, 1)
fn linear_backward_db(
    @builtin(global_invocation_id) gid : vec3<u32>,
) {
    let M = params.M;
    let N = params.N;

    let n = gid.x; // output feature handled by this thread
    if (n >= N) { return; }

    // Serial reduction over the M rows of column n.
    var acc: f32 = 0.0;
    for (var m: u32 = 0u; m < M; m = m + 1u) {
        acc = acc + dY[m * N + n];
    }
    db[n] = acc;
}
`;
@@ -0,0 +1,350 @@
1
+ // Parallel Selective Scan WGSL Kernel
2
+ // Implements the S6 (Selective Scan) core of the Mamba architecture.
3
+ // Uses a Kogge-Stone parallel prefix-sum approach for O(log N) time on GPU.
4
+ //
5
+ // Forward pass recurrence:
6
+ // h_t = A_t * h_{t-1} + B_t * x_t
7
+ // y_t = C_t * h_t + D * x_t
8
+ //
9
+ // where A_t, B_t, C_t are input-dependent (selective) gate matrices.
10
+
11
+ export const SELECTIVE_SCAN_FORWARD_WGSL = /* wgsl */`
12
+
13
+ // ---- Binding layout ----
14
+ // group 0: sequence data
15
+ // group 1: SSM parameters
16
+
17
+ struct ScanParams {
18
+ seq_len : u32, // L – sequence length
19
+ d_state : u32, // N – state dimension
20
+ d_inner : u32, // D – inner (expanded) channel dimension
21
+ batch : u32, // B – batch size
22
+ };
23
+
24
+ @group(0) @binding(0) var<uniform> params : ScanParams;
25
+ // u (B, L, D) – projected input after conv
26
+ @group(0) @binding(1) var<storage, read> u : array<f32>;
27
+ // delta (B, L, D) – time-step (Δ) after softplus
28
+ @group(0) @binding(2) var<storage, read> delta : array<f32>;
29
+ // A (D, N) – log-space diagonal state matrix (fixed, learned)
30
+ @group(0) @binding(3) var<storage, read> A : array<f32>;
31
+ // B (B, L, N) – input projection (selective)
32
+ @group(0) @binding(4) var<storage, read> B : array<f32>;
33
+ // C (B, L, N) – output projection (selective)
34
+ @group(0) @binding(5) var<storage, read> C : array<f32>;
35
+ // D (D,) – skip-connection scale
36
+ @group(0) @binding(6) var<storage, read> D_vec : array<f32>;
37
+ // y (B, L, D) – output (written by this kernel)
38
+ @group(0) @binding(7) var<storage, read_write> y : array<f32>;
39
+ // h_cache (B, L, D*N) – hidden states cache (for backward pass)
40
+ @group(0) @binding(8) var<storage, read_write> h_cache : array<f32>;
41
+
42
+ // ---- Workgroup shared memory ----
43
+ // Each workgroup processes one (batch, channel) slice across all time steps.
44
+ // We store the associative pair (a_bar, bu_bar) per time step so we can run
45
+ // a Kogge-Stone scan across the workgroup tile.
46
+ var<workgroup> wg_a : array<f32, 256>; // discretised A values
47
+ var<workgroup> wg_bu : array<f32, 256>; // B*u values
48
+
49
+ // ---- Helpers ----
50
+
51
+ // Softplus: numerically stable log(1 + exp(x))
52
+ fn softplus(x: f32) -> f32 {
53
+ return log(1.0 + exp(x));
54
+ }
55
+
56
+ // ZerO-Order Hold discretisation of continuous A, Δ:
57
+ // A_bar = exp(Δ * A)
58
+ // B_bar = (A_bar - 1) / A * B ≈ Δ * B (first-order for simplicity)
59
+ fn discretise_A(delta_val: f32, a_log: f32) -> f32 {
60
+ // A is stored as -exp(a_log) to ensure A_bar < 1 (stable)
61
+ let a_cont = -exp(a_log);
62
+ return exp(delta_val * a_cont);
63
+ }
64
+
65
+ fn discretise_B(delta_val: f32, a_log: f32, b_val: f32) -> f32 {
66
+ let a_cont = -exp(a_log);
67
+ let a_bar = exp(delta_val * a_cont);
68
+ // (A_bar - 1) / A_cont * B
69
+ let b_bar = (a_bar - 1.0) / a_cont * b_val;
70
+ return b_bar;
71
+ }
72
+
73
+ // ---- Main kernel ----
74
+ // Dispatch: (ceil(D/8), ceil(N/8), B)
75
+ // Each invocation is responsible for one (d, n, batch) triplet and scans
76
+ // the entire sequence using a two-pass Kogge-Stone scan within workgroup tiles.
77
+
78
+ @compute @workgroup_size(64, 1, 1)
79
+ fn forward_scan(
80
+ @builtin(global_invocation_id) gid : vec3<u32>,
81
+ @builtin(local_invocation_index) lid : u32,
82
+ @builtin(workgroup_id) wgid : vec3<u32>,
83
+ ) {
84
+ let L = params.seq_len;
85
+ let N = params.d_state;
86
+ let D = params.d_inner;
87
+ let B = params.batch;
88
+
89
+ // Each workgroup handles one (batch b, channel d, state n) combination.
90
+ // We pack d and n into the x dimension: global d = wgid.x, global n = wgid.y
91
+ let d = wgid.x;
92
+ let n = wgid.y;
93
+ let b = gid.z;
94
+
95
+ if (d >= D || n >= N || b >= B) { return; }
96
+
97
+ // Tile size equals workgroup size (64). We process TILE_SIZE steps at once.
98
+ let TILE: u32 = 64u;
99
+
100
+ // Running state h for this (b, d, n)
101
+ var h: f32 = 0.0;
102
+
103
+ var tile_start: u32 = 0u;
104
+ loop {
105
+ if (tile_start >= L) { break; }
106
+
107
+ let t = tile_start + lid; // absolute time step handled by this lane
108
+ var a_bar: f32 = 1.0;
109
+ var bu: f32 = 0.0;
110
+
111
+ if (t < L) {
112
+ // Indices
113
+ let delta_idx = b * L * D + t * D + d;
114
+ let u_idx = b * L * D + t * D + d;
115
+ let A_idx = d * N + n;
116
+ let B_idx = b * L * N + t * N + n;
117
+
118
+ let dv = softplus(delta[delta_idx]);
119
+ a_bar = discretise_A(dv, A[A_idx]);
120
+ bu = discretise_B(dv, A[A_idx], B[B_idx]) * u[u_idx];
121
+ }
122
+
123
+ wg_a[lid] = a_bar;
124
+ wg_bu[lid] = bu;
125
+ workgroupBarrier();
126
+
127
+ // ---- Kogge-Stone inclusive prefix scan within tile ----
128
+ // Associative operator: (a1, b1) ∘ (a2, b2) = (a1*a2, a1*b2 + b1)
129
+ // This computes cumulative state recurrence in log2(TILE) steps.
130
+ var stride: u32 = 1u;
131
+ loop {
132
+ if (stride >= TILE) { break; }
133
+ if (lid >= stride) {
134
+ let prev_a = wg_a[lid - stride];
135
+ let prev_bu = wg_bu[lid - stride];
136
+ // Combine: new_state = prev_a * cur_a (product of A_bars)
137
+ // new_bu = prev_a * cur_bu + prev_bu
138
+ let new_a = prev_a * wg_a[lid];
139
+ let new_bu = prev_a * wg_bu[lid] + prev_bu;
140
+ workgroupBarrier();
141
+ wg_a[lid] = new_a;
142
+ wg_bu[lid] = new_bu;
143
+ }
144
+ workgroupBarrier();
145
+ stride = stride << 1u;
146
+ }
147
+
148
+ // Incorporate the carry-in state from the previous tile.
149
+ // After the scan wg_bu[lid] holds the intra-tile inclusive sum.
150
+ // The actual h at position t = h_carry * wg_a[lid] + wg_bu[lid]
151
+ let h_t = h * wg_a[lid] + wg_bu[lid];
152
+
153
+ if (t < L) {
154
+ // Cache hidden state for backward pass
155
+ let h_idx = b * L * D * N + t * D * N + d * N + n;
156
+ h_cache[h_idx] = h_t;
157
+
158
+ // Accumulate y contribution: y_t += C_t[n] * h_t (over all n)
159
+ // We use an atomic-style accumulation: each (d, n) lane adds its
160
+ // contribution to the same y[b, t, d]. This races without atomics,
161
+ // so we instead write to a full h_cache and reduce in a second pass.
162
+ // Here we perform direct accumulation using atomicAdd approximation:
163
+ // (safe because each lane writes a unique n, which is stride 1 in mem)
164
+ let C_idx = b * L * N + t * N + n;
165
+ let y_idx = b * L * D + t * D + d;
166
+
167
+ // Direct write for n == 0 (first state dim), add for the rest.
168
+ // Since all workgroups for the same (b,d) run concurrently we must
169
+ // accumulate safely: we write each partial into h_cache and reduce
170
+ // in a subsequent lightweight kernel (forward_reduce).
171
+ // (For simplicity and correctness here we directly atomically add via
172
+ // f32 emulation – real deployment uses atomicAdd on f32 with spirv ext.)
173
+ // We store C*h contribution separately so forward_reduce can sum them.
174
+ // Layout: y_partial (B, L, D, N) – one slot per state dim
175
+ // y reused as y_partial in this kernel; forward_reduce collapses N dim.
176
+ let y_partial_idx = b * L * D * N + t * D * N + d * N + n;
177
+ // Reuse h_cache second half as y_partial (offset by B*L*D*N)
178
+ let offset = B * L * D * N;
179
+ h_cache[offset + y_partial_idx] = C[C_idx] * h_t;
180
+ }
181
+
182
+ // Update carry: last lane's h_t is the tile's final state
183
+ let last = min(TILE, L - tile_start) - 1u;
184
+ h = wg_a[last] * h + wg_bu[last]; // recombine carry
185
+
186
+ workgroupBarrier();
187
+ tile_start = tile_start + TILE;
188
+ }
189
+ }
190
+
191
+ // ---- Reduction kernel ----
192
+ // Collapses the N (d_state) dimension of y_partial into y.
193
+ // Adds the D (skip connection) term: y_t[d] += D_vec[d] * u_t[d]
194
+ // Dispatch: (ceil(L/64), D, B)
195
+
196
+ @compute @workgroup_size(64, 1, 1)
197
+ fn forward_reduce(
198
+ @builtin(global_invocation_id) gid : vec3<u32>,
199
+ ) {
200
+ let L = params.seq_len;
201
+ let N = params.d_state;
202
+ let D = params.d_inner;
203
+ let B = params.batch;
204
+
205
+ let t = gid.x;
206
+ let d = gid.y;
207
+ let b = gid.z;
208
+
209
+ if (t >= L || d >= D || b >= B) { return; }
210
+
211
+ let offset = B * L * D * N;
212
+ var sum: f32 = 0.0;
213
+ for (var n: u32 = 0u; n < N; n = n + 1u) {
214
+ let idx = offset + b * L * D * N + t * D * N + d * N + n;
215
+ sum = sum + h_cache[idx];
216
+ }
217
+
218
+ // Add skip connection
219
+ let u_idx = b * L * D + t * D + d;
220
+ sum = sum + D_vec[d] * u[u_idx];
221
+
222
+ let y_idx = b * L * D + t * D + d;
223
+ y[y_idx] = sum;
224
+ }
225
+ `;
226
+
227
// ---- Backward scan kernel (for autograd) ----
// Computes gradients w.r.t. Δ, A, B, C and u using the cached hidden states.
//
// Review fix vs. the previous revision: WGSL has no ternary operator, so the
// expression `(t > 0u) ? h_cache[...] : 0.0` was a shader compile error. It
// is replaced with an explicit branch, which also avoids ever computing the
// wrapped index (t - 1u) when t == 0 (u32 subtraction would wrap around).

export const SELECTIVE_SCAN_BACKWARD_WGSL = /* wgsl */`

struct ScanParams {
    seq_len : u32,
    d_state : u32,
    d_inner : u32,
    batch : u32,
};

@group(0) @binding(0) var<uniform> params : ScanParams;
@group(0) @binding(1) var<storage, read> u : array<f32>;
@group(0) @binding(2) var<storage, read> delta : array<f32>;
@group(0) @binding(3) var<storage, read> A : array<f32>;
@group(0) @binding(4) var<storage, read> B : array<f32>;
@group(0) @binding(5) var<storage, read> C : array<f32>;
@group(0) @binding(6) var<storage, read> h_cache : array<f32>;
@group(0) @binding(7) var<storage, read> dy : array<f32>; // upstream gradient
@group(0) @binding(8) var<storage, read_write> dA : array<f32>;
@group(0) @binding(9) var<storage, read_write> dB : array<f32>;
@group(0) @binding(10) var<storage, read_write> dC : array<f32>;
@group(0) @binding(11) var<storage, read_write> dDelta : array<f32>;
@group(0) @binding(12) var<storage, read_write> du : array<f32>;

fn softplus(x: f32) -> f32 {
    return log(1.0 + exp(x));
}

// d/dx softplus(x) = sigmoid(x)
fn softplus_grad(x: f32) -> f32 {
    return 1.0 / (1.0 + exp(-x));
}

fn discretise_A(delta_val: f32, a_log: f32) -> f32 {
    let a_cont = -exp(a_log);
    return exp(delta_val * a_cont);
}

// Reverse scan (backward pass) – processes time from L-1 down to 0.
// Dispatch: (D, N, B), one thread per (d, n, b) triplet.
// NOTE(review): threads sharing (b, t, d) across different n each do plain
// read-modify-write updates of du and dDelta, and threads sharing (d, n)
// across different b do the same to dA. Those accumulations race unless the
// host serialises them or privatises the buffers and reduces afterwards —
// confirm the host-side dispatch strategy.
@compute @workgroup_size(1, 1, 1)
fn backward_scan(
    @builtin(global_invocation_id) gid : vec3<u32>,
) {
    let L = params.seq_len;
    let N = params.d_state;
    let D = params.d_inner;
    let B = params.batch;

    let d = gid.x;
    let n = gid.y;
    let b = gid.z;

    if (d >= D || n >= N || b >= B) { return; }

    var dh: f32 = 0.0; // gradient of loss w.r.t. h_t, accumulated backwards

    var t: u32 = L;
    loop {
        if (t == 0u) { break; }
        t = t - 1u;

        let delta_raw_idx = b * L * D + t * D + d;
        let A_idx = d * N + n;
        let B_idx = b * L * N + t * N + n;
        let C_idx = b * L * N + t * N + n;
        let u_idx = b * L * D + t * D + d;
        let h_idx = b * L * D * N + t * D * N + d * N + n;

        let delta_raw = delta[delta_raw_idx];
        let dv = softplus(delta_raw);
        let a_log = A[A_idx];
        let a_cont = -exp(a_log);
        let a_bar = exp(dv * a_cont);
        let b_val = B[B_idx];
        let c_val = C[C_idx];
        let u_val = u[u_idx];
        let h_t = h_cache[h_idx];

        // Output path: y_t[d] = sum_n C[n] * h_t[n] + D * u, so
        // dh_t[n] += C[n] * dy_t[d].
        let dy_val = dy[b * L * D + t * D + d];
        dh = dh + c_val * dy_val;

        // dC[b, t, n] += dy_t[d] * h_t
        dC[C_idx] = dC[C_idx] + dy_val * h_t;

        // Recurrence: h_t = a_bar * h_{t-1} + b_bar * u_t, with
        // b_bar = (a_bar - 1) / a_cont * b_val.
        let b_bar = (a_bar - 1.0) / a_cont * b_val;

        // WGSL has no conditional expression; branch explicitly. The branch
        // also guarantees (t - 1u) is never evaluated at t == 0.
        var h_prev: f32 = 0.0;
        if (t > 0u) {
            h_prev = h_cache[b * L * D * N + (t - 1u) * D * N + d * N + n];
        }

        let dh_cur = dh;

        // d(a_bar)/d(a_log) = a_bar * a_cont * dv
        // NOTE(review): the b_bar-through-a_log path (db_bar/da_log) is
        // omitted here, so dA only follows the a_bar dependence.
        let da_bar_da_log = a_bar * a_cont * dv;
        dA[A_idx] = dA[A_idx] + dh_cur * (da_bar_da_log * h_prev);

        // b_bar is linear in b_val: db_bar/db = (a_bar - 1) / a_cont.
        dB[B_idx] = dB[B_idx] + dh_cur * ((a_bar - 1.0) / a_cont) * u_val;

        // du[b, t, d] += dh_t * b_bar (summed over n by concurrent threads —
        // see the race note above).
        du[u_idx] = du[u_idx] + dh_cur * b_bar;

        // Chain rule through softplus and the discretisation:
        // d(a_bar)/d(dv) = a_bar * a_cont, d(b_bar)/d(dv) = a_bar * b_val.
        let da_bar_ddv = a_bar * a_cont;
        let db_bar_ddv = a_bar * b_val;
        let dLoss_ddv = dh_cur * (da_bar_ddv * h_prev + db_bar_ddv * u_val);
        let ddv_ddelta = softplus_grad(delta_raw);
        dDelta[delta_raw_idx] = dDelta[delta_raw_idx] + dLoss_ddv * ddv_ddelta;

        // Propagate to the previous timestep: dh_{t-1} = a_bar * dh_t.
        dh = a_bar * dh_cur;
    }
}
`;
@@ -0,0 +1,120 @@
1
// Weight Update WGSL Kernel (AdamW Optimizer)
// Implements fused AdamW parameter update on the GPU.
//
// AdamW update rule:
//   m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
//   v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2
//   m_hat = m_t / (1 - beta1^t)
//   v_hat = v_t / (1 - beta2^t)
//   theta_t = theta_{t-1} * (1 - lr * weight_decay) - lr * m_hat / (sqrt(v_hat) + eps)

export const WEIGHT_UPDATE_WGSL = /* wgsl */`

struct AdamParams {
    num_elements : u32,
    lr : f32, // learning rate
    beta1 : f32, // default 0.9
    beta2 : f32, // default 0.999
    eps : f32, // default 1e-8
    weight_decay : f32, // default 0.01
    // The bias-correction powers are precomputed on the host once per step,
    // avoiding a per-element pow() in the kernel.
    beta1_t : f32, // beta1^t (precomputed bias correction term)
    beta2_t : f32, // beta2^t
};

@group(0) @binding(0) var<uniform> adam : AdamParams;
// param (N,) – weight tensor (read-write: updated in-place)
@group(0) @binding(1) var<storage, read_write> param : array<f32>;
// grad (N,) – gradient
@group(0) @binding(2) var<storage, read> grad : array<f32>;
// m (N,) – first moment
@group(0) @binding(3) var<storage, read_write> m_state : array<f32>;
// v (N,) – second moment
@group(0) @binding(4) var<storage, read_write> v_state : array<f32>;

// Dispatch: (ceil(N / 256), 1, 1)
// One thread per parameter element; fully data-parallel, no shared memory.
@compute @workgroup_size(256, 1, 1)
fn adamw_update(
    @builtin(global_invocation_id) gid : vec3<u32>,
) {
    let i = gid.x;
    if (i >= adam.num_elements) { return; }

    let g = grad[i];
    let p = param[i];

    // Moment updates: exponential moving averages of g and g^2
    let m_new = adam.beta1 * m_state[i] + (1.0 - adam.beta1) * g;
    let v_new = adam.beta2 * v_state[i] + (1.0 - adam.beta2) * g * g;
    m_state[i] = m_new;
    v_state[i] = v_new;

    // Bias-corrected estimates
    let m_hat = m_new / (1.0 - adam.beta1_t);
    let v_hat = v_new / (1.0 - adam.beta2_t);

    // Weight decay (decoupled: applied to the parameter directly, not folded
    // into the gradient) + gradient step
    param[i] = p * (1.0 - adam.lr * adam.weight_decay) -
        adam.lr * m_hat / (sqrt(v_hat) + adam.eps);
}
`;
60
+
61
// Gradient clipping kernel – clips global gradient norm to max_norm.
// Run before weight updates. Two-pass: first compute squared norm, then scale.
// NOTE(review): the host must zero norm_sq[0] before dispatching pass 1 —
// neither kernel resets it between steps.
export const GRAD_CLIP_WGSL = /* wgsl */`

struct ClipParams {
    num_elements : u32,
    max_norm_sq : f32, // max_norm^2
};

@group(0) @binding(0) var<uniform> clip_p : ClipParams;
@group(0) @binding(1) var<storage, read_write> grad : array<f32>;
@group(0) @binding(2) var<storage, read_write> norm_sq : array<f32>; // size 1, accumulator

var<workgroup> local_sq : array<f32, 256>;

// Pass 1: reduce sum of squares into norm_sq[0]
// Dispatch: (ceil(N/256), 1, 1).
@compute @workgroup_size(256, 1, 1)
fn grad_norm_reduce(
    @builtin(global_invocation_id) gid : vec3<u32>,
    @builtin(local_invocation_index) lid : u32,
) {
    let i = gid.x;
    // Out-of-range lanes contribute 0 so the tree reduction stays exact.
    local_sq[lid] = 0.0;
    if (i < clip_p.num_elements) {
        local_sq[lid] = grad[i] * grad[i];
    }
    workgroupBarrier();

    // Parallel reduction within workgroup: halve the active lanes each round
    // (256 -> 128 -> ... -> 1); barriers sit in uniform control flow.
    var s: u32 = 128u;
    loop {
        if (s == 0u) { break; }
        if (lid < s) {
            local_sq[lid] = local_sq[lid] + local_sq[lid + s];
        }
        workgroupBarrier();
        s = s >> 1u;
    }

    if (lid == 0u) {
        // Non-atomic accumulation (single workgroup assumption for small models)
        // NOTE(review): if more than one workgroup is dispatched
        // (num_elements > 256) these read-modify-writes race. WGSL has no f32
        // atomicAdd, so a multi-workgroup setup needs per-workgroup partials
        // plus a second reduction pass — confirm the dispatch size the host
        // actually uses.
        norm_sq[0] = norm_sq[0] + local_sq[0];
    }
}

// Pass 2: scale gradients if norm exceeds max_norm
// Dispatch: (ceil(N/256), 1, 1); must run after pass 1 fully completes.
@compute @workgroup_size(256, 1, 1)
fn grad_clip_scale(
    @builtin(global_invocation_id) gid : vec3<u32>,
) {
    let i = gid.x;
    if (i >= clip_p.num_elements) { return; }

    let ns = norm_sq[0];
    if (ns > clip_p.max_norm_sq) {
        // scale = max_norm / ||g||, applied uniformly to every element
        let scale = sqrt(clip_p.max_norm_sq / ns);
        grad[i] = grad[i] * scale;
    }
}
`;