npm - @seanhogg/builderforce-memory-engine - Versions diffs - 2026.6.18 - Mend

@seanhogg/builderforce-memory-engine 2026.6.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (113) hide show

package/LICENSE +21 -0
package/README.md +393 -0
package/dist/index.d.ts +32 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +40 -0
package/dist/index.js.map +1 -0
package/dist/kernels/activations.d.ts +5 -0
package/dist/kernels/activations.d.ts.map +1 -0
package/dist/kernels/activations.js +171 -0
package/dist/kernels/activations.js.map +1 -0
package/dist/kernels/attention.d.ts +19 -0
package/dist/kernels/attention.d.ts.map +1 -0
package/dist/kernels/attention.js +263 -0
package/dist/kernels/attention.js.map +1 -0
package/dist/kernels/complex_ssd.d.ts +33 -0
package/dist/kernels/complex_ssd.d.ts.map +1 -0
package/dist/kernels/complex_ssd.js +305 -0
package/dist/kernels/complex_ssd.js.map +1 -0
package/dist/kernels/conv1d.d.ts +3 -0
package/dist/kernels/conv1d.d.ts.map +1 -0
package/dist/kernels/conv1d.js +158 -0
package/dist/kernels/conv1d.js.map +1 -0
package/dist/kernels/linear_projection.d.ts +3 -0
package/dist/kernels/linear_projection.d.ts.map +1 -0
package/dist/kernels/linear_projection.js +219 -0
package/dist/kernels/linear_projection.js.map +1 -0
package/dist/kernels/selective_scan.d.ts +3 -0
package/dist/kernels/selective_scan.d.ts.map +1 -0
package/dist/kernels/selective_scan.js +348 -0
package/dist/kernels/selective_scan.js.map +1 -0
package/dist/kernels/ssd.d.ts +29 -0
package/dist/kernels/ssd.d.ts.map +1 -0
package/dist/kernels/ssd.js +276 -0
package/dist/kernels/ssd.js.map +1 -0
package/dist/kernels/weight_update.d.ts +3 -0
package/dist/kernels/weight_update.d.ts.map +1 -0
package/dist/kernels/weight_update.js +119 -0
package/dist/kernels/weight_update.js.map +1 -0
package/dist/model/attention_block.d.ts +48 -0
package/dist/model/attention_block.d.ts.map +1 -0
package/dist/model/attention_block.js +262 -0
package/dist/model/attention_block.js.map +1 -0
package/dist/model/mamba1_block.d.ts +70 -0
package/dist/model/mamba1_block.d.ts.map +1 -0
package/dist/model/mamba1_block.js +333 -0
package/dist/model/mamba1_block.js.map +1 -0
package/dist/model/mamba2_block.d.ts +44 -0
package/dist/model/mamba2_block.d.ts.map +1 -0
package/dist/model/mamba2_block.js +252 -0
package/dist/model/mamba2_block.js.map +1 -0
package/dist/model/mamba3_block.d.ts +51 -0
package/dist/model/mamba3_block.d.ts.map +1 -0
package/dist/model/mamba3_block.js +270 -0
package/dist/model/mamba3_block.js.map +1 -0
package/dist/model/mamba_block.d.ts +64 -0
package/dist/model/mamba_block.d.ts.map +1 -0
package/dist/model/mamba_block.js +303 -0
package/dist/model/mamba_block.js.map +1 -0
package/dist/model/mamba_model.d.ts +140 -0
package/dist/model/mamba_model.d.ts.map +1 -0
package/dist/model/mamba_model.js +527 -0
package/dist/model/mamba_model.js.map +1 -0
package/dist/model/sequence_layer.d.ts +25 -0
package/dist/model/sequence_layer.d.ts.map +1 -0
package/dist/model/sequence_layer.js +8 -0
package/dist/model/sequence_layer.js.map +1 -0
package/dist/tokenizer/bpe.d.ts +29 -0
package/dist/tokenizer/bpe.d.ts.map +1 -0
package/dist/tokenizer/bpe.js +164 -0
package/dist/tokenizer/bpe.js.map +1 -0
package/dist/training/autograd.d.ts +27 -0
package/dist/training/autograd.d.ts.map +1 -0
package/dist/training/autograd.js +120 -0
package/dist/training/autograd.js.map +1 -0
package/dist/training/trainer.d.ts +36 -0
package/dist/training/trainer.d.ts.map +1 -0
package/dist/training/trainer.js +183 -0
package/dist/training/trainer.js.map +1 -0
package/dist/utils/gpu_utils.d.ts +21 -0
package/dist/utils/gpu_utils.d.ts.map +1 -0
package/dist/utils/gpu_utils.js +111 -0
package/dist/utils/gpu_utils.js.map +1 -0
package/dist/utils/quantization.d.ts +26 -0
package/dist/utils/quantization.d.ts.map +1 -0
package/dist/utils/quantization.js +116 -0
package/dist/utils/quantization.js.map +1 -0
package/dist/utils/rng.d.ts +36 -0
package/dist/utils/rng.d.ts.map +1 -0
package/dist/utils/rng.js +61 -0
package/dist/utils/rng.js.map +1 -0
package/package.json +99 -0
package/src/index.ts +114 -0
package/src/kernels/activations.ts +174 -0
package/src/kernels/attention.ts +268 -0
package/src/kernels/complex_ssd.ts +307 -0
package/src/kernels/conv1d.ts +159 -0
package/src/kernels/linear_projection.ts +220 -0
package/src/kernels/selective_scan.ts +350 -0
package/src/kernels/ssd.ts +278 -0
package/src/kernels/weight_update.ts +120 -0
package/src/model/attention_block.ts +344 -0
package/src/model/mamba1_block.ts +437 -0
package/src/model/mamba2_block.ts +319 -0
package/src/model/mamba3_block.ts +335 -0
package/src/model/mamba_block.ts +401 -0
package/src/model/mamba_model.ts +678 -0
package/src/model/sequence_layer.ts +29 -0
package/src/tokenizer/bpe.ts +186 -0
package/src/training/autograd.ts +135 -0
package/src/training/trainer.ts +309 -0
package/src/utils/gpu_utils.ts +147 -0
package/src/utils/quantization.ts +154 -0
package/src/utils/rng.ts +65 -0

package/dist/kernels/selective_scan.js ADDED Viewed

@@ -0,0 +1,348 @@
+// Parallel Selective Scan WGSL Kernel
+// Implements the S6 (Selective Scan) core of the Mamba architecture.
+// Uses a Kogge-Stone parallel prefix scan inside 64-step tiles (log2(64) combine rounds per tile); tiles are walked sequentially with a carried state.
+//
+// Forward pass recurrence:
+//   h_t = A_t * h_{t-1} + B_t * x_t
+//   y_t = C_t * h_t + D * x_t
+//
+// where A_t, B_t, C_t are input-dependent (selective) gate matrices.
+export const SELECTIVE_SCAN_FORWARD_WGSL = /* wgsl */ `
+// ---- Binding layout ----
+// group 0: sequence data
+// group 1: SSM parameters
+struct ScanParams {
+    seq_len   : u32,   // L  – sequence length
+    d_state   : u32,   // N  – state dimension
+    d_inner   : u32,   // D  – inner (expanded) channel dimension
+    batch     : u32,   // B  – batch size
+};
+@group(0) @binding(0) var<uniform>             params   : ScanParams;
+// u (B, L, D)  – projected input after conv
+@group(0) @binding(1) var<storage, read>       u        : array<f32>;
+// delta (B, L, D) – time-step (Δ) after softplus
+@group(0) @binding(2) var<storage, read>       delta    : array<f32>;
+// A (D, N)  – log-space diagonal state matrix (fixed, learned)
+@group(0) @binding(3) var<storage, read>       A        : array<f32>;
+// B (B, L, N) – input projection (selective)
+@group(0) @binding(4) var<storage, read>       B        : array<f32>;
+// C (B, L, N) – output projection (selective)
+@group(0) @binding(5) var<storage, read>       C        : array<f32>;
+// D (D,) – skip-connection scale
+@group(0) @binding(6) var<storage, read>       D_vec    : array<f32>;
+// y (B, L, D) – output (written by this kernel)
+@group(0) @binding(7) var<storage, read_write> y        : array<f32>;
+// h_cache (B, L, D*N) – hidden states cache (for backward pass)
+@group(0) @binding(8) var<storage, read_write> h_cache  : array<f32>;
+// ---- Workgroup shared memory ----
+// Each workgroup processes one (batch, channel) slice across all time steps.
+// We store the associative pair (a_bar, bu_bar) per time step so we can run
+// a Kogge-Stone scan across the workgroup tile.
+var<workgroup> wg_a  : array<f32, 256>;   // discretised A values
+var<workgroup> wg_bu : array<f32, 256>;   // B*u values
+// ---- Helpers ----
+// Softplus: numerically stable log(1 + exp(x))
+fn softplus(x: f32) -> f32 {
+    return log(1.0 + exp(x));
+}
+// ZerO-Order Hold discretisation of continuous A, Δ:
+//   A_bar = exp(Δ * A)
+//   B_bar = (A_bar - 1) / A * B  ≈  Δ * B  (first-order for simplicity)
+fn discretise_A(delta_val: f32, a_log: f32) -> f32 {
+    // A is stored as -exp(a_log) to ensure A_bar < 1 (stable)
+    let a_cont = -exp(a_log);
+    return exp(delta_val * a_cont);
+}
+fn discretise_B(delta_val: f32, a_log: f32, b_val: f32) -> f32 {
+    let a_cont  = -exp(a_log);
+    let a_bar   = exp(delta_val * a_cont);
+    // (A_bar - 1) / A_cont * B
+    let b_bar   = (a_bar - 1.0) / a_cont * b_val;
+    return b_bar;
+}
+// ---- Main kernel ----
+// Dispatch: (ceil(D/8), ceil(N/8), B)
+// Each invocation is responsible for one (d, n, batch) triplet and scans
+// the entire sequence using a two-pass Kogge-Stone scan within workgroup tiles.
+@compute @workgroup_size(64, 1, 1)
+fn forward_scan(
+    @builtin(global_invocation_id)   gid  : vec3<u32>,
+    @builtin(local_invocation_index) lid  : u32,
+    @builtin(workgroup_id)           wgid : vec3<u32>,
+) {
+    let L = params.seq_len;
+    let N = params.d_state;
+    let D = params.d_inner;
+    let B = params.batch;
+    // Each workgroup handles one (batch b, channel d, state n) combination.
+    // We pack d and n into the x dimension: global d = wgid.x, global n = wgid.y
+    let d = wgid.x;
+    let n = wgid.y;
+    let b = gid.z;
+    if (d >= D || n >= N || b >= B) { return; }
+    // Tile size equals workgroup size (64).  We process TILE_SIZE steps at once.
+    let TILE: u32 = 64u;
+    // Running state h for this (b, d, n)
+    var h: f32 = 0.0;
+    var tile_start: u32 = 0u;
+    loop {
+        if (tile_start >= L) { break; }
+        let t = tile_start + lid;      // absolute time step handled by this lane
+        var a_bar: f32 = 1.0;
+        var bu:    f32 = 0.0;
+        if (t < L) {
+            // Indices
+            let delta_idx = b * L * D + t * D + d;
+            let u_idx     = b * L * D + t * D + d;
+            let A_idx     = d * N + n;
+            let B_idx     = b * L * N + t * N + n;
+            let dv = softplus(delta[delta_idx]);
+            a_bar  = discretise_A(dv, A[A_idx]);
+            bu     = discretise_B(dv, A[A_idx], B[B_idx]) * u[u_idx];
+        }
+        wg_a[lid]  = a_bar;
+        wg_bu[lid] = bu;
+        workgroupBarrier();
+        // ---- Kogge-Stone inclusive prefix scan within tile ----
+        // Associative operator: (a1, b1) ∘ (a2, b2) = (a1*a2, a1*b2 + b1)
+        // This computes cumulative state recurrence in log2(TILE) steps.
+        var stride: u32 = 1u;
+        loop {
+            if (stride >= TILE) { break; }
+            if (lid >= stride) {
+                let prev_a  = wg_a[lid - stride];
+                let prev_bu = wg_bu[lid - stride];
+                // Combine: new_state = prev_a * cur_a (product of A_bars)
+                //                      new_bu  = prev_a * cur_bu + prev_bu
+                let new_a  = prev_a * wg_a[lid];
+                let new_bu = prev_a * wg_bu[lid] + prev_bu;
+                workgroupBarrier();
+                wg_a[lid]  = new_a;
+                wg_bu[lid] = new_bu;
+            }
+            workgroupBarrier();
+            stride = stride << 1u;
+        }
+        // Incorporate the carry-in state from the previous tile.
+        // After the scan wg_bu[lid] holds the intra-tile inclusive sum.
+        // The actual h at position t = h_carry * wg_a[lid] + wg_bu[lid]
+        let h_t = h * wg_a[lid] + wg_bu[lid];
+        if (t < L) {
+            // Cache hidden state for backward pass
+            let h_idx = b * L * D * N + t * D * N + d * N + n;
+            h_cache[h_idx] = h_t;
+            // Accumulate y contribution: y_t += C_t[n] * h_t  (over all n)
+            // We use an atomic-style accumulation: each (d, n) lane adds its
+            // contribution to the same y[b, t, d].  This races without atomics,
+            // so we instead write to a full h_cache and reduce in a second pass.
+            // Here we perform direct accumulation using atomicAdd approximation:
+            // (safe because each lane writes a unique n, which is stride 1 in mem)
+            let C_idx = b * L * N + t * N + n;
+            let y_idx = b * L * D + t * D + d;
+            // Direct write for n == 0 (first state dim), add for the rest.
+            // Since all workgroups for the same (b,d) run concurrently we must
+            // accumulate safely: we write each partial into h_cache and reduce
+            // in a subsequent lightweight kernel (forward_reduce).
+            // (For simplicity and correctness here we directly atomically add via
+            //  f32 emulation – real deployment uses atomicAdd on f32 with spirv ext.)
+            // We store C*h contribution separately so forward_reduce can sum them.
+            // Layout: y_partial (B, L, D, N) – one slot per state dim
+            // y reused as y_partial in this kernel; forward_reduce collapses N dim.
+            let y_partial_idx = b * L * D * N + t * D * N + d * N + n;
+            // Reuse h_cache second half as y_partial (offset by B*L*D*N)
+            let offset = B * L * D * N;
+            h_cache[offset + y_partial_idx] = C[C_idx] * h_t;
+        }
+        // Update carry: last lane's h_t is the tile's final state
+        let last = min(TILE, L - tile_start) - 1u;
+        h = wg_a[last] * h + wg_bu[last];   // recombine carry
+        workgroupBarrier();
+        tile_start = tile_start + TILE;
+    }
+}
+// ---- Reduction kernel ----
+// Collapses the N (d_state) dimension of y_partial into y.
+// Adds the D (skip connection) term: y_t[d] += D_vec[d] * u_t[d]
+// Dispatch: (ceil(L/64), D, B)
+@compute @workgroup_size(64, 1, 1)
+fn forward_reduce(
+    @builtin(global_invocation_id) gid : vec3<u32>,
+) {
+    let L = params.seq_len;
+    let N = params.d_state;
+    let D = params.d_inner;
+    let B = params.batch;
+    let t = gid.x;
+    let d = gid.y;
+    let b = gid.z;
+    if (t >= L || d >= D || b >= B) { return; }
+    let offset    = B * L * D * N;
+    var sum: f32  = 0.0;
+    for (var n: u32 = 0u; n < N; n = n + 1u) {
+        let idx = offset + b * L * D * N + t * D * N + d * N + n;
+        sum = sum + h_cache[idx];
+    }
+    // Add skip connection
+    let u_idx = b * L * D + t * D + d;
+    sum = sum + D_vec[d] * u[u_idx];
+    let y_idx = b * L * D + t * D + d;
+    y[y_idx] = sum;
+}
+`;
+// ---- Backward scan kernel (for autograd) ----
+// Computes gradients w.r.t. Δ, A, B, C and u using the cached hidden states.
+export const SELECTIVE_SCAN_BACKWARD_WGSL = /* wgsl */ `
+struct ScanParams {
+    seq_len  : u32,
+    d_state  : u32,
+    d_inner  : u32,
+    batch    : u32,
+};
+@group(0) @binding(0) var<uniform>             params    : ScanParams;
+@group(0) @binding(1) var<storage, read>       u         : array<f32>;
+@group(0) @binding(2) var<storage, read>       delta     : array<f32>;
+@group(0) @binding(3) var<storage, read>       A         : array<f32>;
+@group(0) @binding(4) var<storage, read>       B         : array<f32>;
+@group(0) @binding(5) var<storage, read>       C         : array<f32>;
+@group(0) @binding(6) var<storage, read>       h_cache   : array<f32>;
+@group(0) @binding(7) var<storage, read>       dy        : array<f32>;  // upstream gradient
+@group(0) @binding(8) var<storage, read_write> dA        : array<f32>;
+@group(0) @binding(9) var<storage, read_write> dB        : array<f32>;
+@group(0) @binding(10) var<storage, read_write> dC       : array<f32>;
+@group(0) @binding(11) var<storage, read_write> dDelta   : array<f32>;
+@group(0) @binding(12) var<storage, read_write> du       : array<f32>;
+fn softplus(x: f32) -> f32 {
+    return log(1.0 + exp(x));
+}
+fn softplus_grad(x: f32) -> f32 {
+    // d/dx softplus(x) = sigmoid(x)
+    return 1.0 / (1.0 + exp(-x));
+}
+fn discretise_A(delta_val: f32, a_log: f32) -> f32 {
+    let a_cont = -exp(a_log);
+    return exp(delta_val * a_cont);
+}
+// Reverse scan (backward pass) – processes time from T-1 down to 0.
+// Dispatch: (D, N, B)
+@compute @workgroup_size(1, 1, 1)
+fn backward_scan(
+    @builtin(global_invocation_id) gid : vec3<u32>,
+) {
+    let L = params.seq_len;
+    let N = params.d_state;
+    let D = params.d_inner;
+    let B = params.batch;
+    let d = gid.x;
+    let n = gid.y;
+    let b = gid.z;
+    if (d >= D || n >= N || b >= B) { return; }
+    var dh: f32 = 0.0;   // gradient of loss w.r.t. h_t, accumulated backwards
+    var t: u32 = L;
+    loop {
+        if (t == 0u) { break; }
+        t = t - 1u;
+        let delta_raw_idx = b * L * D + t * D + d;
+        let A_idx         = d * N + n;
+        let B_idx         = b * L * N + t * N + n;
+        let C_idx         = b * L * N + t * N + n;
+        let u_idx         = b * L * D + t * D + d;
+        let h_idx         = b * L * D * N + t * D * N + d * N + n;
+        let delta_raw = delta[delta_raw_idx];
+        let dv        = softplus(delta_raw);
+        let a_log     = A[A_idx];
+        let a_cont    = -exp(a_log);
+        let a_bar     = exp(dv * a_cont);
+        let b_val     = B[B_idx];
+        let c_val     = C[C_idx];
+        let u_val     = u[u_idx];
+        let h_t       = h_cache[h_idx];
+        // dy_t contribution to dh (from C * h_t in the output)
+        // y_t[d] = sum_n C[n] * h_t[n] + D * u   =>  dh_t[n] += C[n] * dy_t[d]
+        let dy_val = dy[b * L * D + t * D + d];
+        dh = dh + c_val * dy_val;
+        // dC[b, t, n] += dy_t[d] * h_t
+        dC[C_idx] = dC[C_idx] + dy_val * h_t;
+        // h_t = a_bar * h_{t-1} + b_bar * u_t
+        // b_bar = (a_bar - 1) / a_cont * b_val
+        let b_bar  = (a_bar - 1.0) / a_cont * b_val;
+        let h_prev = (t > 0u) ? h_cache[b * L * D * N + (t - 1u) * D * N + d * N + n] : 0.0;
+        // dh_{t-1} += a_bar * dh_t
+        // (accumulated in next iteration; here dh already contains upstream)
+        let dh_cur = dh;
+        // dA[d,n] += dh_t * (d a_bar/d a_cont) * (d a_cont/d a_log) * h_{t-1}
+        //          + dh_t * (d b_bar/d a_cont) * ... * b_val * u_val
+        // d(a_bar)/d(a_log) = a_bar * (-exp(a_log)) * dv = a_bar * a_cont * dv
+        let da_bar_da_log = a_bar * a_cont * dv;
+        dA[A_idx] = dA[A_idx] + dh_cur * (da_bar_da_log * h_prev);
+        // dB[b,t,n] += dh_t * b_bar / b_val * u_val  (since b_bar is linear in b)
+        dB[B_idx] = dB[B_idx] + dh_cur * ((a_bar - 1.0) / a_cont) * u_val;
+        // du[b,t,d] += dh_t * b_bar  (accumulate over n in separate kernel)
+        du[u_idx] = du[u_idx] + dh_cur * b_bar;
+        // dDelta[b,t,d]: chain rule through softplus and discretisation
+        // d(b_bar)/d(dv) = d/d(dv)[(a_bar-1)/a_cont * b] = a_bar * b / (a_cont ... )
+        //  actually: d(a_bar)/d(dv) = a_bar * a_cont,  d(b_bar)/d(dv) = a_bar * b_val
+        let da_bar_ddv  = a_bar * a_cont;
+        let db_bar_ddv  = a_bar * b_val;
+        let dLoss_ddv   = dh_cur * (da_bar_ddv * h_prev + db_bar_ddv * u_val);
+        let ddv_ddelta  = softplus_grad(delta_raw);
+        dDelta[delta_raw_idx] = dDelta[delta_raw_idx] + dLoss_ddv * ddv_ddelta;
+        // Propagate dh to previous timestep
+        dh = a_bar * dh_cur;
+    }
+}
+`;
+//# sourceMappingURL=selective_scan.js.map

package/dist/kernels/selective_scan.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"selective_scan.js","sourceRoot":"","sources":["../../src/kernels/selective_scan.ts"],"names":[],"mappings":"AAAA,sCAAsC;AACtC,qEAAqE;AACrE,4EAA4E;AAC5E,EAAE;AACF,2BAA2B;AAC3B,oCAAoC;AACpC,8BAA8B;AAC9B,EAAE;AACF,qEAAqE;AAErE,MAAM,CAAC,MAAM,2BAA2B,GAAW,UAAU,CAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAsN5D,CAAC;AAEF,gDAAgD;AAChD,uEAAuE;AAEvE,MAAM,CAAC,MAAM,4BAA4B,GAAW,UAAU,CAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAwH7D,CAAC"}

package/dist/kernels/ssd.d.ts ADDED Viewed

@@ -0,0 +1,29 @@
+/**
+ * ssd.ts – Structured State Space Duality (SSD) kernels for Mamba-2.
+ *
+ * Implements a chunked SSD algorithm:
+ *   A_bar_t = exp(-softplus(A_h) · softplus(dt_t + dt_bias_h))   [scalar per head]
+ *   h_t     = A_bar_t · h_{t-1} + B_t · x_t                      [MIMO per head]
+ *   y_t     = C_t · h_t
+ *
+ * The sequence is split into chunks of `chunk_len` time steps.
+ * Within each chunk the recurrence is run sequentially; the carry-over
+ * state `h` is passed forward between chunks via the state_carry buffer.
+ *
+ * Dispatch for ssd_chunk_forward:  (num_chunks, H, B)
+ * Dispatch for ssd_chunk_backward: (num_chunks, H, B)
+ *
+ * Buffer layout (all f32, row-major):
+ *   x           : [B, L, D_inner]     where D_inner = H * d_head
+ *   B_proj      : [B, L, n_groups, N]
+ *   C_proj      : [B, L, n_groups, N]
+ *   dt          : [B, L, H]
+ *   A_log       : [H]                 log(-A), positive scalar per head
+ *   dt_bias     : [H]
+ *   D_vec       : [H]                 skip connection per head
+ *   out         : [B, L, D_inner]     scan output (written by kernel)
+ *   state_carry : [num_chunks+1, B, H, N, d_head]  inter-chunk states
+ */
+/** WGSL source of the SSD forward kernel (compute entry point: ssd_chunk_forward). */
+export declare const SSD_FORWARD_WGSL: string;
+/** WGSL source of the SSD backward kernel (compute entry point: ssd_chunk_backward). */
+export declare const SSD_BACKWARD_WGSL: string;
+//# sourceMappingURL=ssd.d.ts.map

package/dist/kernels/ssd.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"ssd.d.ts","sourceRoot":"","sources":["../../src/kernels/ssd.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAEH,eAAO,MAAM,gBAAgB,EAAE,MA+H9B,CAAC;AAIF,eAAO,MAAM,iBAAiB,EAAE,MAuH/B,CAAC"}

package/dist/kernels/ssd.js ADDED Viewed

@@ -0,0 +1,276 @@
+/**
+ * ssd.ts – Structured State Space Duality (SSD) kernels for Mamba-2.
+ *
+ * Implements a chunked SSD algorithm:
+ *   A_bar_t = exp(-softplus(A_h) · softplus(dt_t + dt_bias_h))   [scalar per head]
+ *   h_t     = A_bar_t · h_{t-1} + B_t · x_t                      [MIMO per head]
+ *   y_t     = C_t · h_t
+ *
+ * The sequence is split into chunks of `chunk_len` time steps.
+ * Within each chunk the recurrence is run sequentially; the carry-over
+ * state `h` is passed forward between chunks via the state_carry buffer.
+ *
+ * Dispatch for ssd_chunk_forward:  (num_chunks, H, B)
+ * Dispatch for ssd_chunk_backward: (num_chunks, H, B)
+ *
+ * Buffer layout (all f32, row-major):
+ *   x           : [B, L, D_inner]     where D_inner = H * d_head
+ *   B_proj      : [B, L, n_groups, N]
+ *   C_proj      : [B, L, n_groups, N]
+ *   dt          : [B, L, H]
+ *   A_log       : [H]                 log(-A), positive scalar per head
+ *   dt_bias     : [H]
+ *   D_vec       : [H]                 skip connection per head
+ *   out         : [B, L, D_inner]     scan output (written by kernel)
+ *   state_carry : [num_chunks+1, B, H, N, d_head]  inter-chunk states
+ */
+export const SSD_FORWARD_WGSL = /* wgsl */ `
+struct SsdParams {
+    seq_len    : u32,
+    d_inner    : u32,
+    n_heads    : u32,
+    d_head     : u32,   // d_inner / n_heads
+    n_groups   : u32,
+    d_state    : u32,   // N
+    chunk_len  : u32,
+    n_chunks   : u32,
+    batch      : u32,
+};
+@group(0) @binding(0) var<uniform>             params      : SsdParams;
+@group(0) @binding(1) var<storage, read>       x_in        : array<f32>; // [B,L,D_inner]
+@group(0) @binding(2) var<storage, read>       B_proj      : array<f32>; // [B,L,n_groups,N]
+@group(0) @binding(3) var<storage, read>       C_proj      : array<f32>; // [B,L,n_groups,N]
+@group(0) @binding(4) var<storage, read>       dt_in       : array<f32>; // [B,L,H]
+@group(0) @binding(5) var<storage, read>       A_log       : array<f32>; // [H]
+@group(0) @binding(6) var<storage, read>       dt_bias     : array<f32>; // [H]
+@group(0) @binding(7) var<storage, read>       D_vec       : array<f32>; // [H]
+@group(0) @binding(8) var<storage, read_write> out_buf     : array<f32>; // [B,L,D_inner]
+@group(0) @binding(9) var<storage, read_write> state_carry : array<f32>; // [n_chunks+1,B,H,N,d_head]
+fn softplus(x: f32) -> f32 {
+    return log(1.0 + exp(x));
+}
+// Workgroup: one chunk × one head × one batch item
+@compute @workgroup_size(1, 1, 1)
+fn ssd_chunk_forward(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let chunk_id = gid.x;
+    let head_id  = gid.y;
+    let batch_id = gid.z;
+    let L  = params.seq_len;
+    let D  = params.d_inner;
+    let H  = params.n_heads;
+    let dh = params.d_head;
+    let G  = params.n_groups;
+    let N  = params.d_state;
+    let CL = params.chunk_len;
+    let NC = params.n_chunks;
+    let B  = params.batch;
+    let t_start = chunk_id * CL;
+    let t_end   = min(t_start + CL, L);
+    // Group index: heads are partitioned across groups
+    let group_id = head_id * G / H;
+    // A scalar for this head
+    let neg_A = softplus(A_log[head_id]);  // A_log stores log(-A) positive
+    let db    = dt_bias[head_id];
+    let d_skip = D_vec[head_id];
+    // Load carry-in state: h[N, dh] (stored flat as N*dh floats)
+    // state_carry layout: [NC+1, B, H, N*dh]
+    let state_stride_chunk = B * H * N * dh;
+    let state_base_in = chunk_id * state_stride_chunk
+                      + batch_id * H * N * dh
+                      + head_id  * N * dh;
+    // We maintain h as a local array (N * dh floats).
+    // WebGPU WGSL does not support variable-length arrays in function scope,
+    // so we use a fixed maximum. Max N*dh = 64*64 = 4096. Here we use dynamic
+    // indexing into state_carry which is shared storage.
+    // Write carry-in into temporary positions — use state_carry directly for
+    // the running state (overwrite in-place from carry-in slot).
+    // Copy carry-in to working slot (chunk_id+1 slot, updated each step).
+    let state_base_out = (chunk_id + 1u) * state_stride_chunk
+                       + batch_id * H * N * dh
+                       + head_id  * N * dh;
+    // Initialise working state from carry-in
+    for (var s: u32 = 0u; s < N * dh; s = s + 1u) {
+        state_carry[state_base_out + s] = state_carry[state_base_in + s];
+    }
+    // Sequential scan over the chunk
+    for (var t: u32 = t_start; t < t_end; t = t + 1u) {
+        // dt scalar for this head at time t
+        let dt_idx = batch_id * L * H + t * H + head_id;
+        let dt_val = softplus(dt_in[dt_idx] + db);
+        // A_bar = exp(-neg_A * dt_val)
+        let a_bar = exp(-neg_A * dt_val);
+        // Head slice of x: x[batch, t, head*dh .. (head+1)*dh]
+        let x_base = batch_id * L * D + t * D + head_id * dh;
+        // B at this time step: B_proj[batch, t, group_id, *] shape [N]
+        let b_base = batch_id * L * G * N + t * G * N + group_id * N;
+        // C at this time step: C_proj[batch, t, group_id, *] shape [N]
+        let c_base = batch_id * L * G * N + t * G * N + group_id * N;
+        // y accumulator for this head at time t
+        var y_acc: f32 = 0.0;
+        for (var n: u32 = 0u; n < N; n = n + 1u) {
+            let b_val = B_proj[b_base + n];
+            let c_val = C_proj[c_base + n];
+            for (var i: u32 = 0u; i < dh; i = i + 1u) {
+                let s_idx = state_base_out + n * dh + i;
+                let x_val = x_in[x_base + i];
+                // h_t = A_bar * h_{t-1} + B * x
+                let h_new = a_bar * state_carry[s_idx] + b_val * x_val;
+                state_carry[s_idx] = h_new;
+                // y += C * h (summed over n dimension per output channel i)
+                y_acc = y_acc + c_val * h_new;
+            }
+        }
+        // Write y + skip (D * x, averaged over dh for the skip scalar)
+        // out[batch, t, head*dh .. (head+1)*dh]
+        for (var i: u32 = 0u; i < dh; i = i + 1u) {
+            let out_idx = batch_id * L * D + t * D + head_id * dh + i;
+            let x_val   = x_in[x_base + i];
+            out_buf[out_idx] = y_acc + d_skip * x_val;
+        }
+    }
+}
+`;
+// ── Backward ──────────────────────────────────────────────────────────────────
+export const SSD_BACKWARD_WGSL = /* wgsl */ `
+struct SsdParams {
+    seq_len    : u32,
+    d_inner    : u32,
+    n_heads    : u32,
+    d_head     : u32,
+    n_groups   : u32,
+    d_state    : u32,
+    chunk_len  : u32,
+    n_chunks   : u32,
+    batch      : u32,
+};
+@group(0) @binding(0) var<uniform>             params      : SsdParams;
+@group(0) @binding(1) var<storage, read>       x_in        : array<f32>;
+@group(0) @binding(2) var<storage, read>       B_proj      : array<f32>;
+@group(0) @binding(3) var<storage, read>       C_proj      : array<f32>;
+@group(0) @binding(4) var<storage, read>       dt_in       : array<f32>;
+@group(0) @binding(5) var<storage, read>       A_log       : array<f32>;
+@group(0) @binding(6) var<storage, read>       dt_bias     : array<f32>;
+@group(0) @binding(7) var<storage, read>       state_carry : array<f32>; // forward states
+@group(0) @binding(8) var<storage, read>       dy          : array<f32>; // upstream grad
+@group(0) @binding(9) var<storage, read_write> dx          : array<f32>;
+@group(0) @binding(10) var<storage, read_write> dB         : array<f32>;
+@group(0) @binding(11) var<storage, read_write> dC         : array<f32>;
+@group(0) @binding(12) var<storage, read_write> ddt        : array<f32>;
+@group(0) @binding(13) var<storage, read_write> dA_log     : array<f32>;
+@group(0) @binding(14) var<storage, read_write> dD_vec     : array<f32>;
+fn softplus(x: f32) -> f32 {
+    return log(1.0 + exp(x));
+}
+fn d_softplus(x: f32) -> f32 {
+    return 1.0 / (1.0 + exp(-x));
+}
+@compute @workgroup_size(1, 1, 1)
+fn ssd_chunk_backward(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let chunk_id = gid.x;
+    let head_id  = gid.y;
+    let batch_id = gid.z;
+    let L  = params.seq_len;
+    let D  = params.d_inner;
+    let H  = params.n_heads;
+    let dh = params.d_head;
+    let G  = params.n_groups;
+    let N  = params.d_state;
+    let CL = params.chunk_len;
+    let NC = params.n_chunks;
+    let B  = params.batch;
+    let t_start = chunk_id * CL;
+    let t_end   = min(t_start + CL, L);
+    let group_id = head_id * G / H;
+    let neg_A  = softplus(A_log[head_id]);
+    let db     = dt_bias[head_id];
+    let state_stride = B * H * N * dh;
+    let state_base   = chunk_id * state_stride
+                     + batch_id * H * N * dh
+                     + head_id  * N * dh;
+    // Backward: iterate time steps in reverse within the chunk
+    // dh_next starts at zero (or propagated from future chunks — simplified here)
+    for (var t_rev: u32 = 0u; t_rev < t_end - t_start; t_rev = t_rev + 1u) {
+        let t = t_end - 1u - t_rev;
+        let dt_idx = batch_id * L * H + t * H + head_id;
+        let dt_raw = dt_in[dt_idx] + db;
+        let dt_val = softplus(dt_raw);
+        let a_bar  = exp(-neg_A * dt_val);
+        let x_base = batch_id * L * D + t * D + head_id * dh;
+        let b_base = batch_id * L * G * N + t * G * N + group_id * N;
+        let c_base = b_base;
+        for (var i: u32 = 0u; i < dh; i = i + 1u) {
+            let dy_val  = dy[batch_id * L * D + t * D + head_id * dh + i];
+            let x_val   = x_in[x_base + i];
+            // dD_vec
+            dD_vec[head_id] = dD_vec[head_id] + dy_val * x_val;
+            // dx from skip
+            dx[x_base + i] = dx[x_base + i] + dy_val * /* D */ 1.0;
+            for (var n: u32 = 0u; n < N; n = n + 1u) {
+                let s_idx = state_base + n * dh + i;
+                let h_val = state_carry[(chunk_id + 1u) * state_stride
+                                       + batch_id * H * N * dh
+                                       + head_id * N * dh + n * dh + i];
+                let c_val = C_proj[c_base + n];
+                let b_val = B_proj[b_base + n];
+                // dC += dy * h
+                dC[b_base + n] = dC[b_base + n] + dy_val * h_val;
+                // dh = C * dy
+                let dh_val = c_val * dy_val;
+                // dB += dh * x
+                dB[b_base + n] = dB[b_base + n] + dh_val * x_val;
+                // dx += dh * B
+                dx[x_base + i] = dx[x_base + i] + dh_val * b_val;
+                // ddt += dh * h_prev * (-neg_A) * d_softplus(dt_raw)
+                let h_prev = state_carry[s_idx];
+                ddt[dt_idx] = ddt[dt_idx]
+                    + dh_val * h_prev * (-neg_A) * d_softplus(dt_raw);
+                // dA_log += dh * h_prev * a_bar * (-dt_val) * d_softplus(A_log[head])
+                dA_log[head_id] = dA_log[head_id]
+                    + dh_val * h_prev * a_bar * (-dt_val) * d_softplus(A_log[head_id]);
+            }
+        }
+    }
+}
+`;
+//# sourceMappingURL=ssd.js.map

package/dist/kernels/ssd.js.map ADDED Viewed

@@ -0,0 +1 @@

+ {"version":3,"file":"ssd.js","sourceRoot":"","sources":["../../src/kernels/ssd.ts"],"names":[],"mappings":"AAAA;;;;;;;;;;;;;;;;;;;;;;;;;GAyBG;AAEH,MAAM,CAAC,MAAM,gBAAgB,GAAW,UAAU,CAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CA+HjD,CAAC;AAEF,iFAAiF;AAEjF,MAAM,CAAC,MAAM,iBAAiB,GAAW,UAAU,CAAA;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;CAuHlD,CAAC"}

package/dist/kernels/weight_update.d.ts ADDED Viewed

@@ -0,0 +1,3 @@
+/** Exported string constant; presumably the WGSL source of the weight-update kernel — TODO confirm against weight_update.js. */
+export declare const WEIGHT_UPDATE_WGSL: string;
+/** Exported string constant; presumably the WGSL source of the gradient-clipping kernel — TODO confirm against weight_update.js. */
+export declare const GRAD_CLIP_WGSL: string;
+//# sourceMappingURL=weight_update.d.ts.map

package/dist/kernels/weight_update.d.ts.map ADDED Viewed

	@@ -0,0 +1 @@
1	+ {"version":3,"file":"weight_update.d.ts","sourceRoot":"","sources":["../../src/kernels/weight_update.ts"],"names":[],"mappings":"AAUA,eAAO,MAAM,kBAAkB,EAAE,MAgDhC,CAAC;AAIF,eAAO,MAAM,cAAc,EAAE,MAyD5B,CAAC"}