mambacode.js 1.0.0 → 1.0.2
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +198 -76
- package/dist/index.d.ts +19 -0
- package/dist/index.d.ts.map +1 -0
- package/dist/index.js +18 -0
- package/dist/index.js.map +1 -0
- package/dist/kernels/activations.d.ts +3 -0
- package/dist/kernels/activations.d.ts.map +1 -0
- package/dist/kernels/activations.js +87 -0
- package/dist/kernels/activations.js.map +1 -0
- package/dist/kernels/conv1d.d.ts +3 -0
- package/dist/kernels/conv1d.d.ts.map +1 -0
- package/dist/kernels/conv1d.js +152 -0
- package/dist/kernels/conv1d.js.map +1 -0
- package/dist/kernels/linear_projection.d.ts +3 -0
- package/dist/kernels/linear_projection.d.ts.map +1 -0
- package/dist/kernels/linear_projection.js +219 -0
- package/dist/kernels/linear_projection.js.map +1 -0
- package/dist/kernels/selective_scan.d.ts +3 -0
- package/dist/kernels/selective_scan.d.ts.map +1 -0
- package/dist/kernels/selective_scan.js +348 -0
- package/dist/kernels/selective_scan.js.map +1 -0
- package/dist/kernels/weight_update.d.ts +3 -0
- package/dist/kernels/weight_update.d.ts.map +1 -0
- package/dist/kernels/weight_update.js +119 -0
- package/dist/kernels/weight_update.js.map +1 -0
- package/dist/model/mamba_block.d.ts +64 -0
- package/dist/model/mamba_block.d.ts.map +1 -0
- package/dist/model/mamba_block.js +309 -0
- package/dist/model/mamba_block.js.map +1 -0
- package/dist/model/mamba_model.d.ts +66 -0
- package/dist/model/mamba_model.d.ts.map +1 -0
- package/dist/model/mamba_model.js +289 -0
- package/dist/model/mamba_model.js.map +1 -0
- package/dist/tokenizer/bpe.d.ts +29 -0
- package/dist/tokenizer/bpe.d.ts.map +1 -0
- package/dist/tokenizer/bpe.js +164 -0
- package/dist/tokenizer/bpe.js.map +1 -0
- package/dist/training/autograd.d.ts +27 -0
- package/dist/training/autograd.d.ts.map +1 -0
- package/dist/training/autograd.js +120 -0
- package/dist/training/autograd.js.map +1 -0
- package/dist/training/trainer.d.ts +37 -0
- package/dist/training/trainer.d.ts.map +1 -0
- package/dist/training/trainer.js +183 -0
- package/dist/training/trainer.js.map +1 -0
- package/dist/utils/gpu_utils.d.ts +21 -0
- package/dist/utils/gpu_utils.d.ts.map +1 -0
- package/dist/utils/gpu_utils.js +111 -0
- package/dist/utils/gpu_utils.js.map +1 -0
- package/dist/utils/quantization.d.ts +26 -0
- package/dist/utils/quantization.d.ts.map +1 -0
- package/dist/utils/quantization.js +116 -0
- package/dist/utils/quantization.js.map +1 -0
- package/package.json +43 -18
- package/src/index.ts +61 -0
- package/src/kernels/{activations.js → activations.ts} +2 -2
- package/src/kernels/{linear_projection.js → linear_projection.ts} +2 -2
- package/src/kernels/{selective_scan.js → selective_scan.ts} +2 -2
- package/src/kernels/{weight_update.js → weight_update.ts} +2 -2
- package/src/model/{mamba_block.js → mamba_block.ts} +134 -170
- package/src/model/{mamba_model.js → mamba_model.ts} +165 -121
- package/src/tokenizer/bpe.ts +186 -0
- package/src/training/autograd.ts +135 -0
- package/src/training/{trainer.js → trainer.ts} +79 -161
- package/src/utils/gpu_utils.ts +147 -0
- package/src/utils/quantization.ts +154 -0
- package/src/index.js +0 -89
- package/src/tokenizer/bpe.js +0 -256
- package/src/training/autograd.js +0 -221
- package/src/utils/gpu_utils.js +0 -217
- package/src/utils/quantization.js +0 -215
- package/src/kernels/{conv1d.js → conv1d.ts} +0 -0
```diff
--- a/package/src/model/mamba_block.js
+++ b/package/src/model/mamba_block.ts
@@ -1,19 +1,5 @@
 /**
- * mamba_block.
- *
- * Implements one complete Mamba residual layer:
- *
- * x ──► Norm ──► Linear up (×2, for z-gate) ──► Conv1D ──► SiLU ──► Scan ──► × z ──► Linear down ──► + x
- *
- * Components (all dispatched as WebGPU compute passes):
- * 1. RMSNorm
- * 2. Linear up-projection: (D_model → 2 × D_inner)
- * 3. 1D Causal Convolution (depthwise, kernel_size=4)
- * 4. SiLU activation
- * 5. Selective Scan (S6 core)
- * 6. Gated multiplication: y * SiLU(z)
- * 7. Linear down-projection: (D_inner → D_model)
- * 8. Residual add
+ * mamba_block.ts – Mamba Mixer Block
  */
 
 import {
@@ -31,87 +17,126 @@ import { CONV1D_FORWARD_WGSL } from '../kernels/conv1d.js';
 import { LINEAR_FORWARD_WGSL } from '../kernels/linear_projection.js';
 import { ACTIVATIONS_WGSL } from '../kernels/activations.js';
 
-
-
-
-
-
-
-
-
-
+export interface MambaBlockConfig {
+  dModel: number;
+  dState?: number;
+  dConv?: number;
+  expand?: number;
+  dtRank?: number;
+  biasConv?: boolean;
+}
+
+export interface BlockParam {
+  buf: GPUBuffer;
+  numel: number;
+  name: string;
+}
+
+export interface BlockCache {
+  normInv: GPUBuffer;
+  normIn: GPUBuffer;
+  normOut: GPUBuffer;
+  zBuf: GPUBuffer;
+  xConvIn: GPUBuffer;
+  convOut: GPUBuffer;
+  siluOut: GPUBuffer;
+  deltaFull: GPUBuffer;
+  B_raw: GPUBuffer;
+  C_raw: GPUBuffer;
+  hCache: GPUBuffer;
+}
+
+export interface BlockForwardResult {
+  output: GPUBuffer;
+  cache: BlockCache;
+}
 
 export class MambaBlock {
-
-
-
-
-
+  device: GPUDevice;
+  config: Required<MambaBlockConfig>;
+  dInner: number;
+  dtRank: number;
+  wInProj: Float32Array;
+  bInProj: Float32Array;
+  wConv: Float32Array;
+  bConv: Float32Array;
+  wXProj: Float32Array;
+  bXProj: Float32Array;
+  wDtProj: Float32Array;
+  bDtProj: Float32Array;
+  A_log: Float32Array;
+  D_vec: Float32Array;
+  wOutProj: Float32Array;
+  bOutProj: Float32Array;
+  normWeight: Float32Array;
+  gpuWeights: Record<string, GPUBuffer>;
+  pipelines: Record<string, GPUComputePipeline>;
+  private _wslaMode = false;
+
+  constructor(device: GPUDevice, config: MambaBlockConfig) {
     this.device = device;
     this.config = {
       dState  : 16,
       dConv   : 4,
       expand  : 2,
       biasConv: true,
+      dtRank  : Math.ceil(config.dModel / 16),
       ...config,
-    }
+    } as Required<MambaBlockConfig>;
 
-    const { dModel,
+    const { dModel, expand } = this.config;
     this.dInner = expand * dModel;
-    this.dtRank =
+    this.dtRank = config.dtRank ?? Math.ceil(dModel / 16);
+
+    // Initialize these before _initWeights so TypeScript is happy
+    this.wInProj = new Float32Array(0);
+    this.bInProj = new Float32Array(0);
+    this.wConv = new Float32Array(0);
+    this.bConv = new Float32Array(0);
+    this.wXProj = new Float32Array(0);
+    this.bXProj = new Float32Array(0);
+    this.wDtProj = new Float32Array(0);
+    this.bDtProj = new Float32Array(0);
+    this.A_log = new Float32Array(0);
+    this.D_vec = new Float32Array(0);
+    this.wOutProj = new Float32Array(0);
+    this.bOutProj = new Float32Array(0);
+    this.normWeight = new Float32Array(0);
+    this.gpuWeights = {};
+    this.pipelines = {};
 
-    // ---- Initialise learnable parameters (CPU → GPU) ----
     this._initWeights();
-
-    // ---- Compile GPU pipelines (once) ----
     this._buildPipelines();
   }
 
-
-
-  _initWeights() {
+  private _initWeights(): void {
     const { dModel, dState, dConv } = this.config;
     const D = this.dInner;
     const N = dState;
     const K = dConv;
     const R = this.dtRank;
 
-    const randn = (n, std = 0.02) => {
+    const randn = (n: number, std = 0.02): Float32Array => {
       const a = new Float32Array(n);
       for (let i = 0; i < n; i++) {
-        // Box-Muller
        const u1 = Math.random(), u2 = Math.random();
         a[i] = std * Math.sqrt(-2 * Math.log(u1 + 1e-12)) * Math.cos(2 * Math.PI * u2);
       }
       return a;
     };
 
-    const zeros = (n) => new Float32Array(n);
-    const ones = (n) => new Float32Array(n).fill(1.0);
-    const linspace = (n) => {
-      const a = new Float32Array(n);
-      for (let i = 0; i < n; i++) a[i] = i;
-      return a;
-    };
+    const zeros = (n: number): Float32Array => new Float32Array(n);
+    const ones = (n: number): Float32Array => new Float32Array(n).fill(1.0);
 
-    // in_proj: (2*D_inner, D_model) – up-projection (and z gate)
     this.wInProj = randn(2 * D * dModel);
     this.bInProj = zeros(2 * D);
-
-    // conv1d: weight (D_inner, K), bias (D_inner,)
     this.wConv = randn(D * K, 0.01);
     this.bConv = zeros(D);
-
-    // x_proj: (dt_rank + 2*N, D_inner) – projects x to Δ, B, C
     this.wXProj = randn((R + 2 * N) * D, 0.01);
     this.bXProj = zeros(R + 2 * N);
-
-    // dt_proj: (D_inner, dt_rank) – projects Δ to full D_inner width
     this.wDtProj = randn(D * R, 0.02);
     this.bDtProj = zeros(D);
 
-    // A: (D_inner, N) – log-space negative eigenvalues
-    // Initialised to log(range(1, N+1)) per HiPPO theory
     this.A_log = new Float32Array(D * N);
     for (let d = 0; d < D; d++) {
      for (let n = 0; n < N; n++) {
```
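The headline change in this hunk is the typed `MambaBlockConfig` plus the constructor's default resolution. As a quick reference, the resolution logic can be sketched standalone; the interface shape is taken from the diff, while `resolveConfig` itself is a hypothetical helper, not a package export:

```typescript
// Hypothetical standalone sketch of the constructor's config resolution;
// mirrors the diff above, not an actual export of mambacode.js.
interface MambaBlockConfig {
  dModel: number;
  dState?: number;
  dConv?: number;
  expand?: number;
  dtRank?: number;
  biasConv?: boolean;
}

function resolveConfig(config: MambaBlockConfig): Required<MambaBlockConfig> {
  return {
    dState  : 16,
    dConv   : 4,
    expand  : 2,
    biasConv: true,
    dtRank  : Math.ceil(config.dModel / 16), // default when the caller omits it
    ...config, // caller-supplied fields override the defaults above
  } as Required<MambaBlockConfig>;
}

// dModel = 256 → dInner = expand * dModel = 512, dtRank = ceil(256 / 16) = 16
const cfg = resolveConfig({ dModel: 256 });
console.log(cfg.expand * cfg.dModel, cfg.dtRank); // 512 16
```

Note the constructor additionally re-derives `this.dtRank` with `config.dtRank ?? Math.ceil(dModel / 16)`, which guards against a caller spreading in an explicit `dtRank: undefined`.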
```diff
@@ -119,23 +144,17 @@ export class MambaBlock {
       }
     }
 
-    // D: (D_inner,) – skip connection scale (initialised to 1)
     this.D_vec = ones(D);
-
-    // out_proj: (D_model, D_inner) – down-projection
     this.wOutProj = randn(dModel * D, 0.02);
     this.bOutProj = zeros(dModel);
-
-    // RMSNorm scale: (D_model,)
     this.normWeight = ones(dModel);
 
-    // Upload all to GPU
     this._uploadWeightsToGPU();
   }
 
-  _uploadWeightsToGPU() {
+  private _uploadWeightsToGPU(): void {
     const d = this.device;
-    const mk = (arr, readable = true) => createStorageBuffer(d, arr, readable);
+    const mk = (arr: Float32Array, readable = true): GPUBuffer => createStorageBuffer(d, arr, readable);
 
     this.gpuWeights = {
       wInProj   : mk(this.wInProj),
```
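The body of the `A_log` fill loop falls between the two hunks above and is elided from the diff. Judging from the removed comment ("Initialised to log(range(1, N+1)) per HiPPO theory"), it plausibly matches the standard Mamba initialisation; a CPU sketch of that assumption:

```typescript
// Plausible sketch of the elided A_log loop body: the standard Mamba init
// stores A[d][n] = n + 1 in log space, so A = -exp(A_log) yields negative
// real eigenvalues at scan time. Inferred from the removed comment, not verbatim.
function initALog(D: number, N: number): Float32Array {
  const A_log = new Float32Array(D * N);
  for (let d = 0; d < D; d++) {
    for (let n = 0; n < N; n++) {
      A_log[d * N + n] = Math.log(n + 1);
    }
  }
  return A_log;
}
```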
```diff
@@ -154,9 +173,7 @@ export class MambaBlock {
     };
   }
 
-
-
-  _buildPipelines() {
+  private _buildPipelines(): void {
     const d = this.device;
 
     this.pipelines = {
@@ -169,19 +186,7 @@ export class MambaBlock {
     };
   }
 
-
-
-  /**
-   * Run the Mamba block forward pass on GPU.
-   *
-   * @param {GPUBuffer} xBuf – input (batch * seqLen, dModel)
-   * @param {number} batch
-   * @param {number} seqLen
-   * @returns {{ output: GPUBuffer, cache: Object }}
-   *   output – (batch * seqLen, dModel)
-   *   cache  – intermediate buffers needed for backward pass
-   */
-  forward(xBuf, batch, seqLen) {
+  forward(xBuf: GPUBuffer, batch: number, seqLen: number): BlockForwardResult {
     const d = this.device;
     const { dModel, dState, dConv } = this.config;
     const D = this.dInner;
@@ -191,45 +196,37 @@ export class MambaBlock {
     const M = B * L;
     const R = this.dtRank;
 
-
-    const cache = {};
+    const cache = {} as BlockCache;
 
-    // 1. RMSNorm: (M, dModel)
     const normOut = createEmptyStorageBuffer(d, M * dModel * 4, true);
     const normInv = createEmptyStorageBuffer(d, M * 4, true);
     cache.normInv = normInv;
     cache.normIn = xBuf;
 
     {
-      // Pack params as Uint32 (num_rows, dim) + f32 (eps) ← 12 bytes padded to 16
       const params = new ArrayBuffer(16);
       new Uint32Array(params, 0, 2).set([M, dModel]);
       new Float32Array(params, 8, 1).set([1e-6]);
       const pBuf = createUniformBuffer(d, params);
 
-      const bg = createBindGroup(d, this.pipelines.rmsnorm,
-        [pBuf, xBuf, this.gpuWeights.normWeight, normOut, normInv]);
-      dispatchKernel(d, this.pipelines.rmsnorm, bg, [cdiv(M, 64), 1, 1]);
+      const bg = createBindGroup(d, this.pipelines['rmsnorm']!,
+        [pBuf, xBuf, this.gpuWeights['normWeight']!, normOut, normInv]);
+      dispatchKernel(d, this.pipelines['rmsnorm']!, bg, [cdiv(M, 64), 1, 1]);
     }
 
-    // 2. in_proj: (M, 2*D) = normOut @ wInProj^T + bInProj
     const inProjOut = createEmptyStorageBuffer(d, M * 2 * D * 4, true);
     cache.normOut = normOut;
     {
       const params = new Uint32Array([M, dModel, 2 * D]).buffer;
       const pBuf = createUniformBuffer(d, params);
-      const bg = createBindGroup(d, this.pipelines.linear,
-        [pBuf, normOut, this.gpuWeights.wInProj, this.gpuWeights.bInProj, inProjOut]);
-      dispatchKernel(d, this.pipelines.linear, bg, [cdiv(M, 16), cdiv(2 * D, 16), 1]);
+      const bg = createBindGroup(d, this.pipelines['linear']!,
+        [pBuf, normOut, this.gpuWeights['wInProj']!, this.gpuWeights['bInProj']!, inProjOut]);
+      dispatchKernel(d, this.pipelines['linear']!, bg, [cdiv(M, 16), cdiv(2 * D, 16), 1]);
     }
 
-    // Split inProjOut into x (M, D) and z (M, D) – the z-gate
-    // We reuse the same buffer with offsets since WGSL bindings can be offset.
-    // For simplicity, allocate two separate buffers and copy.
     const xConvIn = createEmptyStorageBuffer(d, M * D * 4, true);
     const zBuf = createEmptyStorageBuffer(d, M * D * 4, true);
     {
-      // Copy first D columns into xConvIn, last D columns into zBuf
       const enc = d.createCommandEncoder();
       enc.copyBufferToBuffer(inProjOut, 0, xConvIn, 0, M * D * 4);
       enc.copyBufferToBuffer(inProjOut, M * D * 4, zBuf, 0, M * D * 4);
```
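Step 1 of `forward` normalises with RMSNorm and caches a per-row inverse RMS (`normInv`) for the backward pass. A CPU reference of what the `rmsnorm` kernel computes, matching the `[M, dModel]` and `eps = 1e-6` parameters packed above (a sketch for intuition; the shipped computation is the WGSL kernel):

```typescript
// CPU reference for the RMSNorm step, shaped to match the buffers the
// kernel produces: normOut plus the cached per-row inverse RMS (normInv).
function rmsNorm(
  x: Float32Array,      // (M, dim) row-major input
  weight: Float32Array, // (dim,) learned scale
  M: number, dim: number, eps = 1e-6,
): { out: Float32Array; inv: Float32Array } {
  const out = new Float32Array(M * dim);
  const inv = new Float32Array(M);
  for (let r = 0; r < M; r++) {
    let ss = 0;
    for (let c = 0; c < dim; c++) ss += x[r * dim + c] ** 2;
    inv[r] = 1 / Math.sqrt(ss / dim + eps); // cached for the backward pass
    for (let c = 0; c < dim; c++) {
      out[r * dim + c] = x[r * dim + c] * inv[r] * weight[c];
    }
  }
  return { out, inv };
}
```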
```diff
@@ -237,39 +234,35 @@ export class MambaBlock {
     }
     cache.zBuf = zBuf;
 
-    // 3. Conv1D on xConvIn: (B, L, D) – depthwise causal conv
     const convOut = createEmptyStorageBuffer(d, M * D * 4, true);
     cache.xConvIn = xConvIn;
     {
       const params = new Uint32Array([L, D, dConv, B]).buffer;
       const pBuf = createUniformBuffer(d, params);
-      const bg = createBindGroup(d, this.pipelines.conv1d,
-        [pBuf, xConvIn, this.gpuWeights.wConv, this.gpuWeights.bConv, convOut]);
-      dispatchKernel(d, this.pipelines.conv1d, bg, [cdiv(L, 16), cdiv(D, 16), B]);
+      const bg = createBindGroup(d, this.pipelines['conv1d']!,
+        [pBuf, xConvIn, this.gpuWeights['wConv']!, this.gpuWeights['bConv']!, convOut]);
+      dispatchKernel(d, this.pipelines['conv1d']!, bg, [cdiv(L, 16), cdiv(D, 16), B]);
     }
 
-    // 4. SiLU(convOut) in-place
     const siluOut = createEmptyStorageBuffer(d, M * D * 4, true);
     cache.convOut = convOut;
     {
       const params = new Uint32Array([M * D]).buffer;
       const pBuf = createUniformBuffer(d, params);
-      const bg = createBindGroup(d, this.pipelines.silu,
+      const bg = createBindGroup(d, this.pipelines['silu']!,
         [pBuf, convOut, siluOut]);
-      dispatchKernel(d, this.pipelines.silu, bg, [cdiv(M * D, 256), 1, 1]);
+      dispatchKernel(d, this.pipelines['silu']!, bg, [cdiv(M * D, 256), 1, 1]);
     }
 
-    // 5. x_proj: (M, R+2N) = siluOut @ wXProj^T + bXProj
     const xProjOut = createEmptyStorageBuffer(d, M * (R + 2 * N) * 4, true);
     {
       const params = new Uint32Array([M, D, R + 2 * N]).buffer;
       const pBuf = createUniformBuffer(d, params);
-      const bg = createBindGroup(d, this.pipelines.linear,
-        [pBuf, siluOut, this.gpuWeights.wXProj, this.gpuWeights.bXProj, xProjOut]);
-      dispatchKernel(d, this.pipelines.linear, bg, [cdiv(M, 16), cdiv(R + 2 * N, 16), 1]);
+      const bg = createBindGroup(d, this.pipelines['linear']!,
+        [pBuf, siluOut, this.gpuWeights['wXProj']!, this.gpuWeights['bXProj']!, xProjOut]);
+      dispatchKernel(d, this.pipelines['linear']!, bg, [cdiv(M, 16), cdiv(R + 2 * N, 16), 1]);
     }
 
-    // Split xProjOut → dtRaw (M, R), B_raw (M*N flattened) = (B, L, N), C_raw (B, L, N)
     const dtRaw = createEmptyStorageBuffer(d, M * R * 4, true);
     const B_raw = createEmptyStorageBuffer(d, B * L * N * 4, true);
     const C_raw = createEmptyStorageBuffer(d, B * L * N * 4, true);
```
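Steps 3–4 run a depthwise causal Conv1D followed by SiLU over the `(B, L, D)` activations. A CPU reference sketch; the row-major layout and left-only padding are inferred from the dispatch parameters `[L, D, dConv, B]` rather than stated in the diff:

```typescript
// CPU reference for steps 3–4: depthwise causal Conv1D, then SiLU.
// Layout assumption: x is (B, L, D) row-major, one K-tap kernel per channel.
const silu = (v: number): number => v / (1 + Math.exp(-v));

function causalDepthwiseConv1d(
  x: Float32Array,    // (B, L, D)
  w: Float32Array,    // (D, K)
  bias: Float32Array, // (D,)
  B: number, L: number, D: number, K: number,
): Float32Array {
  const y = new Float32Array(B * L * D);
  for (let b = 0; b < B; b++)
    for (let t = 0; t < L; t++)
      for (let c = 0; c < D; c++) {
        let acc = bias[c];
        for (let k = 0; k < K; k++) {
          const src = t - (K - 1) + k; // causal: only past/current timesteps
          if (src >= 0) acc += w[c * K + k] * x[(b * L + src) * D + c];
        }
        y[(b * L + t) * D + c] = silu(acc); // step 4: SiLU activation
      }
  return y;
}
```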
```diff
@@ -281,18 +274,15 @@ export class MambaBlock {
       d.queue.submit([enc.finish()]);
     }
 
-    // 6. dt_proj: (M, D) = dtRaw @ wDtProj^T + bDtProj
     const deltaFull = createEmptyStorageBuffer(d, M * D * 4, true);
     {
       const params = new Uint32Array([M, R, D]).buffer;
       const pBuf = createUniformBuffer(d, params);
-      const bg = createBindGroup(d, this.pipelines.linear,
-        [pBuf, dtRaw, this.gpuWeights.wDtProj, this.gpuWeights.bDtProj, deltaFull]);
-      dispatchKernel(d, this.pipelines.linear, bg, [cdiv(M, 16), cdiv(D, 16), 1]);
+      const bg = createBindGroup(d, this.pipelines['linear']!,
+        [pBuf, dtRaw, this.gpuWeights['wDtProj']!, this.gpuWeights['bDtProj']!, deltaFull]);
+      dispatchKernel(d, this.pipelines['linear']!, bg, [cdiv(M, 16), cdiv(D, 16), 1]);
     }
 
-    // 7. Selective Scan
-    // Allocate y (B, L, D) and h_cache (2 * B*L*D*N) – first half for h, second for y_partial
     const scanY = createEmptyStorageBuffer(d, B * L * D * 4, true);
     const hCache = createEmptyStorageBuffer(d, 2 * B * L * D * N * 4, true);
     cache.siluOut = siluOut;
```
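The `scanY` and `hCache` buffers allocated here feed the two scan kernels dispatched in the next hunk. For intuition, the S6 recurrence those kernels implement can be written per `(batch, channel)` pair on the CPU; the softplus discretisation below is assumed from the Mamba paper, and the WGSL kernels may differ in detail:

```typescript
// CPU reference for the selective-scan (S6) recurrence, one (batch, channel)
// pair at a time. Assumes softplus(Δ) with zero-order-hold discretisation.
function selectiveScanChannel(
  x: Float32Array,     // (L,)  post-SiLU conv output for this channel
  delta: Float32Array, // (L,)  dt_proj output (pre-softplus)
  A: Float32Array,     // (N,)  A = -exp(A_log[d]) row for this channel
  Bm: Float32Array,    // (L, N) input-dependent B
  Cm: Float32Array,    // (L, N) input-dependent C
  Dskip: number,       // skip scale D_vec[d]
  L: number, N: number,
): Float32Array {
  const h = new Float32Array(N); // hidden state carried across timesteps
  const y = new Float32Array(L);
  for (let t = 0; t < L; t++) {
    const dt = Math.log(1 + Math.exp(delta[t])); // softplus(Δ)
    let yt = 0;
    for (let n = 0; n < N; n++) {
      // h_t = exp(Δ·A)·h_{t-1} + Δ·B_t·x_t ;  y_t += C_t·h_t
      h[n] = Math.exp(dt * A[n]) * h[n] + dt * Bm[t * N + n] * x[t];
      yt += Cm[t * N + n] * h[n];
    }
    y[t] = yt + Dskip * x[t]; // D skip connection
  }
  return y;
}
```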
```diff
@@ -305,34 +295,28 @@ export class MambaBlock {
       const params = new Uint32Array([L, N, D, B]).buffer;
       const pBuf = createUniformBuffer(d, params);
 
-
-
-
-
-      dispatchKernel(d, this.pipelines.scan_fwd, bg,
+      const bg = createBindGroup(d, this.pipelines['scan_fwd']!,
+        [pBuf, siluOut, deltaFull, this.gpuWeights['A_log']!, B_raw, C_raw,
+         this.gpuWeights['D_vec']!, scanY, hCache]);
+      dispatchKernel(d, this.pipelines['scan_fwd']!, bg,
         [cdiv(D, 8), cdiv(N, 8), B]);
 
-
-
-
-
-      dispatchKernel(d, this.pipelines.scan_reduce, bg2,
+      const bg2 = createBindGroup(d, this.pipelines['scan_reduce']!,
+        [pBuf, siluOut, deltaFull, this.gpuWeights['A_log']!, B_raw, C_raw,
+         this.gpuWeights['D_vec']!, scanY, hCache]);
+      dispatchKernel(d, this.pipelines['scan_reduce']!, bg2,
         [cdiv(L, 64), D, B]);
     }
 
-    // 8. Gate: scanY *= SiLU(zBuf) – element-wise product
     const siluZ = createEmptyStorageBuffer(d, M * D * 4, true);
     const gatedOut = createEmptyStorageBuffer(d, M * D * 4, true);
     {
-      // SiLU(z)
       const params = new Uint32Array([M * D]).buffer;
       const pBuf = createUniformBuffer(d, params);
-      const bg = createBindGroup(d, this.pipelines.silu,
+      const bg = createBindGroup(d, this.pipelines['silu']!,
         [pBuf, zBuf, siluZ]);
-      dispatchKernel(d, this.pipelines.silu, bg, [cdiv(M * D, 256), 1, 1]);
+      dispatchKernel(d, this.pipelines['silu']!, bg, [cdiv(M * D, 256), 1, 1]);
 
-      // Element-wise multiply scanY * siluZ → gatedOut
-      // We encode this as a trivial compute pass using a small inline shader.
       const mulShader = /* wgsl */`
       @group(0) @binding(0) var<storage, read> a : array<f32>;
       @group(0) @binding(1) var<storage, read> b : array<f32>;
```
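Step 8's gate is just an element-wise `scanY ⊙ SiLU(z)`, which the block implements with the small inline WGSL shader above. The CPU equivalent, for reference:

```typescript
// CPU reference for step 8, the gate: output = scanY ⊙ SiLU(z).
// The package runs this as the inline WGSL multiply pass shown above.
function gate(scanY: Float32Array, z: Float32Array): Float32Array {
  const out = new Float32Array(scanY.length);
  for (let i = 0; i < out.length; i++) {
    out[i] = scanY[i] * (z[i] / (1 + Math.exp(-z[i]))); // SiLU(z) · y
  }
  return out;
}
```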
```diff
@@ -351,17 +335,15 @@ export class MambaBlock {
       dispatchKernel(d, mulPipeline, bgMul, [cdiv(M * D, 256), 1, 1]);
     }
 
-    // 9. out_proj: (M, dModel) = gatedOut @ wOutProj^T + bOutProj
     const outProjOut = createEmptyStorageBuffer(d, M * dModel * 4, true);
     {
       const params = new Uint32Array([M, D, dModel]).buffer;
       const pBuf = createUniformBuffer(d, params);
-      const bg = createBindGroup(d, this.pipelines.linear,
-        [pBuf, gatedOut, this.gpuWeights.wOutProj, this.gpuWeights.bOutProj, outProjOut]);
-      dispatchKernel(d, this.pipelines.linear, bg, [cdiv(M, 16), cdiv(dModel, 16), 1]);
+      const bg = createBindGroup(d, this.pipelines['linear']!,
+        [pBuf, gatedOut, this.gpuWeights['wOutProj']!, this.gpuWeights['bOutProj']!, outProjOut]);
+      dispatchKernel(d, this.pipelines['linear']!, bg, [cdiv(M, 16), cdiv(dModel, 16), 1]);
     }
 
-    // 10. Residual add: output = outProjOut + x
     const output = createEmptyStorageBuffer(d, M * dModel * 4, true);
     {
       const addShader = /* wgsl */`
@@ -385,11 +367,7 @@ export class MambaBlock {
     return { output, cache };
   }
 
-  /**
-   * Return a list of all parameter GPU buffers (for the optimizer).
-   * @returns {Array<{buf: GPUBuffer, numel: number, name: string}>}
-   */
-  parameters() {
+  parameters(): BlockParam[] {
     const { dModel, dState, dConv } = this.config;
     const D = this.dInner;
     const N = dState;
@@ -397,45 +375,31 @@ export class MambaBlock {
     const R = this.dtRank;
 
     return [
-      { buf: this.gpuWeights.wInProj, numel: 2 * D * dModel, name: 'wInProj' },
-      { buf: this.gpuWeights.bInProj, numel: 2 * D, name: 'bInProj' },
-      { buf: this.gpuWeights.wConv, numel: D * K, name: 'wConv' },
-      { buf: this.gpuWeights.bConv, numel: D, name: 'bConv' },
-      { buf: this.gpuWeights.wXProj, numel: (R + 2*N) * D, name: 'wXProj' },
-      { buf: this.gpuWeights.bXProj, numel: R + 2 * N, name: 'bXProj' },
-      { buf: this.gpuWeights.wDtProj, numel: D * R, name: 'wDtProj' },
-      { buf: this.gpuWeights.bDtProj, numel: D, name: 'bDtProj' },
-      { buf: this.gpuWeights.A_log, numel: D * N, name: 'A_log' },
-      { buf: this.gpuWeights.D_vec, numel: D, name: 'D_vec' },
-      { buf: this.gpuWeights.wOutProj, numel: dModel * D, name: 'wOutProj' },
-      { buf: this.gpuWeights.bOutProj, numel: dModel, name: 'bOutProj' },
-      { buf: this.gpuWeights.normWeight, numel: dModel, name: 'normWeight'},
+      { buf: this.gpuWeights['wInProj']!, numel: 2 * D * dModel, name: 'wInProj' },
+      { buf: this.gpuWeights['bInProj']!, numel: 2 * D, name: 'bInProj' },
+      { buf: this.gpuWeights['wConv']!, numel: D * K, name: 'wConv' },
+      { buf: this.gpuWeights['bConv']!, numel: D, name: 'bConv' },
+      { buf: this.gpuWeights['wXProj']!, numel: (R + 2*N) * D, name: 'wXProj' },
+      { buf: this.gpuWeights['bXProj']!, numel: R + 2 * N, name: 'bXProj' },
+      { buf: this.gpuWeights['wDtProj']!, numel: D * R, name: 'wDtProj' },
+      { buf: this.gpuWeights['bDtProj']!, numel: D, name: 'bDtProj' },
+      { buf: this.gpuWeights['A_log']!, numel: D * N, name: 'A_log' },
+      { buf: this.gpuWeights['D_vec']!, numel: D, name: 'D_vec' },
+      { buf: this.gpuWeights['wOutProj']!, numel: dModel * D, name: 'wOutProj' },
+      { buf: this.gpuWeights['bOutProj']!, numel: dModel, name: 'bOutProj' },
+      { buf: this.gpuWeights['normWeight']!, numel: dModel, name: 'normWeight'},
     ];
   }
 
-  /**
-   * WSLA (Weight-Selective Local Adaptation) mode.
-   * Freezes all parameters except the B and C matrices (wXProj slice).
-   * This allows rapid local adaptation with minimal compute.
-   *
-   * @param {boolean} enabled
-   */
-  setWSLAMode(enabled) {
+  setWSLAMode(enabled: boolean): void {
     this._wslaMode = enabled;
-    // Mark which parameters receive gradients
-    // (The trainer checks this.getTrainableParams() during backward)
   }
 
-  /**
-   * Returns only the trainable parameters under WSLA mode.
-   * @returns {Array<{buf: GPUBuffer, numel: number, name: string}>}
-   */
-  getTrainableParams() {
+  getTrainableParams(): BlockParam[] {
     if (this._wslaMode) {
-      // Only B and C portions of wXProj
       return [
-        { buf: this.gpuWeights.wXProj, numel: this.wXProj.length, name: 'wXProj' },
-        { buf: this.gpuWeights.bXProj, numel: this.bXProj.length, name: 'bXProj' },
+        { buf: this.gpuWeights['wXProj']!, numel: this.wXProj.length, name: 'wXProj' },
+        { buf: this.gpuWeights['bXProj']!, numel: this.bXProj.length, name: 'bXProj' },
       ];
     }
     return this.parameters();
```