npm - @seanhogg/builderforce-memory-engine - Versions diffs - 2026.6.18 - Mend

@seanhogg/builderforce-memory-engine 2026.6.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (113) hide show

package/LICENSE +21 -0
package/README.md +393 -0
package/dist/index.d.ts +32 -0
package/dist/index.d.ts.map +1 -0
package/dist/index.js +40 -0
package/dist/index.js.map +1 -0
package/dist/kernels/activations.d.ts +5 -0
package/dist/kernels/activations.d.ts.map +1 -0
package/dist/kernels/activations.js +171 -0
package/dist/kernels/activations.js.map +1 -0
package/dist/kernels/attention.d.ts +19 -0
package/dist/kernels/attention.d.ts.map +1 -0
package/dist/kernels/attention.js +263 -0
package/dist/kernels/attention.js.map +1 -0
package/dist/kernels/complex_ssd.d.ts +33 -0
package/dist/kernels/complex_ssd.d.ts.map +1 -0
package/dist/kernels/complex_ssd.js +305 -0
package/dist/kernels/complex_ssd.js.map +1 -0
package/dist/kernels/conv1d.d.ts +3 -0
package/dist/kernels/conv1d.d.ts.map +1 -0
package/dist/kernels/conv1d.js +158 -0
package/dist/kernels/conv1d.js.map +1 -0
package/dist/kernels/linear_projection.d.ts +3 -0
package/dist/kernels/linear_projection.d.ts.map +1 -0
package/dist/kernels/linear_projection.js +219 -0
package/dist/kernels/linear_projection.js.map +1 -0
package/dist/kernels/selective_scan.d.ts +3 -0
package/dist/kernels/selective_scan.d.ts.map +1 -0
package/dist/kernels/selective_scan.js +348 -0
package/dist/kernels/selective_scan.js.map +1 -0
package/dist/kernels/ssd.d.ts +29 -0
package/dist/kernels/ssd.d.ts.map +1 -0
package/dist/kernels/ssd.js +276 -0
package/dist/kernels/ssd.js.map +1 -0
package/dist/kernels/weight_update.d.ts +3 -0
package/dist/kernels/weight_update.d.ts.map +1 -0
package/dist/kernels/weight_update.js +119 -0
package/dist/kernels/weight_update.js.map +1 -0
package/dist/model/attention_block.d.ts +48 -0
package/dist/model/attention_block.d.ts.map +1 -0
package/dist/model/attention_block.js +262 -0
package/dist/model/attention_block.js.map +1 -0
package/dist/model/mamba1_block.d.ts +70 -0
package/dist/model/mamba1_block.d.ts.map +1 -0
package/dist/model/mamba1_block.js +333 -0
package/dist/model/mamba1_block.js.map +1 -0
package/dist/model/mamba2_block.d.ts +44 -0
package/dist/model/mamba2_block.d.ts.map +1 -0
package/dist/model/mamba2_block.js +252 -0
package/dist/model/mamba2_block.js.map +1 -0
package/dist/model/mamba3_block.d.ts +51 -0
package/dist/model/mamba3_block.d.ts.map +1 -0
package/dist/model/mamba3_block.js +270 -0
package/dist/model/mamba3_block.js.map +1 -0
package/dist/model/mamba_block.d.ts +64 -0
package/dist/model/mamba_block.d.ts.map +1 -0
package/dist/model/mamba_block.js +303 -0
package/dist/model/mamba_block.js.map +1 -0
package/dist/model/mamba_model.d.ts +140 -0
package/dist/model/mamba_model.d.ts.map +1 -0
package/dist/model/mamba_model.js +527 -0
package/dist/model/mamba_model.js.map +1 -0
package/dist/model/sequence_layer.d.ts +25 -0
package/dist/model/sequence_layer.d.ts.map +1 -0
package/dist/model/sequence_layer.js +8 -0
package/dist/model/sequence_layer.js.map +1 -0
package/dist/tokenizer/bpe.d.ts +29 -0
package/dist/tokenizer/bpe.d.ts.map +1 -0
package/dist/tokenizer/bpe.js +164 -0
package/dist/tokenizer/bpe.js.map +1 -0
package/dist/training/autograd.d.ts +27 -0
package/dist/training/autograd.d.ts.map +1 -0
package/dist/training/autograd.js +120 -0
package/dist/training/autograd.js.map +1 -0
package/dist/training/trainer.d.ts +36 -0
package/dist/training/trainer.d.ts.map +1 -0
package/dist/training/trainer.js +183 -0
package/dist/training/trainer.js.map +1 -0
package/dist/utils/gpu_utils.d.ts +21 -0
package/dist/utils/gpu_utils.d.ts.map +1 -0
package/dist/utils/gpu_utils.js +111 -0
package/dist/utils/gpu_utils.js.map +1 -0
package/dist/utils/quantization.d.ts +26 -0
package/dist/utils/quantization.d.ts.map +1 -0
package/dist/utils/quantization.js +116 -0
package/dist/utils/quantization.js.map +1 -0
package/dist/utils/rng.d.ts +36 -0
package/dist/utils/rng.d.ts.map +1 -0
package/dist/utils/rng.js +61 -0
package/dist/utils/rng.js.map +1 -0
package/package.json +99 -0
package/src/index.ts +114 -0
package/src/kernels/activations.ts +174 -0
package/src/kernels/attention.ts +268 -0
package/src/kernels/complex_ssd.ts +307 -0
package/src/kernels/conv1d.ts +159 -0
package/src/kernels/linear_projection.ts +220 -0
package/src/kernels/selective_scan.ts +350 -0
package/src/kernels/ssd.ts +278 -0
package/src/kernels/weight_update.ts +120 -0
package/src/model/attention_block.ts +344 -0
package/src/model/mamba1_block.ts +437 -0
package/src/model/mamba2_block.ts +319 -0
package/src/model/mamba3_block.ts +335 -0
package/src/model/mamba_block.ts +401 -0
package/src/model/mamba_model.ts +678 -0
package/src/model/sequence_layer.ts +29 -0
package/src/tokenizer/bpe.ts +186 -0
package/src/training/autograd.ts +135 -0
package/src/training/trainer.ts +309 -0
package/src/utils/gpu_utils.ts +147 -0
package/src/utils/quantization.ts +154 -0
package/src/utils/rng.ts +65 -0

package/src/kernels/weight_update.ts ADDED Viewed

@@ -0,0 +1,120 @@
+// Weight Update WGSL Kernel (AdamW Optimizer)
+// Fused AdamW step: each invocation of `adamw_update` updates param[i], m_state[i] and v_state[i] in place for one index i < num_elements.
+// The host supplies beta1_t = beta1^t and beta2_t = beta2^t; both must be < 1 (i.e. t >= 1), otherwise the bias correction divides by zero.
+// AdamW update rule:
+//   m_t = beta1 * m_{t-1} + (1 - beta1) * g_t
+//   v_t = beta2 * v_{t-1} + (1 - beta2) * g_t^2
+//   m_hat = m_t / (1 - beta1^t)
+//   v_hat = v_t / (1 - beta2^t)
+//   theta_t = theta_{t-1} * (1 - lr * weight_decay) - lr * m_hat / (sqrt(v_hat) + eps)
+export const WEIGHT_UPDATE_WGSL: string = /* wgsl */`
+struct AdamParams {
+    num_elements   : u32,
+    lr             : f32,   // learning rate
+    beta1          : f32,   // default 0.9
+    beta2          : f32,   // default 0.999
+    eps            : f32,   // default 1e-8
+    weight_decay   : f32,   // default 0.01
+    beta1_t        : f32,   // beta1^t  (precomputed bias correction term)
+    beta2_t        : f32,   // beta2^t
+};
+@group(0) @binding(0) var<uniform>             adam     : AdamParams;
+// param (N,)   – weight tensor (read-write: updated in-place)
+@group(0) @binding(1) var<storage, read_write> param    : array<f32>;
+// grad  (N,)   – gradient
+@group(0) @binding(2) var<storage, read>       grad     : array<f32>;
+// m     (N,)   – first moment
+@group(0) @binding(3) var<storage, read_write> m_state  : array<f32>;
+// v     (N,)   – second moment
+@group(0) @binding(4) var<storage, read_write> v_state  : array<f32>;
+// Dispatch: (ceil(N / 256), 1, 1)
+@compute @workgroup_size(256, 1, 1)
+fn adamw_update(
+    @builtin(global_invocation_id) gid : vec3<u32>,
+) {
+    let i = gid.x;
+    if (i >= adam.num_elements) { return; }
+    let g = grad[i];
+    let p = param[i];
+    // Moment updates
+    let m_new = adam.beta1 * m_state[i] + (1.0 - adam.beta1) * g;
+    let v_new = adam.beta2 * v_state[i] + (1.0 - adam.beta2) * g * g;
+    m_state[i] = m_new;
+    v_state[i] = v_new;
+    // Bias-corrected estimates
+    let m_hat = m_new / (1.0 - adam.beta1_t);
+    let v_hat = v_new / (1.0 - adam.beta2_t);
+    // Weight decay (decoupled) + gradient step
+    param[i] = p * (1.0 - adam.lr * adam.weight_decay) -
+               adam.lr * m_hat / (sqrt(v_hat) + adam.eps);
+}
+`;
+// Gradient clipping kernels – rescale gradients so their global L2 norm does not exceed max_norm.
+// Run before weight updates.  Two passes over the same bindings:
+//   1. `grad_norm_reduce` – adds the sum of squares of `grad` into norm_sq[0].
+//   2. `grad_clip_scale`  – multiplies every gradient by max_norm / norm when norm > max_norm.
+// norm_sq is a 4-byte buffer holding an f32 bit pattern.  It is declared as atomic<u32> so that
+// several workgroups can add their partial sums safely (WGSL has no f32 atomics, so the add is a
+// compare-exchange loop).  The kernels only ever add to norm_sq[0], so the host is expected to
+// zero it before pass 1.
+export const GRAD_CLIP_WGSL: string = /* wgsl */`
+struct ClipParams {
+    num_elements : u32,
+    max_norm_sq  : f32,   // max_norm^2
+};
+@group(0) @binding(0) var<uniform>             clip_p  : ClipParams;
+@group(0) @binding(1) var<storage, read_write> grad    : array<f32>;
+// size 1 – bit pattern of the f32 running sum of squares, updated atomically
+@group(0) @binding(2) var<storage, read_write> norm_sq : array<atomic<u32>>;
+var<workgroup> local_sq : array<f32, 256>;
+// Atomically performs norm_sq[0] += partial on the f32 value stored in the u32 slot.
+fn accumulate_norm_sq(partial : f32) {
+    var expected = atomicLoad(&norm_sq[0]);
+    loop {
+        let desired = bitcast<u32>(bitcast<f32>(expected) + partial);
+        let res = atomicCompareExchangeWeak(&norm_sq[0], expected, desired);
+        if (res.exchanged) { break; }
+        // Another workgroup won the race (or the weak exchange failed spuriously): retry.
+        expected = res.old_value;
+    }
+}
+// Pass 1: reduce sum of squares into norm_sq[0]
+// Dispatch: (ceil(N / 256), 1, 1)
+@compute @workgroup_size(256, 1, 1)
+fn grad_norm_reduce(
+    @builtin(global_invocation_id)   gid : vec3<u32>,
+    @builtin(local_invocation_index) lid : u32,
+) {
+    let i = gid.x;
+    // Out-of-range invocations contribute 0 and still reach every barrier below.
+    local_sq[lid] = 0.0;
+    if (i < clip_p.num_elements) {
+        local_sq[lid] = grad[i] * grad[i];
+    }
+    workgroupBarrier();
+    // Parallel reduction within workgroup
+    var s: u32 = 128u;
+    loop {
+        if (s == 0u) { break; }
+        if (lid < s) {
+            local_sq[lid] = local_sq[lid] + local_sq[lid + s];
+        }
+        workgroupBarrier();
+        s = s >> 1u;
+    }
+    if (lid == 0u) {
+        // One atomic add per workgroup; safe for any number of workgroups.
+        accumulate_norm_sq(local_sq[0]);
+    }
+}
+// Pass 2: scale gradients if norm exceeds max_norm
+// Dispatch: (ceil(N / 256), 1, 1)
+@compute @workgroup_size(256, 1, 1)
+fn grad_clip_scale(
+    @builtin(global_invocation_id) gid : vec3<u32>,
+) {
+    let i = gid.x;
+    if (i >= clip_p.num_elements) { return; }
+    let ns = bitcast<f32>(atomicLoad(&norm_sq[0]));
+    if (ns > clip_p.max_norm_sq) {
+        // sqrt(max_norm^2 / norm^2) = max_norm / norm
+        let scale = sqrt(clip_p.max_norm_sq / ns);
+        grad[i] = grad[i] * scale;
+    }
+}
+`;

package/src/model/attention_block.ts ADDED Viewed

@@ -0,0 +1,344 @@
+/**
+ * attention_block.ts – Causal Multi-Head Self-Attention Block.
+ *
+ * Intentionally simple for WebGPU — naive O(L²) tiled attention,
+ * no Flash-Attention dependency. Suitable for hybrid (Jamba/Zamba) schedules
+ * where a few attention layers interleave with SSM layers.
+ *
+ * Data flow:
+ *   Input (B, L, D_model)
+ *     └─ RMSNorm
+ *     └─ wQKV → Q (B,L,H,dh), K (B,L,H,dh), V (B,L,H,dh)
+ *     └─ causal attention scores = Q·Kᵀ / √dh  (masked)
+ *     └─ softmax
+ *     └─ weighted V sum
+ *     └─ concat heads → wO → D_model
+ *     └─ + residual
+ *   [optional FFN sublayer]
+ *
+ * Implements SequenceLayer.
+ */
+import {
+    createComputePipeline,
+    createBindGroup,
+    createStorageBuffer,
+    createEmptyStorageBuffer,
+    createUniformBuffer,
+    dispatchKernel,
+    cdiv,
+} from '../utils/gpu_utils.js';
+import {
+    ATTENTION_FORWARD_WGSL,
+    SOFTMAX_WGSL,
+} from '../kernels/attention.js';
+import { LINEAR_FORWARD_WGSL } from '../kernels/linear_projection.js';
+import { gaussianArray } from '../utils/rng.js';
+import { ACTIVATIONS_WGSL }    from '../kernels/activations.js';
+import type { SequenceLayer, LayerForwardResult, LayerParam } from './sequence_layer.js';
+export interface AttentionBlockConfig {
+    dModel  : number;   // model width D; the AttentionBlock constructor throws unless it is divisible by nHeads
+    nHeads  : number;   // number of attention heads H
+    dHead?  : number;   // per-head width; default dModel / nHeads
+    hasFfn? : boolean;  // add the FFN sublayer after attention (default false)
+    ffnMult?: number;   // FFN hidden width = dModel * ffnMult (default 4)
+}
+export interface AttentionCache {
+    scores: GPUBuffer;  // attention scores after the softmax pass (B*H*L*L f32 values), kept for backward; forward() does not destroy it
+}
+/** WGSL kernel `main`: c[i] = a[i] + b[i] for every i < n, one invocation per element (workgroup size 256). */ const ADD_SHADER = /* wgsl */`
+@group(0) @binding(0) var<storage, read>       a : array<f32>;
+@group(0) @binding(1) var<storage, read>       b : array<f32>;
+@group(0) @binding(2) var<storage, read_write> c : array<f32>;
+@group(0) @binding(3) var<uniform>             n : u32;
+@compute @workgroup_size(256)
+fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let i = gid.x;
+    if (i < n) { c[i] = a[i] + b[i]; }
+}
+`;
+// SiLU for FFN: y[i] = x[i] * sigmoid(x[i]) = x[i] / (1 + exp(-x[i])), one invocation per element.
+// Struct members are comma-separated, as current WGSL requires (a `;` after a member is rejected
+// by the shader compiler) and as the other shaders in this package already do.
+const SILU_SHADER = /* wgsl */`
+struct ActParams { num_elements: u32, };
+@group(0) @binding(0) var<uniform>             p : ActParams;
+@group(0) @binding(1) var<storage, read>       x : array<f32>;
+@group(0) @binding(2) var<storage, read_write> y : array<f32>;
+@compute @workgroup_size(256, 1, 1)
+fn silu_forward(@builtin(global_invocation_id) gid: vec3<u32>) {
+    let i = gid.x;
+    if (i >= p.num_elements) { return; }
+    let v = x[i];
+    y[i] = v / (1.0 + exp(-v));
+}
+`;
+/**
+ * Causal multi-head self-attention layer with an optional FFN sublayer.
+ *
+ * Owns its weight buffers (`gpuWeights`) and compute pipelines. `forward()` allocates
+ * its intermediates per call and destroys them as soon as the kernel consuming them
+ * has been dispatched; only the returned output buffer and the cached score buffer
+ * outlive the call and are left for the caller to release.
+ *
+ * NOTE(review): buffers are destroyed directly after `dispatchKernel`, which presumes
+ * that helper submits its work before returning — confirm against gpu_utils.
+ */
+export class AttentionBlock implements SequenceLayer {
+    readonly layerType = 'attention' as const;
+    device : GPUDevice;
+    config : Required<AttentionBlockConfig>;
+    dHead  : number;
+    gpuWeights: Record<string, GPUBuffer>;
+    pipelines : Record<string, GPUComputePipeline>;
+    /**
+     * @param device GPU device used for every buffer and pipeline of this layer.
+     * @param config Layer dimensions; omitted or `undefined` optional fields take their defaults.
+     * @throws Error when `dModel` is not divisible by `nHeads`.
+     */
+    constructor(device: GPUDevice, config: AttentionBlockConfig) {
+        this.device = device;
+        if (config.dModel % config.nHeads !== 0) {
+            throw new Error(
+                `AttentionBlock: dModel (${config.dModel}) must be divisible by nHeads (${config.nHeads}).`
+            );
+        }
+        // `??` (rather than spreading `config` over a defaults object) keeps the default
+        // when a caller passes an optional field explicitly as `undefined`.
+        this.config = {
+            dModel : config.dModel,
+            nHeads : config.nHeads,
+            dHead  : config.dHead ?? config.dModel / config.nHeads,
+            hasFfn : config.hasFfn ?? false,
+            ffnMult: config.ffnMult ?? 4,
+        };
+        this.dHead = this.config.dHead;
+        this.gpuWeights = {};
+        this.pipelines  = {};
+        this._initWeights();
+        this._buildPipelines();
+    }
+    /** Allocates the weight buffers: Gaussian (std 0.02) matrices, zero biases, unit norm scale. */
+    private _initWeights(): void {
+        const { dModel, hasFfn, ffnMult } = this.config;
+        const randn = (n: number, std = 0.02): Float32Array => gaussianArray(n, std);
+        const zeros = (n: number) => new Float32Array(n);
+        const ones  = (n: number) => new Float32Array(n).fill(1.0);
+        const mk    = (arr: Float32Array) => createStorageBuffer(this.device, arr, true);
+        this.gpuWeights = {
+            wQKV      : mk(randn(3 * dModel * dModel)),
+            bQKV      : mk(zeros(3 * dModel)),
+            wO        : mk(randn(dModel * dModel)),
+            bO        : mk(zeros(dModel)),
+            normWeight: mk(ones(dModel)),
+        };
+        if (hasFfn) {
+            const ffnDim = dModel * ffnMult;
+            this.gpuWeights['wFfn1'] = mk(randn(ffnDim * dModel));
+            this.gpuWeights['bFfn1'] = mk(zeros(ffnDim));
+            this.gpuWeights['wFfn2'] = mk(randn(dModel * ffnDim));
+            this.gpuWeights['bFfn2'] = mk(zeros(dModel));
+        }
+    }
+    /** Compiles one compute pipeline per kernel entry point; the SiLU pipeline only when the FFN is enabled. */
+    private _buildPipelines(): void {
+        const d = this.device;
+        this.pipelines = {
+            linear  : createComputePipeline(d, LINEAR_FORWARD_WGSL,     'linear_forward'),
+            rmsnorm : createComputePipeline(d, ACTIVATIONS_WGSL,        'rmsnorm_forward'),
+            attn_fwd: createComputePipeline(d, ATTENTION_FORWARD_WGSL,  'attention_forward'),
+            attn_val: createComputePipeline(d, ATTENTION_FORWARD_WGSL,  'attention_value'),
+            softmax : createComputePipeline(d, SOFTMAX_WGSL,            'softmax_forward'),
+            elAdd   : createComputePipeline(d, ADD_SHADER,              'main'),
+        };
+        if (this.config.hasFfn) {
+            this.pipelines['silu'] = createComputePipeline(d, SILU_SHADER, 'silu_forward');
+        }
+    }
+    /**
+     * Dispatches the linear kernel with params [rows, inDim, outDim] and returns a freshly
+     * allocated output buffer of rows * outDim f32 values. The caller owns the result.
+     */
+    private _linear(
+        input: GPUBuffer,
+        weightName: string,
+        biasName: string,
+        rows: number,
+        inDim: number,
+        outDim: number,
+    ): GPUBuffer {
+        const d = this.device;
+        const pipeline = this.pipelines['linear']!;
+        const out  = createEmptyStorageBuffer(d, rows * outDim * 4, true);
+        const pBuf = createUniformBuffer(d, new Uint32Array([rows, inDim, outDim]).buffer);
+        const bg   = createBindGroup(d, pipeline,
+            [pBuf, input, this.gpuWeights[weightName]!, this.gpuWeights[biasName]!, out]);
+        dispatchKernel(d, pipeline, bg, [cdiv(rows, 16), cdiv(outDim, 16), 1]);
+        pBuf.destroy();
+        return out;
+    }
+    /** Dispatches the element-wise add kernel over `count` floats and returns the new sum buffer (caller-owned). */
+    private _add(a: GPUBuffer, b: GPUBuffer, count: number): GPUBuffer {
+        const d = this.device;
+        const pipeline = this.pipelines['elAdd']!;
+        const out  = createEmptyStorageBuffer(d, count * 4, true);
+        const nBuf = createUniformBuffer(d, new Uint32Array([count]).buffer);
+        const bg   = createBindGroup(d, pipeline, [a, b, out, nBuf]);
+        dispatchKernel(d, pipeline, bg, [cdiv(count, 256), 1, 1]);
+        nBuf.destroy();
+        return out;
+    }
+    /**
+     * Runs the block on `xBuf`, treated as batch * seqLen rows of dModel f32 values.
+     *
+     * @param xBuf   Input activations; read only, never destroyed here.
+     * @param batch  Batch size B.
+     * @param seqLen Sequence length L.
+     * @returns `output` (new buffer, batch * seqLen * dModel f32) and a cache holding the
+     *          score buffer (B * H * L * L f32). Both are owned by the caller.
+     */
+    forward(xBuf: GPUBuffer, batch: number, seqLen: number): LayerForwardResult {
+        const d = this.device;
+        const { dModel, nHeads, hasFfn, ffnMult } = this.config;
+        const dh = this.dHead;
+        const B  = batch;
+        const L  = seqLen;
+        const M  = B * L;
+        const H  = nHeads;
+        const actBytes = M * dModel * 4;   // byte size of one [M, dModel] f32 activation
+        // 1. Pre-block RMSNorm
+        const normOut = createEmptyStorageBuffer(d, actBytes, true);
+        const normInv = createEmptyStorageBuffer(d, M * 4, true);
+        {
+            // Uniform layout: u32 rows, u32 dModel, f32 eps (16-byte buffer, last 4 bytes unused).
+            const params = new ArrayBuffer(16);
+            new Uint32Array(params, 0, 2).set([M, dModel]);
+            new Float32Array(params, 8, 1).set([1e-6]);
+            const pBuf = createUniformBuffer(d, params);
+            const pipeline = this.pipelines['rmsnorm']!;
+            const bg = createBindGroup(d, pipeline,
+                [pBuf, xBuf, this.gpuWeights['normWeight']!, normOut, normInv]);
+            dispatchKernel(d, pipeline, bg, [cdiv(M, 64), 1, 1]);
+            pBuf.destroy();
+        }
+        // The kernel's second output is not needed by the forward pass.
+        normInv.destroy();
+        // 2. QKV projection: [B, L, 3*D]
+        const qkvOut = this._linear(normOut, 'wQKV', 'bQKV', M, dModel, 3 * dModel);
+        normOut.destroy();
+        // Split QKV into Q, K, V: each [B, L, H, dh] = [B, L, D]
+        // NOTE(review): this copies three contiguous thirds of qkvOut, i.e. it treats the
+        // projection output as [3, M, D]. If the linear kernel writes row-major [M, 3*D],
+        // each row interleaves Q|K|V and this split mixes them — confirm the kernel layout.
+        const QBuf = createEmptyStorageBuffer(d, actBytes, true);
+        const KBuf = createEmptyStorageBuffer(d, actBytes, true);
+        const VBuf = createEmptyStorageBuffer(d, actBytes, true);
+        {
+            const enc = d.createCommandEncoder();
+            enc.copyBufferToBuffer(qkvOut, 0,            QBuf, 0, actBytes);
+            enc.copyBufferToBuffer(qkvOut, actBytes,     KBuf, 0, actBytes);
+            enc.copyBufferToBuffer(qkvOut, 2 * actBytes, VBuf, 0, actBytes);
+            d.queue.submit([enc.finish()]);
+        }
+        qkvOut.destroy();
+        // 3. Attention scores: [B, H, L, L]
+        const scores = createEmptyStorageBuffer(d, B * H * L * L * 4, true);
+        {
+            const pBuf = createUniformBuffer(d, new Uint32Array([B, L, dModel, H, dh]).buffer);
+            // The score pass still needs a buffer bound in the output slot; it is scratch
+            // only and is released right after the dispatch instead of being leaked.
+            const unusedOut = createEmptyStorageBuffer(d, actBytes, true);
+            const pipeline = this.pipelines['attn_fwd']!;
+            const bg = createBindGroup(d, pipeline,
+                [pBuf, QBuf, KBuf, VBuf, scores, unusedOut]);
+            dispatchKernel(d, pipeline, bg, [cdiv(L, 16), H, B]);
+            unusedOut.destroy();
+            pBuf.destroy();
+        }
+        // 4. Softmax (causal) per row: dispatch (L, H, B)
+        {
+            const pBuf = createUniformBuffer(d, new Uint32Array([L, L, 1]).buffer);
+            const pipeline = this.pipelines['softmax']!;
+            const bg = createBindGroup(d, pipeline, [pBuf, scores]);
+            dispatchKernel(d, pipeline, bg, [L, H, B]);
+            pBuf.destroy();
+        }
+        // 5. Weighted V sum → attn output [B, L, H, dh]
+        const attnOut = createEmptyStorageBuffer(d, actBytes, true);
+        {
+            const pBuf = createUniformBuffer(d, new Uint32Array([B, L, dModel, H, dh]).buffer);
+            const pipeline = this.pipelines['attn_val']!;
+            const bg = createBindGroup(d, pipeline,
+                [pBuf, QBuf, KBuf, VBuf, scores, attnOut]);
+            dispatchKernel(d, pipeline, bg, [cdiv(L, 16), H, B]);
+            pBuf.destroy();
+        }
+        QBuf.destroy();
+        KBuf.destroy();
+        VBuf.destroy();
+        // 6. Output projection: [B, L, D] → [B, L, D]
+        const outProjOut = this._linear(attnOut, 'wO', 'bO', M, dModel, dModel);
+        attnOut.destroy();
+        // 7. Residual add
+        let current = this._add(outProjOut, xBuf, M * dModel);
+        outProjOut.destroy();
+        // 8. Optional FFN sublayer: linear → SiLU → linear → residual add
+        if (hasFfn) {
+            const ffnDim = dModel * ffnMult;
+            const ffn1Out = this._linear(current, 'wFfn1', 'bFfn1', M, dModel, ffnDim);
+            const siluOut = createEmptyStorageBuffer(d, M * ffnDim * 4, true);
+            {
+                const nBuf = createUniformBuffer(d, new Uint32Array([M * ffnDim]).buffer);
+                const pipeline = this.pipelines['silu']!;
+                const bg = createBindGroup(d, pipeline, [nBuf, ffn1Out, siluOut]);
+                dispatchKernel(d, pipeline, bg, [cdiv(M * ffnDim, 256), 1, 1]);
+                nBuf.destroy();
+            }
+            ffn1Out.destroy();
+            const ffn2Out = this._linear(siluOut, 'wFfn2', 'bFfn2', M, ffnDim, dModel);
+            siluOut.destroy();
+            const residual2 = this._add(ffn2Out, current, M * dModel);
+            ffn2Out.destroy();
+            current.destroy();
+            current = residual2;
+        }
+        const cache: AttentionCache = { scores };
+        return { output: current, cache };
+    }
+    /** Lists every weight buffer with its element count and name; FFN weights only when the FFN is enabled. */
+    parameters(): LayerParam[] {
+        const { dModel, hasFfn, ffnMult } = this.config;
+        const params: LayerParam[] = [
+            { buf: this.gpuWeights['wQKV']!,      numel: 3 * dModel * dModel, name: 'wQKV'      },
+            { buf: this.gpuWeights['bQKV']!,      numel: 3 * dModel,          name: 'bQKV'      },
+            { buf: this.gpuWeights['wO']!,         numel: dModel * dModel,     name: 'wO'        },
+            { buf: this.gpuWeights['bO']!,         numel: dModel,              name: 'bO'        },
+            { buf: this.gpuWeights['normWeight']!, numel: dModel,              name: 'normWeight'},
+        ];
+        if (hasFfn) {
+            const ffnDim = dModel * ffnMult;
+            params.push(
+                { buf: this.gpuWeights['wFfn1']!, numel: ffnDim * dModel, name: 'wFfn1' },
+                { buf: this.gpuWeights['bFfn1']!, numel: ffnDim,          name: 'bFfn1' },
+                { buf: this.gpuWeights['wFfn2']!, numel: dModel * ffnDim, name: 'wFfn2' },
+                { buf: this.gpuWeights['bFfn2']!, numel: dModel,          name: 'bFfn2' },
+            );
+        }
+        return params;
+    }
+    /** Returns the same list as `parameters()`. */
+    getTrainableParams(): LayerParam[] {
+        // Attention layers are always fully trained — no WSLA subset
+        return this.parameters();
+    }
+    /** Accepted for interface compatibility; the flag is ignored. */
+    setWSLAMode(_enabled: boolean): void {
+        // No-op for attention: WSLA does not apply
+    }
+    /** Destroys every weight buffer and clears `gpuWeights`; pipelines are left untouched. */
+    destroy(): void {
+        for (const buf of Object.values(this.gpuWeights)) buf.destroy();
+        this.gpuWeights = {};
+    }
+}