@seanhogg/builderforce-memory-engine 2026.6.18

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (113) hide show
  1. package/LICENSE +21 -0
  2. package/README.md +393 -0
  3. package/dist/index.d.ts +32 -0
  4. package/dist/index.d.ts.map +1 -0
  5. package/dist/index.js +40 -0
  6. package/dist/index.js.map +1 -0
  7. package/dist/kernels/activations.d.ts +5 -0
  8. package/dist/kernels/activations.d.ts.map +1 -0
  9. package/dist/kernels/activations.js +171 -0
  10. package/dist/kernels/activations.js.map +1 -0
  11. package/dist/kernels/attention.d.ts +19 -0
  12. package/dist/kernels/attention.d.ts.map +1 -0
  13. package/dist/kernels/attention.js +263 -0
  14. package/dist/kernels/attention.js.map +1 -0
  15. package/dist/kernels/complex_ssd.d.ts +33 -0
  16. package/dist/kernels/complex_ssd.d.ts.map +1 -0
  17. package/dist/kernels/complex_ssd.js +305 -0
  18. package/dist/kernels/complex_ssd.js.map +1 -0
  19. package/dist/kernels/conv1d.d.ts +3 -0
  20. package/dist/kernels/conv1d.d.ts.map +1 -0
  21. package/dist/kernels/conv1d.js +158 -0
  22. package/dist/kernels/conv1d.js.map +1 -0
  23. package/dist/kernels/linear_projection.d.ts +3 -0
  24. package/dist/kernels/linear_projection.d.ts.map +1 -0
  25. package/dist/kernels/linear_projection.js +219 -0
  26. package/dist/kernels/linear_projection.js.map +1 -0
  27. package/dist/kernels/selective_scan.d.ts +3 -0
  28. package/dist/kernels/selective_scan.d.ts.map +1 -0
  29. package/dist/kernels/selective_scan.js +348 -0
  30. package/dist/kernels/selective_scan.js.map +1 -0
  31. package/dist/kernels/ssd.d.ts +29 -0
  32. package/dist/kernels/ssd.d.ts.map +1 -0
  33. package/dist/kernels/ssd.js +276 -0
  34. package/dist/kernels/ssd.js.map +1 -0
  35. package/dist/kernels/weight_update.d.ts +3 -0
  36. package/dist/kernels/weight_update.d.ts.map +1 -0
  37. package/dist/kernels/weight_update.js +119 -0
  38. package/dist/kernels/weight_update.js.map +1 -0
  39. package/dist/model/attention_block.d.ts +48 -0
  40. package/dist/model/attention_block.d.ts.map +1 -0
  41. package/dist/model/attention_block.js +262 -0
  42. package/dist/model/attention_block.js.map +1 -0
  43. package/dist/model/mamba1_block.d.ts +70 -0
  44. package/dist/model/mamba1_block.d.ts.map +1 -0
  45. package/dist/model/mamba1_block.js +333 -0
  46. package/dist/model/mamba1_block.js.map +1 -0
  47. package/dist/model/mamba2_block.d.ts +44 -0
  48. package/dist/model/mamba2_block.d.ts.map +1 -0
  49. package/dist/model/mamba2_block.js +252 -0
  50. package/dist/model/mamba2_block.js.map +1 -0
  51. package/dist/model/mamba3_block.d.ts +51 -0
  52. package/dist/model/mamba3_block.d.ts.map +1 -0
  53. package/dist/model/mamba3_block.js +270 -0
  54. package/dist/model/mamba3_block.js.map +1 -0
  55. package/dist/model/mamba_block.d.ts +64 -0
  56. package/dist/model/mamba_block.d.ts.map +1 -0
  57. package/dist/model/mamba_block.js +303 -0
  58. package/dist/model/mamba_block.js.map +1 -0
  59. package/dist/model/mamba_model.d.ts +140 -0
  60. package/dist/model/mamba_model.d.ts.map +1 -0
  61. package/dist/model/mamba_model.js +527 -0
  62. package/dist/model/mamba_model.js.map +1 -0
  63. package/dist/model/sequence_layer.d.ts +25 -0
  64. package/dist/model/sequence_layer.d.ts.map +1 -0
  65. package/dist/model/sequence_layer.js +8 -0
  66. package/dist/model/sequence_layer.js.map +1 -0
  67. package/dist/tokenizer/bpe.d.ts +29 -0
  68. package/dist/tokenizer/bpe.d.ts.map +1 -0
  69. package/dist/tokenizer/bpe.js +164 -0
  70. package/dist/tokenizer/bpe.js.map +1 -0
  71. package/dist/training/autograd.d.ts +27 -0
  72. package/dist/training/autograd.d.ts.map +1 -0
  73. package/dist/training/autograd.js +120 -0
  74. package/dist/training/autograd.js.map +1 -0
  75. package/dist/training/trainer.d.ts +36 -0
  76. package/dist/training/trainer.d.ts.map +1 -0
  77. package/dist/training/trainer.js +183 -0
  78. package/dist/training/trainer.js.map +1 -0
  79. package/dist/utils/gpu_utils.d.ts +21 -0
  80. package/dist/utils/gpu_utils.d.ts.map +1 -0
  81. package/dist/utils/gpu_utils.js +111 -0
  82. package/dist/utils/gpu_utils.js.map +1 -0
  83. package/dist/utils/quantization.d.ts +26 -0
  84. package/dist/utils/quantization.d.ts.map +1 -0
  85. package/dist/utils/quantization.js +116 -0
  86. package/dist/utils/quantization.js.map +1 -0
  87. package/dist/utils/rng.d.ts +36 -0
  88. package/dist/utils/rng.d.ts.map +1 -0
  89. package/dist/utils/rng.js +61 -0
  90. package/dist/utils/rng.js.map +1 -0
  91. package/package.json +99 -0
  92. package/src/index.ts +114 -0
  93. package/src/kernels/activations.ts +174 -0
  94. package/src/kernels/attention.ts +268 -0
  95. package/src/kernels/complex_ssd.ts +307 -0
  96. package/src/kernels/conv1d.ts +159 -0
  97. package/src/kernels/linear_projection.ts +220 -0
  98. package/src/kernels/selective_scan.ts +350 -0
  99. package/src/kernels/ssd.ts +278 -0
  100. package/src/kernels/weight_update.ts +120 -0
  101. package/src/model/attention_block.ts +344 -0
  102. package/src/model/mamba1_block.ts +437 -0
  103. package/src/model/mamba2_block.ts +319 -0
  104. package/src/model/mamba3_block.ts +335 -0
  105. package/src/model/mamba_block.ts +401 -0
  106. package/src/model/mamba_model.ts +678 -0
  107. package/src/model/sequence_layer.ts +29 -0
  108. package/src/tokenizer/bpe.ts +186 -0
  109. package/src/training/autograd.ts +135 -0
  110. package/src/training/trainer.ts +309 -0
  111. package/src/utils/gpu_utils.ts +147 -0
  112. package/src/utils/quantization.ts +154 -0
  113. package/src/utils/rng.ts +65 -0
@@ -0,0 +1,437 @@
1
+ /**
2
+ * mamba1_block.ts – Mamba-1 Mixer Block (S6 selective scan).
3
+ *
4
+ * Renamed from mamba_block.ts; MambaBlock is kept as a deprecated alias.
5
+ * Implements SequenceLayer so HybridMambaModel can iterate blocks generically.
6
+ */
7
+
8
+ import {
9
+ createComputePipeline,
10
+ createBindGroup,
11
+ createStorageBuffer,
12
+ createEmptyStorageBuffer,
13
+ createUniformBuffer,
14
+ dispatchKernel,
15
+ cdiv,
16
+ } from '../utils/gpu_utils.js';
17
+
18
+ import { SELECTIVE_SCAN_FORWARD_WGSL } from '../kernels/selective_scan.js';
19
+ import { gaussianArray } from '../utils/rng.js';
20
+ import { CONV1D_FORWARD_WGSL } from '../kernels/conv1d.js';
21
+ import { LINEAR_FORWARD_WGSL } from '../kernels/linear_projection.js';
22
+ import { ACTIVATIONS_WGSL } from '../kernels/activations.js';
23
+
24
+ import type { SequenceLayer, LayerForwardResult, LayerParam } from './sequence_layer.js';
25
+
26
/**
 * Construction options for {@link Mamba1Block}.
 *
 * Only `dModel` is mandatory; the remaining fields fall back to the defaults
 * applied in the `Mamba1Block` constructor.
 */
export interface Mamba1BlockConfig {
  /** Model (embedding) width of the block's input and output. */
  dModel: number;
  /** SSM state size per channel. Constructor default: 16. */
  dState?: number;
  /** Kernel width passed to the conv1d kernel. Constructor default: 4. */
  dConv?: number;
  /** Inner width multiplier: `dInner = expand * dModel`. Constructor default: 2. */
  expand?: number;
  /** Rank of the Δ projection. Constructor default: `ceil(dModel / 16)`. */
  dtRank?: number;
  /** Constructor default: true. */
  biasConv?: boolean;
}
34
+
35
/**
 * Legacy name for a layer parameter descriptor.
 *
 * @deprecated Use LayerParam
 */
export type BlockParam = LayerParam;
37
+
38
/**
 * GPU buffers retained by `Mamba1Block.forward` and returned to the caller
 * alongside the block output. None of them are destroyed by `forward`.
 */
export interface BlockCache {
  /** Second output of the RMSNorm kernel (one f32 per row). Presumably the inverse RMS — confirm against the kernel. */
  normInv: GPUBuffer;
  /** The block input buffer as passed to `forward` (not a copy). */
  normIn: GPUBuffer;
  /** Normalised activations written by the RMSNorm kernel. */
  normOut: GPUBuffer;
  /** Gate branch: second half of the input-projection output. */
  zBuf: GPUBuffer;
  /** Conv branch: first half of the input-projection output (conv1d input). */
  xConvIn: GPUBuffer;
  /** Output of the conv1d kernel. */
  convOut: GPUBuffer;
  /** SiLU applied to `convOut`; input to x_proj and the scan. */
  siluOut: GPUBuffer;
  /** Output of the dt projection (Δ expanded to the inner width). */
  deltaFull: GPUBuffer;
  /** B slice copied out of the x_proj output. */
  B_raw: GPUBuffer;
  /** C slice copied out of the x_proj output. */
  C_raw: GPUBuffer;
  /** Buffer written by the scan kernels (2·B·L·D·N f32 values). */
  hCache: GPUBuffer;
}
51
+
52
/** Return value of `Mamba1Block.forward`. */
export interface BlockForwardResult extends LayerForwardResult {
  /** Block output after the residual add. */
  output: GPUBuffer;
  /** Intermediate buffers retained from the forward pass. */
  cache: BlockCache;
}
56
+
57
+ // ── Element-wise helper shaders (compiled once per pipeline) ─────────────────
58
+
59
/**
 * WGSL compute shader: element-wise product `c[i] = a[i] * b[i]` for `i < n`.
 *
 * Bindings (group 0): 0 = a (read), 1 = b (read), 2 = c (read_write),
 * 3 = n (uniform u32). Workgroup size is 256; entry point is `main`.
 */
const MUL_SHADER = /* wgsl */`
@group(0) @binding(0) var<storage, read> a : array<f32>;
@group(0) @binding(1) var<storage, read> b : array<f32>;
@group(0) @binding(2) var<storage, read_write> c : array<f32>;
@group(0) @binding(3) var<uniform> n : u32;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
let i = gid.x;
if (i < n) { c[i] = a[i] * b[i]; }
}
`;
70
+
71
/**
 * WGSL compute shader: element-wise sum `c[i] = a[i] + b[i]` for `i < n`.
 *
 * Bindings (group 0): 0 = a (read), 1 = b (read), 2 = c (read_write),
 * 3 = n (uniform u32). Workgroup size is 256; entry point is `main`.
 */
const ADD_SHADER = /* wgsl */`
@group(0) @binding(0) var<storage, read> a : array<f32>;
@group(0) @binding(1) var<storage, read> b : array<f32>;
@group(0) @binding(2) var<storage, read_write> c : array<f32>;
@group(0) @binding(3) var<uniform> n : u32;
@compute @workgroup_size(256)
fn main(@builtin(global_invocation_id) gid: vec3<u32>) {
let i = gid.x;
if (i < n) { c[i] = a[i] + b[i]; }
}
`;
82
+
83
+ // ── Mamba1Block ───────────────────────────────────────────────────────────────
84
+
85
/**
 * Mamba-1 mixer block.
 *
 * `forward` runs, in order: RMSNorm → input projection (split into x and z) →
 * conv1d on x → SiLU → x_proj (split into Δ, B, C) → dt_proj → selective scan →
 * gate with SiLU(z) → output projection → residual add with the block input.
 *
 * Weights are kept both as host `Float32Array`s and as GPU storage buffers in
 * `gpuWeights`; call `destroy()` to release the GPU copies.
 */
export class Mamba1Block implements SequenceLayer {
  readonly layerType = 'mamba1' as const;

  device: GPUDevice;
  config: Required<Mamba1BlockConfig>;
  /** Inner width: `expand * dModel`. */
  dInner: number;
  /** Rank of the Δ projection (same value as `config.dtRank`). */
  dtRank: number;

  wInProj: Float32Array;
  bInProj: Float32Array;
  wConv: Float32Array;
  bConv: Float32Array;
  wXProj: Float32Array;
  bXProj: Float32Array;
  wDtProj: Float32Array;
  bDtProj: Float32Array;
  A_log: Float32Array;
  D_vec: Float32Array;
  wOutProj: Float32Array;
  bOutProj: Float32Array;
  normWeight: Float32Array;

  /** GPU copies of the weights, keyed by the host field name. */
  gpuWeights: Record<string, GPUBuffer>;
  /** Compute pipelines, keyed by a short kernel name. */
  pipelines: Record<string, GPUComputePipeline>;

  private _wslaMode = false;

  /**
   * Builds the block, initialises its weights, uploads them and compiles the
   * compute pipelines.
   *
   * @param device GPU device used for every buffer and pipeline.
   * @param config Block dimensions. Omitted (or explicitly `undefined`)
   *   optional fields take the defaults dState=16, dConv=4, expand=2,
   *   biasConv=true, dtRank=ceil(dModel / 16).
   */
  constructor(device: GPUDevice, config: Mamba1BlockConfig) {
    this.device = device;

    // Resolve each optional field with `??` instead of relying on object
    // spread order: a spread lets an explicit `undefined` overwrite the
    // default, which would leave NaN buffer sizes further down.
    this.config = {
      ...config,
      dState: config.dState ?? 16,
      dConv: config.dConv ?? 4,
      expand: config.expand ?? 2,
      biasConv: config.biasConv ?? true,
      dtRank: config.dtRank ?? Math.ceil(config.dModel / 16),
    };

    this.dInner = this.config.expand * this.config.dModel;
    this.dtRank = this.config.dtRank;

    // Placeholders so every field is definitely assigned; the real values are
    // produced by _initWeights() below.
    this.wInProj = new Float32Array(0);
    this.bInProj = new Float32Array(0);
    this.wConv = new Float32Array(0);
    this.bConv = new Float32Array(0);
    this.wXProj = new Float32Array(0);
    this.bXProj = new Float32Array(0);
    this.wDtProj = new Float32Array(0);
    this.bDtProj = new Float32Array(0);
    this.A_log = new Float32Array(0);
    this.D_vec = new Float32Array(0);
    this.wOutProj = new Float32Array(0);
    this.bOutProj = new Float32Array(0);
    this.normWeight = new Float32Array(0);
    this.gpuWeights = {};
    this.pipelines = {};

    this._initWeights();
    this._buildPipelines();
  }

  /**
   * Fills the host-side weight arrays and uploads them to the GPU.
   *
   * Projection and conv weights are drawn from `gaussianArray`, biases are
   * zero, `D_vec` and `normWeight` are one, and `A_log[d, n] = log(n + 1)`.
   * The order of the random draws is kept stable so seeded runs stay
   * reproducible.
   */
  private _initWeights(): void {
    const { dModel, dState: N, dConv: K } = this.config;
    const D = this.dInner;
    const R = this.dtRank;

    const randn = (n: number, std = 0.02): Float32Array => gaussianArray(n, std);
    const zeros = (n: number): Float32Array => new Float32Array(n);
    const ones = (n: number): Float32Array => new Float32Array(n).fill(1.0);

    this.wInProj = randn(2 * D * dModel);
    this.bInProj = zeros(2 * D);
    this.wConv = randn(D * K, 0.01);
    this.bConv = zeros(D);
    this.wXProj = randn((R + 2 * N) * D, 0.01);
    this.bXProj = zeros(R + 2 * N);
    this.wDtProj = randn(D * R, 0.02);
    this.bDtProj = zeros(D);

    // Every channel gets the same row: log(1), log(2), …, log(N).
    this.A_log = new Float32Array(D * N);
    for (let d = 0; d < D; d++) {
      for (let n = 0; n < N; n++) {
        this.A_log[d * N + n] = Math.log(n + 1);
      }
    }

    this.D_vec = ones(D);
    this.wOutProj = randn(dModel * D, 0.02);
    this.bOutProj = zeros(dModel);
    this.normWeight = ones(dModel);

    this._uploadWeightsToGPU();
  }

  /** Creates one GPU storage buffer per host weight array. */
  private _uploadWeightsToGPU(): void {
    const d = this.device;
    const mk = (arr: Float32Array): GPUBuffer => createStorageBuffer(d, arr, true);

    this.gpuWeights = {
      wInProj: mk(this.wInProj),
      bInProj: mk(this.bInProj),
      wConv: mk(this.wConv),
      bConv: mk(this.bConv),
      wXProj: mk(this.wXProj),
      bXProj: mk(this.bXProj),
      wDtProj: mk(this.wDtProj),
      bDtProj: mk(this.bDtProj),
      A_log: mk(this.A_log),
      D_vec: mk(this.D_vec),
      wOutProj: mk(this.wOutProj),
      bOutProj: mk(this.bOutProj),
      normWeight: mk(this.normWeight),
    };
  }

  /** Compiles every compute pipeline used by `forward`. */
  private _buildPipelines(): void {
    const d = this.device;
    this.pipelines = {
      linear: createComputePipeline(d, LINEAR_FORWARD_WGSL, 'linear_forward'),
      conv1d: createComputePipeline(d, CONV1D_FORWARD_WGSL, 'conv1d_forward'),
      silu: createComputePipeline(d, ACTIVATIONS_WGSL, 'silu_forward'),
      rmsnorm: createComputePipeline(d, ACTIVATIONS_WGSL, 'rmsnorm_forward'),
      scan_fwd: createComputePipeline(d, SELECTIVE_SCAN_FORWARD_WGSL, 'forward_scan'),
      scan_reduce: createComputePipeline(d, SELECTIVE_SCAN_FORWARD_WGSL, 'forward_reduce'),
      elMul: createComputePipeline(d, MUL_SHADER, 'main'),
      elAdd: createComputePipeline(d, ADD_SHADER, 'main'),
    };
  }

  /**
   * Runs the block on `xBuf`.
   *
   * @param xBuf Input activations holding `batch * seqLen * dModel` f32 values.
   *   The buffer is read, never destroyed, and is also returned as
   *   `cache.normIn`.
   * @param batch Batch size B.
   * @param seqLen Sequence length L.
   * @returns The output buffer (same element count as `xBuf`) plus the
   *   intermediate buffers kept in `cache`. The caller owns all of them.
   */
  forward(xBuf: GPUBuffer, batch: number, seqLen: number): BlockForwardResult {
    const d = this.device;
    const { dModel, dState, dConv } = this.config;
    const D = this.dInner;
    const N = dState;
    const B = batch;
    const L = seqLen;
    const M = B * L;
    const R = this.dtRank;

    const F32 = 4;                 // bytes per element
    const planeBytes = M * D * F32; // one [M, D] activation plane

    // Per-call uniform buffers. They used to be leaked on every forward();
    // they are now tracked and destroyed once all work has been dispatched,
    // the same way the other temporaries in this method are released.
    const scratchUniforms: GPUBuffer[] = [];
    const uniform = (data: Parameters<typeof createUniformBuffer>[1]): GPUBuffer => {
      const buf = createUniformBuffer(d, data);
      scratchUniforms.push(buf);
      return buf;
    };
    const u32Uniform = (...values: number[]): GPUBuffer =>
      uniform(new Uint32Array(values).buffer);

    // y[M, outFeatures] = linear(x[M, inFeatures]) using the named weight/bias.
    const linear = (
      input: GPUBuffer,
      weightKey: string,
      biasKey: string,
      inFeatures: number,
      outFeatures: number,
    ): GPUBuffer => {
      const out = createEmptyStorageBuffer(d, M * outFeatures * F32, true);
      const pipeline = this.pipelines['linear']!;
      const bg = createBindGroup(d, pipeline, [
        u32Uniform(M, inFeatures, outFeatures),
        input,
        this.gpuWeights[weightKey]!,
        this.gpuWeights[biasKey]!,
        out,
      ]);
      dispatchKernel(d, pipeline, bg, [cdiv(M, 16), cdiv(outFeatures, 16), 1]);
      return out;
    };

    // Element-wise SiLU over one [M, D] plane.
    const silu = (input: GPUBuffer): GPUBuffer => {
      const out = createEmptyStorageBuffer(d, planeBytes, true);
      const pipeline = this.pipelines['silu']!;
      const bg = createBindGroup(d, pipeline, [u32Uniform(M * D), input, out]);
      dispatchKernel(d, pipeline, bg, [cdiv(M * D, 256), 1, 1]);
      return out;
    };

    // 1. Pre-block RMSNorm
    const normOut = createEmptyStorageBuffer(d, M * dModel * F32, true);
    const normInv = createEmptyStorageBuffer(d, M * F32, true);
    {
      // Uniform layout: u32 rows, u32 width, f32 epsilon (16 bytes total).
      const params = new ArrayBuffer(16);
      new Uint32Array(params, 0, 2).set([M, dModel]);
      new Float32Array(params, 8, 1).set([1e-6]);
      const pipeline = this.pipelines['rmsnorm']!;
      const bg = createBindGroup(d, pipeline,
        [uniform(params), xBuf, this.gpuWeights['normWeight']!, normOut, normInv]);
      dispatchKernel(d, pipeline, bg, [cdiv(M, 64), 1, 1]);
    }

    // 2. Input projection → x and z
    const inProjOut = linear(normOut, 'wInProj', 'bInProj', dModel, 2 * D);

    // 3. Split into x and z
    // NOTE(review): this takes the first and second contiguous M·D halves of
    // inProjOut. That is only an x/z split if linear_forward writes its output
    // feature-major; with a row-major [M, 2D] output the halves would mix x
    // and z rows. Confirm against the kernel.
    const xConvIn = createEmptyStorageBuffer(d, planeBytes, true);
    const zBuf = createEmptyStorageBuffer(d, planeBytes, true);
    {
      const enc = d.createCommandEncoder();
      enc.copyBufferToBuffer(inProjOut, 0, xConvIn, 0, planeBytes);
      enc.copyBufferToBuffer(inProjOut, planeBytes, zBuf, 0, planeBytes);
      d.queue.submit([enc.finish()]);
    }
    inProjOut.destroy();

    // 4. Causal conv1d on x
    const convOut = createEmptyStorageBuffer(d, planeBytes, true);
    {
      const pipeline = this.pipelines['conv1d']!;
      const bg = createBindGroup(d, pipeline, [
        u32Uniform(L, D, dConv, B),
        xConvIn,
        this.gpuWeights['wConv']!,
        this.gpuWeights['bConv']!,
        convOut,
      ]);
      dispatchKernel(d, pipeline, bg, [cdiv(L, 16), cdiv(D, 16), B]);
    }

    // 5. SiLU activation
    const siluOut = silu(convOut);

    // 6. x_proj → Δ (dtRaw), B, C
    const xProjOut = linear(siluOut, 'wXProj', 'bXProj', D, R + 2 * N);

    // NOTE(review): same contiguous-slice assumption as step 3 — the Δ, B and
    // C slices are taken at byte offsets 0, M·R·4 and M·(R+N)·4.
    const dtBytes = M * R * F32;
    const bcBytes = M * N * F32;
    const dtRaw = createEmptyStorageBuffer(d, dtBytes, true);
    const B_raw = createEmptyStorageBuffer(d, bcBytes, true);
    const C_raw = createEmptyStorageBuffer(d, bcBytes, true);
    {
      const enc = d.createCommandEncoder();
      enc.copyBufferToBuffer(xProjOut, 0, dtRaw, 0, dtBytes);
      enc.copyBufferToBuffer(xProjOut, dtBytes, B_raw, 0, bcBytes);
      enc.copyBufferToBuffer(xProjOut, dtBytes + bcBytes, C_raw, 0, bcBytes);
      d.queue.submit([enc.finish()]);
    }
    xProjOut.destroy();

    // 7. dt_proj: expand Δ to full dim
    const deltaFull = linear(dtRaw, 'wDtProj', 'bDtProj', R, D);
    dtRaw.destroy();

    // 8. Selective scan (S6): two passes sharing one set of bindings.
    const scanY = createEmptyStorageBuffer(d, planeBytes, true);
    const hCache = createEmptyStorageBuffer(d, 2 * M * D * N * F32, true);
    {
      const scanBindings = [
        u32Uniform(L, N, D, B),
        siluOut,
        deltaFull,
        this.gpuWeights['A_log']!,
        B_raw,
        C_raw,
        this.gpuWeights['D_vec']!,
        scanY,
        hCache,
      ];

      const scanFwd = this.pipelines['scan_fwd']!;
      dispatchKernel(d, scanFwd, createBindGroup(d, scanFwd, scanBindings),
        [cdiv(D, 8), cdiv(N, 8), B]);

      const scanReduce = this.pipelines['scan_reduce']!;
      dispatchKernel(d, scanReduce, createBindGroup(d, scanReduce, scanBindings),
        [cdiv(L, 64), D, B]);
    }

    // 9. Gate: y ⊗ SiLU(z)
    const siluZ = silu(zBuf);
    const gatedOut = createEmptyStorageBuffer(d, planeBytes, true);
    {
      // The element-wise shaders bind the length uniform last (binding 3).
      const pipeline = this.pipelines['elMul']!;
      const bg = createBindGroup(d, pipeline, [scanY, siluZ, gatedOut, u32Uniform(M * D)]);
      dispatchKernel(d, pipeline, bg, [cdiv(M * D, 256), 1, 1]);
    }
    siluZ.destroy();
    scanY.destroy();

    // 10. Output projection
    const outProjOut = linear(gatedOut, 'wOutProj', 'bOutProj', D, dModel);
    gatedOut.destroy();

    // 11. Residual add
    const output = createEmptyStorageBuffer(d, M * dModel * F32, true);
    {
      const pipeline = this.pipelines['elAdd']!;
      const bg = createBindGroup(d, pipeline,
        [outProjOut, xBuf, output, u32Uniform(M * dModel)]);
      dispatchKernel(d, pipeline, bg, [cdiv(M * dModel, 256), 1, 1]);
    }
    outProjOut.destroy();

    // All work has been dispatched; release the per-call uniform buffers.
    for (const buf of scratchUniforms) {
      buf.destroy();
    }

    const cache: BlockCache = {
      normInv,
      normIn: xBuf,
      normOut,
      zBuf,
      xConvIn,
      convOut,
      siluOut,
      deltaFull,
      B_raw,
      C_raw,
      hCache,
    };

    return { output, cache };
  }

  /**
   * Lists every weight of the block with its GPU buffer, element count and
   * name, in a fixed order.
   */
  parameters(): LayerParam[] {
    const { dModel, dState: N, dConv: K } = this.config;
    const D = this.dInner;
    const R = this.dtRank;

    const entry = (name: string, numel: number): LayerParam =>
      ({ buf: this.gpuWeights[name]!, numel, name });

    return [
      entry('wInProj', 2 * D * dModel),
      entry('bInProj', 2 * D),
      entry('wConv', D * K),
      entry('bConv', D),
      entry('wXProj', (R + 2 * N) * D),
      entry('bXProj', R + 2 * N),
      entry('wDtProj', D * R),
      entry('bDtProj', D),
      entry('A_log', D * N),
      entry('D_vec', D),
      entry('wOutProj', dModel * D),
      entry('bOutProj', dModel),
      entry('normWeight', dModel),
    ];
  }

  /**
   * Parameters an optimiser should update: only `wXProj` and `bXProj` while
   * WSLA mode is enabled, otherwise everything from `parameters()`.
   */
  getTrainableParams(): LayerParam[] {
    if (!this._wslaMode) {
      return this.parameters();
    }
    return [
      { buf: this.gpuWeights['wXProj']!, numel: this.wXProj.length, name: 'wXProj' },
      { buf: this.gpuWeights['bXProj']!, numel: this.bXProj.length, name: 'bXProj' },
    ];
  }

  /** Enables or disables WSLA mode (see `getTrainableParams`). */
  setWSLAMode(enabled: boolean): void {
    this._wslaMode = enabled;
  }

  /**
   * Destroys every GPU weight buffer and clears `gpuWeights`. Host-side arrays
   * and compiled pipelines are left untouched.
   */
  destroy(): void {
    for (const buf of Object.values(this.gpuWeights)) {
      buf.destroy();
    }
    this.gpuWeights = {};
  }
}
432
+
433
+ // Deprecated alias — kept until mambacode.js 3.0.0
434
+ export { Mamba1Block as MambaBlock };
435
+
436
/**
 * Legacy name for the Mamba-1 block configuration.
 *
 * @deprecated Use Mamba1BlockConfig
 */
export type MambaBlockConfig = Mamba1BlockConfig;