@seanhogg/builderforce-memory-engine 2026.6.20 → 2026.6.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (72) hide show
  1. package/dist/index.d.ts +9 -0
  2. package/dist/index.d.ts.map +1 -1
  3. package/dist/index.js +8 -0
  4. package/dist/index.js.map +1 -1
  5. package/dist/kernels/limbic_affect.d.ts +2 -0
  6. package/dist/kernels/limbic_affect.d.ts.map +1 -0
  7. package/dist/kernels/limbic_affect.js +74 -0
  8. package/dist/kernels/limbic_affect.js.map +1 -0
  9. package/dist/limbic/index.d.ts +14 -0
  10. package/dist/limbic/index.d.ts.map +1 -0
  11. package/dist/limbic/index.js +11 -0
  12. package/dist/limbic/index.js.map +1 -0
  13. package/dist/limbic/limbic_model.d.ts +111 -0
  14. package/dist/limbic/limbic_model.d.ts.map +1 -0
  15. package/dist/limbic/limbic_model.js +299 -0
  16. package/dist/limbic/limbic_model.js.map +1 -0
  17. package/dist/limbic/limbic_trainer.d.ts +62 -0
  18. package/dist/limbic/limbic_trainer.d.ts.map +1 -0
  19. package/dist/limbic/limbic_trainer.js +172 -0
  20. package/dist/limbic/limbic_trainer.js.map +1 -0
  21. package/dist/limbic/regions.d.ts +79 -0
  22. package/dist/limbic/regions.d.ts.map +1 -0
  23. package/dist/limbic/regions.js +132 -0
  24. package/dist/limbic/regions.js.map +1 -0
  25. package/dist/lm/evermind_lm.d.ts +148 -0
  26. package/dist/lm/evermind_lm.d.ts.map +1 -0
  27. package/dist/lm/evermind_lm.js +479 -0
  28. package/dist/lm/evermind_lm.js.map +1 -0
  29. package/dist/lm/index.d.ts +6 -0
  30. package/dist/lm/index.d.ts.map +1 -0
  31. package/dist/lm/index.js +5 -0
  32. package/dist/lm/index.js.map +1 -0
  33. package/dist/model/attention_block.js +1 -1
  34. package/dist/model/attention_block.js.map +1 -1
  35. package/dist/model/mamba_model.js +1 -1
  36. package/dist/model/mamba_model.js.map +1 -1
  37. package/dist/moe/index.d.ts +10 -0
  38. package/dist/moe/index.d.ts.map +1 -0
  39. package/dist/moe/index.js +7 -0
  40. package/dist/moe/index.js.map +1 -0
  41. package/dist/moe/moe_model.d.ts +134 -0
  42. package/dist/moe/moe_model.d.ts.map +1 -0
  43. package/dist/moe/moe_model.js +415 -0
  44. package/dist/moe/moe_model.js.map +1 -0
  45. package/dist/moe/moe_package.d.ts +81 -0
  46. package/dist/moe/moe_package.d.ts.map +1 -0
  47. package/dist/moe/moe_package.js +157 -0
  48. package/dist/moe/moe_package.js.map +1 -0
  49. package/dist/moe/moe_trainer.d.ts +53 -0
  50. package/dist/moe/moe_trainer.d.ts.map +1 -0
  51. package/dist/moe/moe_trainer.js +93 -0
  52. package/dist/moe/moe_trainer.js.map +1 -0
  53. package/dist/optim/adamw.d.ts +32 -0
  54. package/dist/optim/adamw.d.ts.map +1 -0
  55. package/dist/optim/adamw.js +52 -0
  56. package/dist/optim/adamw.js.map +1 -0
  57. package/package.json +1 -1
  58. package/src/index.ts +59 -0
  59. package/src/kernels/limbic_affect.ts +74 -0
  60. package/src/limbic/index.ts +28 -0
  61. package/src/limbic/limbic_model.ts +373 -0
  62. package/src/limbic/limbic_trainer.ts +253 -0
  63. package/src/limbic/regions.ts +141 -0
  64. package/src/lm/evermind_lm.ts +558 -0
  65. package/src/lm/index.ts +6 -0
  66. package/src/model/attention_block.ts +1 -1
  67. package/src/model/mamba_model.ts +1 -1
  68. package/src/moe/index.ts +23 -0
  69. package/src/moe/moe_model.ts +475 -0
  70. package/src/moe/moe_package.ts +205 -0
  71. package/src/moe/moe_trainer.ts +134 -0
  72. package/src/optim/adamw.ts +72 -0
@@ -0,0 +1,6 @@
1
+ /**
2
+ * EvermindLM — the generative language model (the runnable "AI").
3
+ */
4
+
5
+ export { EvermindLM, EvermindLMTrainer, DEFAULT_LM_CONFIG, DEFAULT_LM_SEED } from "./evermind_lm.js";
6
+ export type { EvermindLMConfig, LMGenerateOptions, TextCodec } from "./evermind_lm.js";
@@ -114,7 +114,7 @@ export class AttentionBlock implements SequenceLayer {
114
114
  }
115
115
 
116
116
  private _initWeights(): void {
117
- const { dModel, nHeads, hasFfn, ffnMult } = this.config;
117
+ const { dModel, hasFfn, ffnMult } = this.config;
118
118
 
119
119
  const randn = (n: number, std = 0.02): Float32Array => gaussianArray(n, std);
120
120
 
@@ -375,7 +375,7 @@ export class HybridMambaModel {
375
375
  const { temperature = 1.0, topK = 50, topP = 0.9 } = samplingOpts;
376
376
  const { vocabSize } = this.config;
377
377
 
378
- let ids = [...promptIds];
378
+ const ids = [...promptIds];
379
379
 
380
380
  for (let step = 0; step < maxNewTokens; step++) {
381
381
  const { logits } = await this.forward(new Uint32Array(ids), 1, ids.length);
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Mixture-of-Experts — shared-expert hybrid sparsity for the Evermind generator.
3
+ */
4
+
5
+ export {
6
+ SharedExpertMoE,
7
+ LoadBalanceAccumulator,
8
+ DEFAULT_MOE_CONFIG,
9
+ DEFAULT_MOE_SEED,
10
+ } from "./moe_model.js";
11
+ export type { MoEConfig, MoEParam, RouteResult } from "./moe_model.js";
12
+
13
+ export { MoETrainer } from "./moe_trainer.js";
14
+ export type { MoESample, MoETrainOptions, MoEEpochResult } from "./moe_trainer.js";
15
+
16
+ export { EvermindModelPackage } from "./moe_package.js";
17
+ export type {
18
+ EvermindModelManifest,
19
+ EvermindModelCard,
20
+ EvermindModelType,
21
+ PackageMeta,
22
+ ValidationResult,
23
+ } from "./moe_package.js";
@@ -0,0 +1,475 @@
1
+ /**
2
+ * moe_model.ts — SharedExpertMoE: a shared-expert hybrid Mixture-of-Experts FFN.
3
+ *
4
+ * The sparsity design behind Evermind's generator. Each token is processed by:
5
+ * • a DENSE shared expert that is ALWAYS active (carries continuous learning;
6
+ * the part the online-distillation signal flows into), plus
7
+ * • the top-k of N routed experts, gated by a learned router and combined by a
8
+ * softmax over the selected experts.
9
+ *
10
+ * y = SharedFFN(x) + Σ_{e ∈ topk(x)} gate_e · Expert_e(x)
11
+ *
12
+ * This is the DeepSeekMoE "shared-expert isolation" pattern: the dense backbone
13
+ * resolves the online-learning attribution problem (you distil into ONE always-on
14
+ * path), while the routed experts add web-pageable capacity (each expert's
15
+ * weights are an independent checkpoint — see {@link SharedExpertMoE.exportExpert}
16
+ * — so a host can stream only the experts a token activates).
17
+ *
18
+ * Pure-TS CPU reference (Float32Array, exact forward + backward), mirroring
19
+ * {@link LimbicModel}'s WebGPU-or-fallback contract — the WGSL kernel path
20
+ * (router gate + expert FFN GEMM) is a numerically-identical future acceleration.
21
+ *
22
+ * Activation is ReLU for an exact, unambiguous gradient in the reference path;
23
+ * production may swap GELU/SwiGLU behind the same shapes.
24
+ */
25
+
26
+ import { SeededRng } from "../utils/rng.js";
27
+ import { quantizeFp16, dequantizeFp16 } from "../utils/quantization.js";
28
+
29
+ export interface MoEConfig {
30
+ /** Model (token) dimension — FFN input/output width. Default 64. */
31
+ modelDim: number;
32
+ /** Hidden width of each expert FFN. Default 128. */
33
+ hiddenDim: number;
34
+ /** Number of routed experts. Default 8. */
35
+ numExperts: number;
36
+ /** Experts activated per token (top-k). Default 2. Must be ≤ numExperts. */
37
+ topK: number;
38
+ /** Deterministic init seed for reproducible cold-start weights. */
39
+ seed?: number;
40
+ }
41
+
42
+ export const DEFAULT_MOE_CONFIG: Required<Omit<MoEConfig, "seed">> = {
43
+ modelDim: 64,
44
+ hiddenDim: 128,
45
+ numExperts: 8,
46
+ topK: 2,
47
+ };
48
+
49
+ /** Fixed default init seed — reproducible byte-identical cold start across machines. */
50
+ export const DEFAULT_MOE_SEED = 0x4d6f4501; // "MoE\x01"
51
+
52
+ const MAGIC = 0x4d6f4530; // "MoE0"
53
+
54
+ /** A named trainable parameter tensor (flat row-major). */
55
+ export interface MoEParam {
56
+ name: string;
57
+ data: Float32Array;
58
+ numel: number;
59
+ }
60
+
61
+ /** Result of routing a token: which experts fire and with what combine weights. */
62
+ export interface RouteResult {
63
+ /** Indices of the selected top-k experts, highest router logit first. */
64
+ experts: number[];
65
+ /** Combine weights (softmax over the selected logits), index-aligned to `experts`. */
66
+ gates: number[];
67
+ /** Full softmax over ALL experts — the load-balancing signal. */
68
+ probs: Float32Array;
69
+ }
70
+
71
/** Rectified linear unit: returns `x` when it is strictly positive, otherwise 0. */
function relu(x: number): number {
  if (x > 0) return x;
  return 0;
}
74
+
75
/**
 * Two-layer feed-forward expert: y = W2·relu(W1·x + b1) + b2.
 *
 * Every tensor is a flat row-major Float32Array. Each parameter has a
 * same-sized gradient buffer that {@link Expert.backward} adds into; nothing in
 * this class ever clears those buffers.
 */
class Expert {
  // Trainable parameters.
  w1: Float32Array; // hiddenDim × modelDim
  b1: Float32Array; // hiddenDim
  w2: Float32Array; // modelDim × hiddenDim
  b2: Float32Array; // modelDim
  // Gradient accumulators, one per parameter above.
  gW1: Float32Array;
  gB1: Float32Array;
  gW2: Float32Array;
  gB2: Float32Array;

  /**
   * @param modelDim  Input/output width.
   * @param hiddenDim Hidden-layer width.
   * @param gauss     Sampler returning `n` values with the given standard
   *                  deviation; called for `w1` first, then `w2`.
   */
  constructor(
    private readonly modelDim: number,
    private readonly hiddenDim: number,
    gauss: (n: number, std: number) => Float32Array,
  ) {
    // First layer uses a sqrt(2 / fan-in) scale (He-style, suited to ReLU).
    this.w1 = gauss(hiddenDim * modelDim, Math.sqrt(2 / modelDim));
    this.b1 = new Float32Array(hiddenDim);
    // Second layer starts small so an untrained expert contributes little.
    this.w2 = gauss(modelDim * hiddenDim, 0.02);
    this.b2 = new Float32Array(modelDim);

    this.gW1 = new Float32Array(this.w1.length);
    this.gB1 = new Float32Array(this.b1.length);
    this.gW2 = new Float32Array(this.w2.length);
    this.gB2 = new Float32Array(this.b2.length);
  }

  /**
   * Run the expert on one token.
   *
   * @returns `y` (the output) together with `pre` (hidden pre-activations) and
   *          `h` (hidden activations), which {@link Expert.backward} needs.
   */
  forward(x: Float32Array): { y: Float32Array; pre: Float32Array; h: Float32Array } {
    const inDim = this.modelDim;
    const hidDim = this.hiddenDim;

    const pre = new Float32Array(hidDim);
    const h = new Float32Array(hidDim);
    for (let row = 0; row < hidDim; row++) {
      const base = row * inDim;
      let sum = this.b1[row]!;
      for (let col = 0; col < inDim; col++) {
        sum += this.w1[base + col]! * x[col]!;
      }
      pre[row] = sum;
      h[row] = sum > 0 ? sum : 0; // ReLU
    }

    const y = new Float32Array(inDim);
    for (let row = 0; row < inDim; row++) {
      const base = row * hidDim;
      let sum = this.b2[row]!;
      for (let col = 0; col < hidDim; col++) {
        sum += this.w2[base + col]! * h[col]!;
      }
      y[row] = sum;
    }

    return { y, pre, h };
  }

  /**
   * Add one token's gradients into the accumulators.
   *
   * @param dy  dL/dy for this token.
   * @param x   The input that was passed to {@link Expert.forward}.
   * @param pre Hidden pre-activations returned by that forward call.
   * @param h   Hidden activations returned by that forward call.
   * @returns dL/dx.
   */
  backward(dy: Float32Array, x: Float32Array, pre: Float32Array, h: Float32Array): Float32Array {
    const inDim = this.modelDim;
    const hidDim = this.hiddenDim;

    // Output layer: accumulate gW2 / gB2 and push the gradient back to h.
    const gradHidden = new Float32Array(hidDim);
    for (let row = 0; row < inDim; row++) {
      const upstream = dy[row]!;
      const base = row * hidDim;
      this.gB2[row]! += upstream;
      for (let col = 0; col < hidDim; col++) {
        this.gW2[base + col]! += upstream * h[col]!;
        gradHidden[col]! += upstream * this.w2[base + col]!;
      }
    }

    // Hidden layer: gate by the ReLU derivative, then accumulate gW1 / gB1.
    const dx = new Float32Array(inDim);
    for (let row = 0; row < hidDim; row++) {
      const gradPre = pre[row]! > 0 ? gradHidden[row]! : 0;
      const base = row * inDim;
      this.gB1[row]! += gradPre;
      for (let col = 0; col < inDim; col++) {
        this.gW1[base + col]! += gradPre * x[col]!;
        dx[col]! += gradPre * this.w1[base + col]!;
      }
    }

    return dx;
  }

  /** Parameter tensors in the fixed order w1, b1, w2, b2. */
  params(): Float32Array[] {
    return [this.w1, this.b1, this.w2, this.b2];
  }

  /** Gradient buffers in the same order as {@link Expert.params}. */
  grads(): Float32Array[] {
    return [this.gW1, this.gB1, this.gW2, this.gB2];
  }
}
160
+
161
+ /** Per-token forward intermediates retained for the backward pass. */
162
+ interface MoECache {
163
+ x: Float32Array;
164
+ route: RouteResult;
165
+ sharedPre: Float32Array;
166
+ sharedH: Float32Array;
167
+ expertOut: Float32Array[]; // per selected expert, index-aligned to route.experts
168
+ expertPre: Float32Array[];
169
+ expertH: Float32Array[];
170
+ }
171
+
172
/**
 * Accumulates router statistics over a batch to compute the load-balancing
 * auxiliary loss `E · Σ_e f_e · P_e` (Switch/GShard), where `f_e` is the
 * fraction of dispatches that went to expert `e` and `P_e` is the mean router
 * probability for `e`. The value is 1 when both are uniform (f_e = P_e = 1/E)
 * and approaches E when the router collapses onto a single expert. Add it to
 * the task loss with a small coefficient to keep experts busy.
 *
 * The running totals are kept in double precision: a float32 counter stops
 * incrementing at 2^24 and float32 summation rounds after a handful of
 * additions, which would bias the statistics on long accumulations.
 */
export class LoadBalanceAccumulator {
  /** Number of times each expert was selected. */
  private readonly counts: Float64Array;
  /** Sum of the full-softmax router probability for each expert. */
  private readonly probSum: Float64Array;
  /** Number of tokens observed so far. */
  private tokens = 0;

  /** @param numExperts Number of routed experts the statistics are tracked for. */
  constructor(private readonly numExperts: number) {
    this.counts = new Float64Array(numExperts);
    this.probSum = new Float64Array(numExperts);
  }

  /**
   * Record one routed token.
   *
   * @param route Routing decision: `experts` are counted as dispatches and
   *              `probs` is added to the per-expert probability sums.
   */
  observe(route: RouteResult): void {
    this.tokens++;
    for (const e of route.experts) this.counts[e]! += 1;
    for (let e = 0; e < this.numExperts; e++) this.probSum[e]! += route.probs[e]!;
  }

  /** The load-balance loss over everything observed so far (0 if no tokens). */
  loss(): number {
    if (this.tokens === 0) return 0;
    const E = this.numExperts;
    // Total dispatches (= tokens·topK); fall back to 1 to avoid dividing by zero.
    const dispatched = this.counts.reduce((a, b) => a + b, 0) || 1;
    let sum = 0;
    for (let e = 0; e < E; e++) {
      const f = this.counts[e]! / dispatched; // fraction of dispatches to e
      const p = this.probSum[e]! / this.tokens; // mean router prob for e
      sum += f * p;
    }
    return E * sum;
  }
}
205
+
206
/**
 * Shared-expert Mixture-of-Experts FFN for a single token:
 *
 *   y = SharedFFN(x) + Σ_{e ∈ topk(x)} gate_e · Expert_e(x)
 *
 * The shared expert always runs; the router picks `topK` of `numExperts`
 * routed experts by logit and combines them with a softmax over the selected
 * logits. Gradients are accumulated by {@link SharedExpertMoE.backward} and
 * {@link SharedExpertMoE.auxGradStep} and cleared by
 * {@link SharedExpertMoE.zeroGrad}; applying them is left to an optimiser.
 */
export class SharedExpertMoE {
  /** 32-bit words in the checkpoint header: magic, version, modelDim, hiddenDim, numExperts, topK. */
  private static readonly HEADER_WORDS = 6;
  /** Per-expert tensor names, in the order `Expert.params()` / `Expert.grads()` return them. */
  private static readonly TENSOR_NAMES = ["w1", "b1", "w2", "b2"] as const;

  readonly config: Required<Omit<MoEConfig, "seed">>;

  /** Router weights: numExperts × modelDim (no bias). */
  wr: Float32Array;
  private gWr: Float32Array;

  private readonly shared: Expert;
  private readonly experts: Expert[];

  /**
   * @param config Partial configuration; missing (or explicitly `undefined`)
   *               fields fall back to {@link DEFAULT_MOE_CONFIG}, and a missing
   *               `seed` falls back to {@link DEFAULT_MOE_SEED}.
   * @throws Error if `topK` is outside 1..numExperts or any dimension is not a
   *               positive integer.
   */
  constructor(config: Partial<MoEConfig> = {}) {
    // Resolve field by field rather than spreading: a spread lets an explicit
    // `undefined` override the default (and then slip past the range checks
    // below), and it would also copy `seed` into `config`, which is typed
    // without it.
    const cfg: Required<Omit<MoEConfig, "seed">> = {
      modelDim: config.modelDim ?? DEFAULT_MOE_CONFIG.modelDim,
      hiddenDim: config.hiddenDim ?? DEFAULT_MOE_CONFIG.hiddenDim,
      numExperts: config.numExperts ?? DEFAULT_MOE_CONFIG.numExperts,
      topK: config.topK ?? DEFAULT_MOE_CONFIG.topK,
    };
    if (cfg.topK > cfg.numExperts) {
      throw new Error(`MoE topK (${cfg.topK}) must be ≤ numExperts (${cfg.numExperts})`);
    }
    if (cfg.topK < 1) throw new Error(`MoE topK must be ≥ 1 (got ${cfg.topK})`);
    // The dimensions size typed arrays and are stored as uint32 in checkpoints,
    // so fractional, NaN or non-positive values cannot be represented.
    for (const [name, value] of Object.entries(cfg)) {
      if (!Number.isInteger(value) || value < 1) {
        throw new Error(`MoE ${name} must be a positive integer (got ${value})`);
      }
    }
    this.config = cfg;

    // Seed is coerced to uint32; 0 is replaced by 1.
    const rng = new SeededRng(((config.seed ?? DEFAULT_MOE_SEED) >>> 0) || 1);
    // Box–Muller sampler (cosine branch); u1 is clamped away from 0 so log() stays finite.
    const gauss = (n: number, std: number): Float32Array => {
      const a = new Float32Array(n);
      for (let i = 0; i < n; i++) {
        const u1 = Math.max(rng.next(), 1e-12);
        const u2 = rng.next();
        a[i] = std * Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
      }
      return a;
    };

    // Draw order (router, shared expert, routed experts) fixes the weights
    // produced for a given seed; do not reorder.
    this.wr = gauss(cfg.numExperts * cfg.modelDim, 0.02);
    this.gWr = new Float32Array(this.wr.length);
    this.shared = new Expert(cfg.modelDim, cfg.hiddenDim, gauss);
    this.experts = Array.from({ length: cfg.numExperts }, () => new Expert(cfg.modelDim, cfg.hiddenDim, gauss));
  }

  /**
   * Route a token: router logits → top-k → combine gates + full softmax probs.
   *
   * @param x Token vector; the first `modelDim` entries are read.
   * @returns The selected expert indices (highest logit first), their combine
   *          gates (softmax over only the selected logits) and the softmax
   *          over all experts.
   */
  route(x: Float32Array): RouteResult {
    const { numExperts, topK, modelDim } = this.config;
    const logits = new Float32Array(numExperts);
    for (let e = 0; e < numExperts; e++) {
      const off = e * modelDim;
      let acc = 0;
      for (let i = 0; i < modelDim; i++) acc += this.wr[off + i]! * x[i]!;
      logits[e] = acc;
    }
    // Full softmax over all experts (load-balancing signal).
    const probs = softmax(logits);
    // Expert indices ordered by descending logit; keep the first topK.
    const order = Array.from({ length: numExperts }, (_, e) => e).sort((a, b) => logits[b]! - logits[a]!);
    const experts = order.slice(0, topK);
    // Combine gates = softmax over ONLY the selected logits.
    const gates = Array.from(softmax(Float32Array.from(experts, (e) => logits[e]!)));
    return { experts, gates, probs };
  }

  /**
   * Forward a single token.
   *
   * @param input Token vector; entries beyond its length are read as 0 and
   *              entries beyond `modelDim` are ignored.
   * @returns The output, the routing decision, and a cache for {@link backward}.
   */
  forward(input: ArrayLike<number>): { output: Float32Array; route: RouteResult; cache: MoECache } {
    const { modelDim } = this.config;
    const x = Float32Array.from({ length: modelDim }, (_, i) => input[i] ?? 0);
    const route = this.route(x);

    // Start from the always-on shared expert (copied so the sum below does not
    // write into the expert's own output buffer).
    const s = this.shared.forward(x);
    const output = Float32Array.from(s.y);

    const expertOut: Float32Array[] = [];
    const expertPre: Float32Array[] = [];
    const expertH: Float32Array[] = [];
    for (let m = 0; m < route.experts.length; m++) {
      const r = this.experts[route.experts[m]!]!.forward(x);
      const g = route.gates[m]!;
      for (let d = 0; d < modelDim; d++) output[d]! += g * r.y[d]!;
      expertOut.push(r.y);
      expertPre.push(r.pre);
      expertH.push(r.h);
    }

    return {
      output,
      route,
      cache: { x, route, sharedPre: s.pre, sharedH: s.h, expertOut, expertPre, expertH },
    };
  }

  /**
   * Accumulate gradients for one token given dL/d(output). Trains the shared
   * expert, the selected routed experts, and the router (so it learns to weight
   * the experts that reduce loss). Call {@link zeroGrad} before a batch and apply
   * an optimiser after. Load balancing is a separate signal (see
   * {@link LoadBalanceAccumulator}). Returns dL/d(input) so the FFN can stack
   * inside a residual block (e.g. {@link EvermindLM}).
   *
   * @param dOutput dL/d(output); missing entries are read as 0.
   * @param cache   The cache returned by the matching {@link forward} call.
   */
  backward(dOutput: ArrayLike<number>, cache: MoECache): Float32Array {
    const { modelDim } = this.config;
    const dOut = Float32Array.from({ length: modelDim }, (_, d) => dOutput[d] ?? 0);
    const dx = new Float32Array(modelDim);

    // Shared expert (always active) sees the full upstream gradient.
    const dxShared = this.shared.backward(dOut, cache.x, cache.sharedPre, cache.sharedH);
    for (let i = 0; i < modelDim; i++) dx[i]! += dxShared[i]!;

    // Routed experts: each sees the upstream gradient scaled by its gate;
    // dL/dgate_m = <dOut, expertOut_m> is collected for the router.
    const k = cache.route.experts.length;
    const dGate = new Float32Array(k);
    for (let m = 0; m < k; m++) {
      const g = cache.route.gates[m]!;
      const scaled = new Float32Array(modelDim);
      let dg = 0;
      for (let d = 0; d < modelDim; d++) {
        scaled[d] = g * dOut[d]!;
        dg += dOut[d]! * cache.expertOut[m]![d]!;
      }
      const dxe = this.experts[cache.route.experts[m]!]!.backward(
        scaled,
        cache.x,
        cache.expertPre[m]!,
        cache.expertH[m]!,
      );
      for (let i = 0; i < modelDim; i++) dx[i]! += dxe[i]!;
      dGate[m] = dg;
    }

    // Router: gates = softmax(selected logits). Backprop dGate through the
    // softmax Jacobian to the selected logits, then to Wr and the input.
    const gates = cache.route.gates;
    let dot = 0;
    for (let m = 0; m < k; m++) dot += gates[m]! * dGate[m]!;
    for (let m = 0; m < k; m++) {
      const dLogit = gates[m]! * (dGate[m]! - dot);
      const off = cache.route.experts[m]! * modelDim;
      for (let i = 0; i < modelDim; i++) {
        this.gWr[off + i]! += dLogit * cache.x[i]!;
        dx[i]! += dLogit * this.wr[off + i]!;
      }
    }
    return dx;
  }

  /**
   * Add the load-balancing auxiliary-loss gradient for one token into the router
   * gradient. `L_aux = E·Σ_e f_e·P̄_e` (Switch/GShard); `f` (per-batch dispatch
   * fractions) is treated as a stop-grad constant, so only the full softmax `P`
   * carries gradient: ∂L_aux/∂logit_j = scale·P_j·(f_j − Σ_e f_e·P_e), where the
   * caller passes `scale = auxWeight·E/T`. Keeps the router from collapsing onto a
   * few experts. Call once per token over the batch, after {@link backward}.
   *
   * @param x     The token vector that was routed.
   * @param probs Full softmax over all experts for this token.
   * @param f     Per-expert dispatch fractions for the batch.
   * @param scale Multiplier applied to the gradient.
   */
  auxGradStep(x: Float32Array, probs: Float32Array, f: Float32Array, scale: number): void {
    const { numExperts, modelDim } = this.config;
    let fp = 0;
    for (let e = 0; e < numExperts; e++) fp += f[e]! * probs[e]!;
    for (let j = 0; j < numExperts; j++) {
      const coeff = scale * probs[j]! * (f[j]! - fp);
      if (coeff === 0) continue; // nothing to add for this expert's row
      const off = j * modelDim;
      for (let i = 0; i < modelDim; i++) this.gWr[off + i]! += coeff * x[i]!;
    }
  }

  // ── Parameters / checkpoint ────────────────────────────────────────────────

  /** All trainable parameters in canonical order: router, shared, then experts. */
  parameters(): MoEParam[] {
    return this.namedTensors(this.wr, (e) => e.params());
  }

  /** Gradient buffers, index-aligned with {@link parameters}. */
  gradients(): MoEParam[] {
    return this.namedTensors(this.gWr, (e) => e.grads());
  }

  /** Zero every gradient buffer returned by {@link gradients}. */
  zeroGrad(): void {
    for (const g of this.gradients()) g.data.fill(0);
  }

  /**
   * One routed expert's weights as a standalone checkpoint (the web-paging unit).
   * The returned entries reference the live weight buffers, not copies.
   *
   * @throws Error if `index` does not address a routed expert.
   */
  exportExpert(index: number): MoEParam[] {
    const e = this.experts[index];
    if (!e) throw new Error(`exportExpert: index ${index} out of range (0..${this.config.numExperts - 1})`);
    return e.params().map((p, i) => ({ name: SharedExpertMoE.TENSOR_NAMES[i]!, data: p, numel: p.length }));
  }

  /**
   * Serialise all weights to a compact "MoE0" binary. Layout: magic, version,
   * [modelDim, hiddenDim, numExperts, topK], then params in {@link parameters}
   * order. fp16 (v2) halves the size; f32 (v1) is exact.
   */
  exportWeights(opts: { fp16?: boolean } = {}): ArrayBuffer {
    const fp16 = opts.fp16 ?? false;
    const params = this.parameters();
    const total = params.reduce((n, p) => n + p.numel, 0);
    const headerBytes = SharedExpertMoE.HEADER_WORDS * 4;
    const buf = new ArrayBuffer(headerBytes + total * (fp16 ? 2 : 4));

    const head = new Uint32Array(buf, 0, SharedExpertMoE.HEADER_WORDS);
    head[0] = MAGIC;
    head[1] = fp16 ? 2 : 1;
    head[2] = this.config.modelDim;
    head[3] = this.config.hiddenDim;
    head[4] = this.config.numExperts;
    head[5] = this.config.topK;

    // Concatenate every parameter into `dst` in canonical order.
    const pack = (dst: Float32Array): Float32Array => {
      let o = 0;
      for (const p of params) {
        dst.set(p.data, o);
        o += p.numel;
      }
      return dst;
    };

    if (fp16) {
      new Uint16Array(buf, headerBytes, total).set(quantizeFp16(pack(new Float32Array(total))));
    } else {
      // The header is a multiple of 4 bytes, so the payload can be written in place.
      pack(new Float32Array(buf, headerBytes, total));
    }
    return buf;
  }

  /**
   * Load weights from an "MoE0" binary. Validates the magic number, the
   * dimensions, the version and the payload size before any weight is
   * written, so a rejected checkpoint leaves the model untouched. Bytes after
   * the expected payload are ignored.
   *
   * @throws RangeError if the buffer is too short for the header or payload.
   * @throws Error      on a bad magic number, a config mismatch, or an
   *                    unsupported version.
   */
  loadWeights(buffer: ArrayBuffer): void {
    const headerBytes = SharedExpertMoE.HEADER_WORDS * 4;
    if (buffer.byteLength < headerBytes) {
      throw new RangeError(
        `SharedExpertMoE.loadWeights: buffer too small for an MoE0 header (${buffer.byteLength} < ${headerBytes} bytes)`,
      );
    }
    const head = new Uint32Array(buffer, 0, SharedExpertMoE.HEADER_WORDS);
    if (head[0] !== MAGIC) throw new Error("SharedExpertMoE.loadWeights: bad magic (not an MoE0 checkpoint)");
    const [, version, modelDim, hiddenDim, numExperts, topK] = head;
    if (
      modelDim !== this.config.modelDim ||
      hiddenDim !== this.config.hiddenDim ||
      numExperts !== this.config.numExperts ||
      topK !== this.config.topK
    ) {
      throw new Error("SharedExpertMoE.loadWeights: config mismatch with checkpoint");
    }
    if (version !== 1 && version !== 2) {
      throw new Error(`SharedExpertMoE.loadWeights: unsupported checkpoint version ${version}`);
    }

    const params = this.parameters();
    const total = params.reduce((n, p) => n + p.numel, 0);
    const needed = headerBytes + total * (version === 2 ? 2 : 4);
    // Without this check a short f32 payload would overwrite only a prefix of
    // the weights and report success.
    if (buffer.byteLength < needed) {
      throw new RangeError(
        `SharedExpertMoE.loadWeights: truncated checkpoint (need ${needed} bytes, got ${buffer.byteLength})`,
      );
    }

    const flat =
      version === 2
        ? dequantizeFp16(new Uint16Array(buffer, headerBytes, total))
        : new Float32Array(buffer, headerBytes, total);
    let o = 0;
    for (const p of params) {
      p.data.set(flat.subarray(o, o + p.numel));
      o += p.numel;
    }
  }

  /**
   * Build the canonical named-tensor list: the router buffer as "wr", then
   * "shared.*", then "expert{i}.*" for every routed expert.
   */
  private namedTensors(router: Float32Array, tensorsOf: (e: Expert) => Float32Array[]): MoEParam[] {
    const out: MoEParam[] = [{ name: "wr", data: router, numel: router.length }];
    const add = (prefix: string, e: Expert): void => {
      tensorsOf(e).forEach((t, i) =>
        out.push({ name: `${prefix}.${SharedExpertMoE.TENSOR_NAMES[i]}`, data: t, numel: t.length }),
      );
    };
    add("shared", this.shared);
    this.experts.forEach((e, idx) => add(`expert${idx}`, e));
    return out;
  }
}
462
+
463
/**
 * Softmax over a flat array. The largest logit is subtracted before
 * exponentiating so the exponentials cannot overflow for large finite inputs.
 */
function softmax(logits: Float32Array): Float32Array {
  let peak = -Infinity;
  for (const v of logits) {
    if (v > peak) peak = v;
  }

  const exps = new Float32Array(logits.length);
  let total = 0;
  logits.forEach((v, idx) => {
    exps[idx] = Math.exp(v - peak);
    // Sum the stored (float32-rounded) value, not the raw exponential.
    total += exps[idx]!;
  });

  return exps.map((e) => e / total);
}