@seanhogg/builderforce-memory-engine 2026.6.27 → 2026.6.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/dist/index.d.ts +6 -0
  2. package/dist/index.d.ts.map +1 -1
  3. package/dist/index.js +5 -0
  4. package/dist/index.js.map +1 -1
  5. package/dist/lm/evermind_lm.d.ts +148 -0
  6. package/dist/lm/evermind_lm.d.ts.map +1 -0
  7. package/dist/lm/evermind_lm.js +479 -0
  8. package/dist/lm/evermind_lm.js.map +1 -0
  9. package/dist/lm/index.d.ts +6 -0
  10. package/dist/lm/index.d.ts.map +1 -0
  11. package/dist/lm/index.js +5 -0
  12. package/dist/lm/index.js.map +1 -0
  13. package/dist/model/attention_block.js +1 -1
  14. package/dist/model/attention_block.js.map +1 -1
  15. package/dist/model/mamba_model.js +1 -1
  16. package/dist/model/mamba_model.js.map +1 -1
  17. package/dist/moe/index.d.ts +10 -0
  18. package/dist/moe/index.d.ts.map +1 -0
  19. package/dist/moe/index.js +7 -0
  20. package/dist/moe/index.js.map +1 -0
  21. package/dist/moe/moe_model.d.ts +134 -0
  22. package/dist/moe/moe_model.d.ts.map +1 -0
  23. package/dist/moe/moe_model.js +415 -0
  24. package/dist/moe/moe_model.js.map +1 -0
  25. package/dist/moe/moe_package.d.ts +81 -0
  26. package/dist/moe/moe_package.d.ts.map +1 -0
  27. package/dist/moe/moe_package.js +157 -0
  28. package/dist/moe/moe_package.js.map +1 -0
  29. package/dist/moe/moe_trainer.d.ts +53 -0
  30. package/dist/moe/moe_trainer.d.ts.map +1 -0
  31. package/dist/moe/moe_trainer.js +93 -0
  32. package/dist/moe/moe_trainer.js.map +1 -0
  33. package/dist/optim/adamw.d.ts +32 -0
  34. package/dist/optim/adamw.d.ts.map +1 -0
  35. package/dist/optim/adamw.js +52 -0
  36. package/dist/optim/adamw.js.map +1 -0
  37. package/package.json +1 -1
  38. package/src/index.ts +28 -0
  39. package/src/lm/evermind_lm.ts +558 -0
  40. package/src/lm/index.ts +6 -0
  41. package/src/model/attention_block.ts +1 -1
  42. package/src/model/mamba_model.ts +1 -1
  43. package/src/moe/index.ts +23 -0
  44. package/src/moe/moe_model.ts +475 -0
  45. package/src/moe/moe_package.ts +205 -0
  46. package/src/moe/moe_trainer.ts +134 -0
  47. package/src/optim/adamw.ts +72 -0
@@ -0,0 +1,558 @@
1
+ /**
2
+ * evermind_lm.ts — EvermindLM: a small but complete generative language model.
3
+ *
4
+ * This is what turns a trained checkpoint into an *AI that generates text* (the
5
+ * thing a marketplace buyer actually runs). Architecture (Mamba-flavoured, the
6
+ * minimal exact-gradient CPU reference):
7
+ *
8
+ * x_t = Embed[token_t]
9
+ * per layer:
10
+ * x_t += DepthwiseCausalConv(RMSNorm(x))_t // temporal mixing (pre-norm, short conv)
11
+ * x_t += SharedExpertMoE(RMSNorm(x_t)) // per-position channel mixing (pre-norm, sparse)
12
+ * logits_t = x_t · Embedᵀ // tied output head
13
+ *
14
+ * The token mixer is a depthwise causal convolution (each channel sees a short
15
+ * window of its own past — Mamba's pre-conv) and the channel mixer is the
16
+ * shared-expert MoE, so the model is genuinely sparse. Embeddings are tied
17
+ * (input lookup == output head), which the gradient code accounts for.
18
+ *
19
+ * Pure CPU, exact forward + backward (finite-difference checked), reusing the
20
+ * engine's MoE, cross-entropy, and AdamW. The WGSL/WebGPU path is a future
21
+ * acceleration with the same shapes.
22
+ */
23
+
24
+ import { SharedExpertMoE } from "../moe/moe_model.js";
25
+ import { crossEntropyLoss, crossEntropyGrad } from "../training/autograd.js";
26
+ import { AdamW, type AdamWOptions } from "../optim/adamw.js";
27
+ import { SeededRng } from "../utils/rng.js";
28
+ import { quantizeFp16, dequantizeFp16 } from "../utils/quantization.js";
29
+
30
/**
 * Construction options for {@link EvermindLM}.
 *
 * Only `vocabSize` is mandatory. Every other field is optional and falls back
 * to the documented default inside the `EvermindLM` constructor (which reads
 * each of them with `??`), so callers may pass just `{ vocabSize }`.
 */
export interface EvermindLMConfig {
  /** Vocabulary size. Required; must be > 0. */
  vocabSize: number;
  /** Model (channel) dimension. Default 64. */
  dModel?: number;
  /** Number of (conv + MoE) blocks. Default 2. */
  numLayers?: number;
  /** Causal conv kernel width. Default 3. */
  convKernel?: number;
  /** Hidden width of each MoE expert FFN. Default 2·dModel. */
  hiddenDim?: number;
  /** Routed experts per MoE layer. Default 4. */
  numExperts?: number;
  /** Experts activated per token. Default 2; must be ≤ numExperts. */
  topK?: number;
  /** Deterministic init seed. */
  seed?: number;
}
48
+
49
+ export const DEFAULT_LM_CONFIG: Required<Omit<EvermindLMConfig, "seed" | "vocabSize">> = {
50
+ dModel: 64,
51
+ numLayers: 2,
52
+ convKernel: 3,
53
+ hiddenDim: 128,
54
+ numExperts: 4,
55
+ topK: 2,
56
+ };
57
+
58
+ export const DEFAULT_LM_SEED = 0x45564c4d; // "EVLM"
59
+ const MAGIC = 0x45564c30; // "EVL0"
60
+
61
+ interface MoECacheLike {
62
+ x: Float32Array;
63
+ route: { experts: number[]; gates: number[]; probs: Float32Array };
64
+ sharedPre: Float32Array;
65
+ sharedH: Float32Array;
66
+ expertOut: Float32Array[];
67
+ expertPre: Float32Array[];
68
+ expertH: Float32Array[];
69
+ }
70
+
71
+ interface LayerCache {
72
+ layerIn: Float32Array[]; // residual base for the conv sub-block (the layer input)
73
+ normedConv: Float32Array[]; // RMSNorm(layerIn) — the conv input
74
+ rmsConv: number[]; // per-position RMS denom for the conv norm
75
+ afterConv: Float32Array[]; // residual base for the MoE sub-block
76
+ rmsMoe: number[]; // per-position RMS denom for the MoE norm
77
+ moeCache: MoECacheLike[]; // per position
78
+ }
79
+
80
+ interface ForwardCache {
81
+ tokens: number[];
82
+ layers: LayerCache[];
83
+ finalX: Float32Array[]; // per position, fed to the tied head
84
+ }
85
+
86
+ /** A tokenizer the LM can read/write text through (the engine's `BPETokenizer` fits). */
87
+ export interface TextCodec {
88
+ encode(text: string): number[];
89
+ decode(ids: number[]): string;
90
+ }
91
+
92
+ export interface LMGenerateOptions {
93
+ maxNewTokens: number;
94
+ /** Sampling temperature; ≤0 ⇒ greedy argmax. Default 0 (greedy). */
95
+ temperature?: number;
96
+ /** Deterministic sampler seed (only used when temperature > 0). */
97
+ seed?: number;
98
+ /** Stop generating when this token id is produced. */
99
+ stopToken?: number;
100
+ }
101
+
102
/**
 * EvermindLM — a tied-embedding language model made of `numLayers` blocks, each
 * a pre-norm depthwise causal conv with a residual connection followed by a
 * pre-norm shared-expert MoE with a residual connection.
 *
 * The class provides the forward pass, a backward pass for next-token
 * cross-entropy that accumulates into internal gradient buffers, autoregressive
 * generation, and "EVL0" checkpoint export/import.
 */
export class EvermindLM {
  /** Number of Uint32 header slots in an "EVL0" checkpoint. */
  private static readonly HEADER_ELS = 9;

  /** Fully resolved configuration (defaults applied; the seed is not retained). */
  readonly config: Required<Omit<EvermindLMConfig, "seed">>;

  /** Tied token embedding / output head: vocabSize × dModel (row-major). */
  emb: Float32Array;
  private gEmb: Float32Array;
  /** Per-layer depthwise causal conv kernels: dModel × convKernel. */
  private readonly conv: Float32Array[];
  private readonly gConv: Float32Array[];
  /** Per-layer pre-conv / pre-MoE RMSNorm gains (dModel each). */
  private readonly nConv: Float32Array[];
  private readonly gNConv: Float32Array[];
  private readonly nMoe: Float32Array[];
  private readonly gNMoe: Float32Array[];
  /** Per-layer channel mixer. */
  private readonly moe: SharedExpertMoE[];

  /**
   * Build a freshly initialised model.
   *
   * @param config Model dimensions; omitted fields fall back to
   *   `DEFAULT_LM_CONFIG` (`hiddenDim` falls back to `2 · dModel`).
   * @throws Error when `vocabSize` ≤ 0, `topK` > `numExperts`, or a structural
   *   dimension is not an integer in its valid range.
   */
  constructor(config: EvermindLMConfig) {
    const dModel = config.dModel ?? DEFAULT_LM_CONFIG.dModel;
    const cfg: Required<Omit<EvermindLMConfig, "seed">> = {
      vocabSize: config.vocabSize,
      dModel,
      numLayers: config.numLayers ?? DEFAULT_LM_CONFIG.numLayers,
      convKernel: config.convKernel ?? DEFAULT_LM_CONFIG.convKernel,
      hiddenDim: config.hiddenDim ?? dModel * 2,
      numExperts: config.numExperts ?? DEFAULT_LM_CONFIG.numExperts,
      topK: config.topK ?? DEFAULT_LM_CONFIG.topK,
    };
    if (cfg.vocabSize <= 0) throw new Error("EvermindLM: vocabSize must be > 0");
    if (cfg.topK > cfg.numExperts) throw new Error("EvermindLM: topK must be ≤ numExperts");
    // Typed-array lengths silently truncate fractions and NaN, which would
    // produce buffers that disagree with the index arithmetic used below, so
    // reject such dimensions up front.
    const dims: [string, number, number][] = [
      ["vocabSize", cfg.vocabSize, 1],
      ["dModel", cfg.dModel, 1],
      ["numLayers", cfg.numLayers, 0],
      ["convKernel", cfg.convKernel, 0],
    ];
    for (const [name, value, min] of dims) {
      if (!Number.isInteger(value) || value < min) {
        throw new Error(`EvermindLM: ${name} must be an integer ≥ ${min} (got ${value})`);
      }
    }
    this.config = cfg;

    // A seed that coerces to 0 is replaced by 1.
    const seed = (config.seed ?? DEFAULT_LM_SEED) >>> 0 || 1;
    const rng = new SeededRng(seed);
    // Box–Muller transform: n Gaussian samples with the given std.
    const gauss = (n: number, std: number): Float32Array => {
      const a = new Float32Array(n);
      for (let i = 0; i < n; i++) {
        const u1 = Math.max(rng.next(), 1e-12); // keep log() finite
        const u2 = rng.next();
        a[i] = std * Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
      }
      return a;
    };

    this.emb = gauss(cfg.vocabSize * cfg.dModel, 0.02);
    this.gEmb = new Float32Array(this.emb.length);
    this.conv = [];
    this.gConv = [];
    this.nConv = [];
    this.gNConv = [];
    this.nMoe = [];
    this.gNMoe = [];
    this.moe = [];
    for (let l = 0; l < cfg.numLayers; l++) {
      // Conv init near an identity passthrough (current tap = 1, history = 0) so
      // an untrained block is close to a residual no-op.
      const k = new Float32Array(cfg.dModel * cfg.convKernel);
      for (let c = 0; c < cfg.dModel; c++) k[c * cfg.convKernel] = 1;
      this.conv.push(k);
      this.gConv.push(new Float32Array(k.length));
      // RMSNorm gains start at 1 (identity scale).
      this.nConv.push(new Float32Array(cfg.dModel).fill(1));
      this.gNConv.push(new Float32Array(cfg.dModel));
      this.nMoe.push(new Float32Array(cfg.dModel).fill(1));
      this.gNMoe.push(new Float32Array(cfg.dModel));
      // Each MoE layer gets a distinct seed for varied expert init.
      this.moe.push(
        new SharedExpertMoE({
          modelDim: cfg.dModel,
          hiddenDim: cfg.hiddenDim,
          numExperts: cfg.numExperts,
          topK: cfg.topK,
          seed: seed + 1 + l,
        }),
      );
    }
  }

  /**
   * Reject token ids that do not address a row of the embedding table. Without
   * this check an out-of-range id reads `undefined`, which becomes NaN in the
   * activations and, during training, NaN gradients.
   */
  private assertValidTokens(tokens: number[]): void {
    const { vocabSize } = this.config;
    for (let t = 0; t < tokens.length; t++) {
      const tok = tokens[t]!;
      if (!Number.isInteger(tok) || tok < 0 || tok >= vocabSize) {
        throw new RangeError(
          `EvermindLM.forward: token id ${tok} at position ${t} is outside [0, ${vocabSize})`,
        );
      }
    }
  }

  // ── Forward ────────────────────────────────────────────────────────────────

  /**
   * Run the model over a token sequence.
   *
   * @param tokens Token ids, each an integer in `[0, vocabSize)`.
   * @returns One logits vector (length `vocabSize`) per input position, plus the
   *   cache of intermediates that {@link lossAndBackward} consumes.
   * @throws RangeError when a token id is not a valid embedding row index.
   */
  forward(tokens: number[]): { logits: Float32Array[]; cache: ForwardCache } {
    const { dModel, convKernel, numLayers, vocabSize } = this.config;
    const T = tokens.length;
    this.assertValidTokens(tokens);

    // Embed: copy each token's row so later layers never alias the table.
    let x: Float32Array[] = tokens.map((tok) => this.emb.slice(tok * dModel, (tok + 1) * dModel));

    const layers: LayerCache[] = [];
    for (let l = 0; l < numLayers; l++) {
      const layerIn = x;
      const ker = this.conv[l]!;
      const nConv = this.nConv[l]!;
      const nMoe = this.nMoe[l]!;

      // Pre-norm → depthwise causal conv → residual.
      const normedConv: Float32Array[] = [];
      const rmsConv: number[] = [];
      for (let t = 0; t < T; t++) {
        const { y, r } = rmsNorm(layerIn[t]!, nConv);
        normedConv.push(y);
        rmsConv.push(r);
      }
      const afterConv: Float32Array[] = [];
      for (let t = 0; t < T; t++) {
        const out = Float32Array.from(layerIn[t]!); // residual base
        for (let c = 0; c < dModel; c++) {
          let acc = 0;
          // Tap j looks j positions into the past; positions before 0 contribute nothing.
          for (let j = 0; j < convKernel; j++) {
            const ti = t - j;
            if (ti >= 0) acc += ker[c * convKernel + j]! * normedConv[ti]![c]!;
          }
          out[c] = out[c]! + acc;
        }
        afterConv.push(out);
      }

      // Pre-norm → MoE channel mixer → residual.
      const rmsMoe: number[] = [];
      const moeCache: MoECacheLike[] = [];
      const afterMoe: Float32Array[] = [];
      for (let t = 0; t < T; t++) {
        const { y, r } = rmsNorm(afterConv[t]!, nMoe);
        rmsMoe.push(r);
        const out = Float32Array.from(afterConv[t]!); // residual base
        const mr = this.moe[l]!.forward(y);
        for (let c = 0; c < dModel; c++) out[c] = out[c]! + mr.output[c]!;
        afterMoe.push(out);
        moeCache.push(mr.cache as unknown as MoECacheLike);
      }

      layers.push({ layerIn, normedConv, rmsConv, afterConv, rmsMoe, moeCache });
      x = afterMoe;
    }

    // Tied head: logits_t[v] = x_t · emb[v].
    const logits: Float32Array[] = x.map((xt) => {
      const lg = new Float32Array(vocabSize);
      for (let v = 0; v < vocabSize; v++) {
        let acc = 0;
        const off = v * dModel;
        for (let c = 0; c < dModel; c++) acc += xt[c]! * this.emb[off + c]!;
        lg[v] = acc;
      }
      return lg;
    });

    return { logits, cache: { tokens, layers, finalX: x } };
  }

  // ── Loss + backward ──────────────────────────────────────────────────────────

  /**
   * Next-token cross-entropy over the sequence (predict tokens[t+1] from
   * position t), accumulating gradients into the model's gradient buffers.
   * Call {@link zeroGrad} before and an optimiser step after.
   *
   * @param tokens Token ids, each an integer in `[0, vocabSize)`.
   * @returns The mean loss over the `T − 1` predicted positions, or 0 (with no
   *   gradient accumulated) when the sequence has fewer than two tokens.
   * @throws RangeError when a token id is invalid; this is raised by the forward
   *   pass, before any gradient buffer is touched.
   */
  lossAndBackward(tokens: number[]): number {
    const { dModel, convKernel, numLayers, vocabSize } = this.config;
    const T = tokens.length;
    if (T < 2) return 0;
    const { logits, cache } = this.forward(tokens);

    const predPositions = T - 1; // positions 0..T-2 predict the next token
    const inv = 1 / predPositions;

    // dL/d(finalX_t) and head gradient into the tied embedding.
    const dX: Float32Array[] = Array.from({ length: T }, () => new Float32Array(dModel));
    let loss = 0;
    for (let t = 0; t < predPositions; t++) {
      const target = tokens[t + 1]!;
      loss += crossEntropyLoss(logits[t]!, target) * inv;
      const dLogit = crossEntropyGrad(logits[t]!, target); // probs - onehot
      const xt = cache.finalX[t]!;
      for (let v = 0; v < vocabSize; v++) {
        const g = dLogit[v]! * inv;
        if (g === 0) continue;
        const off = v * dModel;
        for (let c = 0; c < dModel; c++) {
          this.gEmb[off + c] = this.gEmb[off + c]! + g * xt[c]!; // head → emb
          dX[t]![c] = dX[t]![c]! + g * this.emb[off + c]!; // head → x_t
        }
      }
    }

    // Backprop through layers in reverse.
    for (let l = numLayers - 1; l >= 0; l--) {
      const lc = cache.layers[l]!;
      const ker = this.conv[l]!;
      const gker = this.gConv[l]!;
      const nConv = this.nConv[l]!;
      const gNConv = this.gNConv[l]!;
      const nMoe = this.nMoe[l]!;
      const gNMoe = this.gNMoe[l]!;

      // MoE sub-block: afterMoe = afterConv + MoE(RMSNorm(afterConv, nMoe)).
      const dAfterConv: Float32Array[] = [];
      for (let t = 0; t < T; t++) {
        const dMoeNormed = this.moe[l]!.backward(dX[t]!, lc.moeCache[t] as never);
        const { dx, dgain } = rmsNormBackward(dMoeNormed, lc.afterConv[t]!, lc.rmsMoe[t]!, nMoe);
        for (let c = 0; c < dModel; c++) gNMoe[c] = gNMoe[c]! + dgain[c]!;
        const d = Float32Array.from(dX[t]!); // residual passthrough
        for (let c = 0; c < dModel; c++) d[c] = d[c]! + dx[c]!;
        dAfterConv.push(d);
      }

      // Conv sub-block: afterConv = layerIn + conv(RMSNorm(layerIn, nConv)).
      const dNormedConv: Float32Array[] = Array.from({ length: T }, () => new Float32Array(dModel));
      const dLayerIn: Float32Array[] = dAfterConv.map((v) => Float32Array.from(v)); // residual passthrough
      for (let t = 0; t < T; t++) {
        for (let c = 0; c < dModel; c++) {
          const dmix = dAfterConv[t]![c]!;
          if (dmix === 0) continue;
          for (let j = 0; j < convKernel; j++) {
            const ti = t - j;
            if (ti < 0) continue;
            gker[c * convKernel + j] = gker[c * convKernel + j]! + dmix * lc.normedConv[ti]![c]!;
            dNormedConv[ti]![c] = dNormedConv[ti]![c]! + dmix * ker[c * convKernel + j]!;
          }
        }
      }
      for (let t = 0; t < T; t++) {
        const { dx, dgain } = rmsNormBackward(dNormedConv[t]!, lc.layerIn[t]!, lc.rmsConv[t]!, nConv);
        for (let c = 0; c < dModel; c++) {
          gNConv[c] = gNConv[c]! + dgain[c]!;
          dLayerIn[t]![c] = dLayerIn[t]![c]! + dx[c]!;
        }
      }
      for (let t = 0; t < T; t++) dX[t] = dLayerIn[t]!;
    }

    // Embedding lookup: dX at layer-0 input flows into the row for token_t.
    for (let t = 0; t < T; t++) {
      const off = tokens[t]! * dModel;
      for (let c = 0; c < dModel; c++) this.gEmb[off + c] = this.gEmb[off + c]! + dX[t]![c]!;
    }

    return loss;
  }

  // ── Generation ───────────────────────────────────────────────────────────────

  /**
   * Text-level generation: encode the prompt with `codec`, generate, and decode
   * the newly generated ids back to text. The codec's ids must fall inside the
   * model's `vocabSize`.
   *
   * @returns The decoded text of the NEW tokens only (the prompt is not echoed).
   */
  generateText(prompt: string, codec: TextCodec, opts: LMGenerateOptions): string {
    return codec.decode(this.generate(codec.encode(prompt), opts));
  }

  /**
   * Greedy / temperature-sampled autoregressive generation.
   *
   * Each step re-runs the full forward pass over the prompt plus everything
   * generated so far. An empty prompt is run as the single token `0`.
   *
   * @returns The NEW token ids, including the stop token when one was produced.
   */
  generate(prompt: number[], opts: LMGenerateOptions): number[] {
    const temperature = opts.temperature ?? 0;
    // The sampler is only created when sampling; temperature ≤ 0 means greedy.
    const rng = temperature > 0 ? new SeededRng((opts.seed ?? 1) >>> 0 || 1) : null;
    const tokens = [...prompt];
    const produced: number[] = [];
    for (let n = 0; n < opts.maxNewTokens; n++) {
      const { logits } = this.forward(tokens.length > 0 ? tokens : [0]);
      const last = logits[logits.length - 1]!;
      const next = rng ? sampleTemperature(last, temperature, rng) : argmax(last);
      produced.push(next);
      tokens.push(next);
      if (opts.stopToken !== undefined && next === opts.stopToken) break;
    }
    return produced;
  }

  // ── Parameters / checkpoint ──────────────────────────────────────────────────

  /**
   * All trainable parameters as `{data}` records in canonical order: the
   * embedding, then per layer the conv kernel, conv-norm gain, MoE-norm gain and
   * the MoE layer's own parameters. The arrays are the live buffers, not copies.
   */
  parameters(): { data: Float32Array }[] {
    const out: { data: Float32Array }[] = [{ data: this.emb }];
    for (let l = 0; l < this.config.numLayers; l++) {
      out.push({ data: this.conv[l]! }, { data: this.nConv[l]! }, { data: this.nMoe[l]! });
      for (const p of this.moe[l]!.parameters()) out.push({ data: p.data });
    }
    return out;
  }

  /** Gradient buffers, index-aligned with {@link parameters}. */
  gradients(): { data: Float32Array }[] {
    const out: { data: Float32Array }[] = [{ data: this.gEmb }];
    for (let l = 0; l < this.config.numLayers; l++) {
      out.push({ data: this.gConv[l]! }, { data: this.gNConv[l]! }, { data: this.gNMoe[l]! });
      for (const g of this.moe[l]!.gradients()) out.push({ data: g.data });
    }
    return out;
  }

  /** Reset every gradient buffer (including each MoE layer's) to zero. */
  zeroGrad(): void {
    this.gEmb.fill(0);
    for (let l = 0; l < this.config.numLayers; l++) {
      this.gConv[l]!.fill(0);
      this.gNConv[l]!.fill(0);
      this.gNMoe[l]!.fill(0);
      this.moe[l]!.zeroGrad();
    }
  }

  /**
   * Serialise to an "EVL0" binary: a 9 × Uint32 header followed by every
   * parameter in {@link parameters} order, stored as f32 (version 1) or fp16
   * (version 2).
   */
  exportWeights(opts: { fp16?: boolean } = {}): ArrayBuffer {
    const fp16 = opts.fp16 ?? false;
    const params = this.parameters();
    const total = params.reduce((n, p) => n + p.data.length, 0);
    // magic, version, vocab, dModel, numLayers, convKernel, hiddenDim, numExperts, topK.
    // numExperts and topK get distinct slots (an earlier *16 packing collided once
    // numExperts ≥ 16 — e.g. (20,20) and (21,4) both packed to 340).
    const headerEls = EvermindLM.HEADER_ELS;
    const headerBytes = headerEls * 4;
    const buf = new ArrayBuffer(headerBytes + (fp16 ? total * 2 : total * 4));
    const head = new Uint32Array(buf, 0, headerEls);
    head[0] = MAGIC;
    head[1] = fp16 ? 2 : 1;
    head[2] = this.config.vocabSize;
    head[3] = this.config.dModel;
    head[4] = this.config.numLayers;
    head[5] = this.config.convKernel;
    head[6] = this.config.hiddenDim;
    head[7] = this.config.numExperts;
    head[8] = this.config.topK;
    const flat = new Float32Array(total);
    let o = 0;
    for (const p of params) {
      flat.set(p.data, o);
      o += p.data.length;
    }
    if (fp16) new Uint16Array(buf, headerBytes, total).set(quantizeFp16(flat));
    else new Float32Array(buf, headerBytes, total).set(flat);
    return buf;
  }

  /**
   * Load weights from an "EVL0" binary produced by {@link exportWeights}.
   *
   * Everything is validated before any parameter is overwritten, so a rejected
   * checkpoint leaves the model untouched. Trailing bytes after the payload are
   * ignored.
   *
   * @throws Error when the buffer is too small for a header, the magic or
   *   version is unrecognised, the header dimensions differ from this model's
   *   config, or the payload is shorter than the model's parameter count.
   */
  loadWeights(buffer: ArrayBuffer): void {
    const headerEls = EvermindLM.HEADER_ELS;
    const headerBytes = headerEls * 4;
    if (buffer.byteLength < headerBytes) {
      throw new Error("EvermindLM.loadWeights: buffer too small for an EVL0 header");
    }
    const head = new Uint32Array(buffer, 0, headerEls);
    if (head[0] !== MAGIC) throw new Error("EvermindLM.loadWeights: bad magic (not an EVL0 checkpoint)");
    const version = head[1]!;
    if (version !== 1 && version !== 2) {
      throw new Error(`EvermindLM.loadWeights: unsupported checkpoint version ${version}`);
    }
    if (
      head[2] !== this.config.vocabSize ||
      head[3] !== this.config.dModel ||
      head[4] !== this.config.numLayers ||
      head[5] !== this.config.convKernel ||
      head[6] !== this.config.hiddenDim ||
      head[7] !== this.config.numExperts ||
      head[8] !== this.config.topK
    ) {
      throw new Error("EvermindLM.loadWeights: config mismatch with checkpoint");
    }
    const params = this.parameters();
    const total = params.reduce((n, p) => n + p.data.length, 0);
    // A short payload must be rejected explicitly: copying from a clamped slice
    // would otherwise overwrite only a prefix of the parameters without any error.
    const bytesPerEl = version === 2 ? 2 : 4;
    if (buffer.byteLength - headerBytes < total * bytesPerEl) {
      throw new Error("EvermindLM.loadWeights: checkpoint is truncated (payload shorter than the model)");
    }
    // headerBytes (36) is a multiple of 4, so both views are correctly aligned.
    const flat =
      version === 2
        ? dequantizeFp16(new Uint16Array(buffer, headerBytes, total))
        : new Float32Array(buffer, headerBytes, total);
    let o = 0;
    for (const p of params) {
      p.data.set(flat.subarray(o, o + p.data.length));
      o += p.data.length;
    }
  }
}
470
+
471
/**
 * Bare-bones trainer for {@link EvermindLM}: one AdamW step per sequence on the
 * next-token cross-entropy loss.
 */
export class EvermindLMTrainer {
  private readonly adam: AdamW;

  /**
   * @param model The model whose parameters are optimised in place.
   * @param opts AdamW options, plus an optional `epochs` count (default 1).
   */
  constructor(
    private readonly model: EvermindLM,
    private readonly opts: AdamWOptions & { epochs?: number } = {},
  ) {
    this.adam = new AdamW(model, opts);
  }

  /**
   * Train on a set of token sequences. Sequences with fewer than two tokens are
   * skipped, since they offer nothing to predict.
   *
   * @returns One mean loss per epoch (0 for an epoch with no usable sequence).
   */
  fit(sequences: number[][]): number[] {
    const epochCount = this.opts.epochs ?? 1;
    const meanLosses: number[] = [];
    let epoch = 0;
    while (epoch < epochCount) {
      let lossSum = 0;
      let used = 0;
      for (const sequence of sequences) {
        if (sequence.length >= 2) {
          this.model.zeroGrad();
          lossSum += this.model.lossAndBackward(sequence);
          this.adam.step();
          used += 1;
        }
      }
      meanLosses.push(used > 0 ? lossSum / used : 0);
      epoch += 1;
    }
    return meanLosses;
  }
}
499
+
500
/** Stabiliser added under the square root of the RMS denominator. */
const RMS_EPS = 1e-5;

/**
 * RMSNorm over a single vector: `y[c] = gain[c] · x[c] / r`, where
 * `r = sqrt(mean(x²) + RMS_EPS)`.
 *
 * @returns The normalised vector `y` and the denominator `r`, which the backward
 *   pass needs.
 */
function rmsNorm(x: Float32Array, gain: Float32Array): { y: Float32Array; r: number } {
  const width = x.length;
  let sumSquares = 0;
  for (const value of x) sumSquares += value * value;
  const r = Math.sqrt(sumSquares / width + RMS_EPS);
  const y = x.map((value, c) => (gain[c]! * value) / r);
  return { y, r };
}
512
+
513
/**
 * Backward pass of RMSNorm for one vector.
 *
 * With `A = Σ_c dy_c · gain_c · x_c` and `D = x.length`:
 *   dx_j    = gain_j · dy_j / r − x_j · A / (D · r³)
 *   dgain_c = dy_c · x_c / r
 *
 * @param dy Upstream gradient dL/dy.
 * @param x The input that was normalised in the forward pass.
 * @param r The denominator returned by the forward pass.
 * @param gain The gain vector used in the forward pass.
 * @returns dL/dx and dL/dgain.
 */
function rmsNormBackward(
  dy: Float32Array,
  x: Float32Array,
  r: number,
  gain: Float32Array,
): { dx: Float32Array; dgain: Float32Array } {
  const width = x.length;
  // Projection of the scaled upstream gradient onto the input.
  const projection = dy.reduce((acc, d, c) => (c < width ? acc + d * gain[c]! * x[c]! : acc), 0);
  const rCubed = r * r * r;
  const dx = x.map((xc, c) => (gain[c]! * dy[c]!) / r - (xc * projection) / (width * rCubed));
  const dgain = x.map((xc, c) => (dy[c]! * xc) / r);
  return { dx, dgain };
}
536
+
537
/**
 * Index of the largest entry of `v`. Ties resolve to the earliest index, and an
 * empty vector yields 0.
 */
function argmax(v: Float32Array): number {
  return v.reduce((winner, value, idx) => (value > v[winner]! ? idx : winner), 0);
}
542
+
543
/**
 * Draw one index from softmax(logits / temperature) using `rng`.
 *
 * The scaled logits are shifted by their maximum before exponentiation so the
 * exponentials cannot overflow. A single uniform draw is then walked through the
 * unnormalised cumulative mass; if rounding leaves the walk unfinished, the last
 * index is returned.
 */
function sampleTemperature(logits: Float32Array, temperature: number, rng: SeededRng): number {
  const scaled = Float64Array.from(logits, (logit) => logit / temperature);
  const peak = scaled.reduce((hi, s) => (s > hi ? s : hi), -Infinity);
  const weights = Float32Array.from(scaled, (s) => Math.exp(s - peak));
  const mass = weights.reduce((acc, w) => acc + w, 0);

  let remaining = rng.next() * mass;
  let idx = 0;
  while (idx < weights.length) {
    remaining -= weights[idx]!;
    if (remaining <= 0) return idx;
    idx += 1;
  }
  return weights.length - 1;
}
@@ -0,0 +1,6 @@
1
+ /**
2
+ * EvermindLM — the generative language model (the runnable "AI").
3
+ */
4
+
5
+ export { EvermindLM, EvermindLMTrainer, DEFAULT_LM_CONFIG, DEFAULT_LM_SEED } from "./evermind_lm.js";
6
+ export type { EvermindLMConfig, LMGenerateOptions, TextCodec } from "./evermind_lm.js";
@@ -114,7 +114,7 @@ export class AttentionBlock implements SequenceLayer {
114
114
  }
115
115
 
116
116
  private _initWeights(): void {
117
- const { dModel, nHeads, hasFfn, ffnMult } = this.config;
117
+ const { dModel, hasFfn, ffnMult } = this.config;
118
118
 
119
119
  const randn = (n: number, std = 0.02): Float32Array => gaussianArray(n, std);
120
120
 
@@ -375,7 +375,7 @@ export class HybridMambaModel {
375
375
  const { temperature = 1.0, topK = 50, topP = 0.9 } = samplingOpts;
376
376
  const { vocabSize } = this.config;
377
377
 
378
- let ids = [...promptIds];
378
+ const ids = [...promptIds];
379
379
 
380
380
  for (let step = 0; step < maxNewTokens; step++) {
381
381
  const { logits } = await this.forward(new Uint32Array(ids), 1, ids.length);
@@ -0,0 +1,23 @@
1
+ /**
2
+ * Mixture-of-Experts — shared-expert hybrid sparsity for the Evermind generator.
3
+ */
4
+
5
+ export {
6
+ SharedExpertMoE,
7
+ LoadBalanceAccumulator,
8
+ DEFAULT_MOE_CONFIG,
9
+ DEFAULT_MOE_SEED,
10
+ } from "./moe_model.js";
11
+ export type { MoEConfig, MoEParam, RouteResult } from "./moe_model.js";
12
+
13
+ export { MoETrainer } from "./moe_trainer.js";
14
+ export type { MoESample, MoETrainOptions, MoEEpochResult } from "./moe_trainer.js";
15
+
16
+ export { EvermindModelPackage } from "./moe_package.js";
17
+ export type {
18
+ EvermindModelManifest,
19
+ EvermindModelCard,
20
+ EvermindModelType,
21
+ PackageMeta,
22
+ ValidationResult,
23
+ } from "./moe_package.js";