@seanhogg/builderforce-memory-engine 2026.6.27 → 2026.6.28

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (47) hide show
  1. package/dist/index.d.ts +6 -0
  2. package/dist/index.d.ts.map +1 -1
  3. package/dist/index.js +5 -0
  4. package/dist/index.js.map +1 -1
  5. package/dist/lm/evermind_lm.d.ts +148 -0
  6. package/dist/lm/evermind_lm.d.ts.map +1 -0
  7. package/dist/lm/evermind_lm.js +479 -0
  8. package/dist/lm/evermind_lm.js.map +1 -0
  9. package/dist/lm/index.d.ts +6 -0
  10. package/dist/lm/index.d.ts.map +1 -0
  11. package/dist/lm/index.js +5 -0
  12. package/dist/lm/index.js.map +1 -0
  13. package/dist/model/attention_block.js +1 -1
  14. package/dist/model/attention_block.js.map +1 -1
  15. package/dist/model/mamba_model.js +1 -1
  16. package/dist/model/mamba_model.js.map +1 -1
  17. package/dist/moe/index.d.ts +10 -0
  18. package/dist/moe/index.d.ts.map +1 -0
  19. package/dist/moe/index.js +7 -0
  20. package/dist/moe/index.js.map +1 -0
  21. package/dist/moe/moe_model.d.ts +134 -0
  22. package/dist/moe/moe_model.d.ts.map +1 -0
  23. package/dist/moe/moe_model.js +415 -0
  24. package/dist/moe/moe_model.js.map +1 -0
  25. package/dist/moe/moe_package.d.ts +81 -0
  26. package/dist/moe/moe_package.d.ts.map +1 -0
  27. package/dist/moe/moe_package.js +157 -0
  28. package/dist/moe/moe_package.js.map +1 -0
  29. package/dist/moe/moe_trainer.d.ts +53 -0
  30. package/dist/moe/moe_trainer.d.ts.map +1 -0
  31. package/dist/moe/moe_trainer.js +93 -0
  32. package/dist/moe/moe_trainer.js.map +1 -0
  33. package/dist/optim/adamw.d.ts +32 -0
  34. package/dist/optim/adamw.d.ts.map +1 -0
  35. package/dist/optim/adamw.js +52 -0
  36. package/dist/optim/adamw.js.map +1 -0
  37. package/package.json +1 -1
  38. package/src/index.ts +28 -0
  39. package/src/lm/evermind_lm.ts +558 -0
  40. package/src/lm/index.ts +6 -0
  41. package/src/model/attention_block.ts +1 -1
  42. package/src/model/mamba_model.ts +1 -1
  43. package/src/moe/index.ts +23 -0
  44. package/src/moe/moe_model.ts +475 -0
  45. package/src/moe/moe_package.ts +205 -0
  46. package/src/moe/moe_trainer.ts +134 -0
  47. package/src/optim/adamw.ts +72 -0
@@ -0,0 +1,134 @@
1
+ /**
2
+ * moe_trainer.ts — AdamW training loop for {@link SharedExpertMoE}.
3
+ *
4
+ * Makes "train your own Evermind AI" real: given labelled (input → target)
5
+ * samples, runs minibatch AdamW over the flat parameters with the load-balancing
6
+ * auxiliary loss mixed in (so the router spreads load instead of collapsing onto
7
+ * a few experts). Pure CPU, deterministic given a seeded model — the same loop a
8
+ * WebGPU optimiser kernel would accelerate.
9
+ */
10
+
11
+ import { SharedExpertMoE } from "./moe_model.js";
12
+ import { AdamW } from "../optim/adamw.js";
13
+
14
/** One supervised (input → target) training pair consumed by the MoE trainer. */
export interface MoESample {
  /** Vector handed to the model's `forward` call. */
  input: ArrayLike<number>;
  /** Desired output vector; entries missing from it are read as 0 by the trainer's loss. */
  target: ArrayLike<number>;
}
18
+
19
/**
 * Hyper-parameters accepted by the MoE trainer. Every field is optional; the
 * trainer's constructor substitutes the default listed on each field.
 */
export interface MoETrainOptions {
  /** Step size passed to the optimiser. Defaults to 0.01. */
  lr?: number;
  /** AdamW β1 (first-moment decay). Defaults to 0.9. */
  beta1?: number;
  /** AdamW β2 (second-moment decay). Defaults to 0.999. */
  beta2?: number;
  /** AdamW ε added to the denominator. Defaults to 1e-8. */
  eps?: number;
  /** Decoupled weight-decay coefficient. Defaults to 0. */
  weightDecay?: number;
  /** Multiplier applied to the load-balancing auxiliary loss. Defaults to 0.01. */
  auxWeight?: number;
  /** Samples per minibatch. Omitted (or not greater than 0) means one full batch per epoch. */
  batchSize?: number;
  /** Number of passes over the dataset. Defaults to 1. */
  epochs?: number;
}
37
+
38
/** Summary statistics reported for one pass over the dataset. */
export interface MoEEpochResult {
  /** Task loss (½ · squared error, summed over output dimensions) averaged per sample across the epoch. */
  loss: number;
  /**
   * Load-balancing auxiliary loss measured on the epoch's final minibatch
   * (≈1 balanced … E collapsed).
   */
  auxLoss: number;
}
44
+
45
/**
 * Minibatch trainer for a {@link SharedExpertMoE}.
 *
 * For every minibatch it zeroes the model gradients, runs forward/backward per
 * sample with a ½ · squared-error task loss, averages the task gradient over
 * the batch, adds the load-balancing auxiliary gradient and performs one AdamW
 * step. The {@link AdamW} instance is created once in the constructor and
 * reused for every step, so its state carries over between {@link fit} calls.
 */
export class MoETrainer {
  private readonly adam: AdamW;
  private readonly opt: Required<MoETrainOptions>;

  /**
   * @param model   Model whose parameters are optimised in place.
   * @param options Hyper-parameters; omitted fields fall back to
   *                lr 0.01, β1 0.9, β2 0.999, ε 1e-8, weightDecay 0,
   *                auxWeight 0.01, batchSize 0 (= full batch), epochs 1.
   */
  constructor(
    private readonly model: SharedExpertMoE,
    options: MoETrainOptions = {},
  ) {
    this.opt = {
      lr: options.lr ?? 0.01,
      beta1: options.beta1 ?? 0.9,
      beta2: options.beta2 ?? 0.999,
      eps: options.eps ?? 1e-8,
      weightDecay: options.weightDecay ?? 0,
      auxWeight: options.auxWeight ?? 0.01,
      batchSize: options.batchSize ?? 0,
      epochs: options.epochs ?? 1,
    };
    this.adam = new AdamW(model, this.opt);
  }

  /**
   * Train for the configured number of epochs.
   *
   * @param samples Labelled training pairs, visited in array order every epoch.
   * @returns One {@link MoEEpochResult} per epoch, in order.
   */
  fit(samples: MoESample[]): MoEEpochResult[] {
    const history: MoEEpochResult[] = [];
    for (let e = 0; e < this.opt.epochs; e++) history.push(this.runEpoch(samples));
    return history;
  }

  /** Runs one pass over `samples`, performing one optimiser step per minibatch. */
  private runEpoch(samples: MoESample[]): MoEEpochResult {
    const batchSize = this.effectiveBatchSize(samples.length);
    const { numExperts, modelDim } = this.model.config;
    let epochLoss = 0;
    let lastAux = 0;

    for (let start = 0; start < samples.length; start += batchSize) {
      const batch = samples.slice(start, start + batchSize);
      this.model.zeroGrad();

      // Forward + task backward, retaining (x, probs) for the batch aux gradient.
      const xs: Float32Array[] = [];
      const probsList: Float32Array[] = [];
      const counts = new Float32Array(numExperts);
      let batchLoss = 0;

      for (const s of batch) {
        const f = this.model.forward(s.input);
        const dOut = new Float32Array(modelDim);
        for (let d = 0; d < modelDim; d++) {
          // Target entries that are absent count as 0.
          const diff = f.output[d]! - (s.target[d] ?? 0);
          dOut[d] = diff;
          batchLoss += 0.5 * diff * diff;
        }
        this.model.backward(dOut, f.cache);
        xs.push(f.cache.x);
        probsList.push(f.route.probs);
        for (const ex of f.route.experts) counts[ex] = counts[ex]! + 1;
      }

      // Average the task gradient over the batch BEFORE the aux gradient is
      // added. The aux scale below already carries its own 1 / batch.length
      // factor, so scaling afterwards would divide the aux term by the batch
      // size twice and make the balancing pressure vanish for large batches.
      // NOTE(review): assumes auxGradStep accumulates into the buffers returned
      // by gradients() — confirm against moe_model.
      this.scaleGradients(1 / batch.length);

      // Load-balancing aux gradient (batch-level): fVec = dispatch fractions.
      // `|| 1` avoids 0/0 when no expert was dispatched.
      const dispatched = counts.reduce((a, b) => a + b, 0) || 1;
      const fVec = Float32Array.from(counts, (c) => c / dispatched);
      const scale = (this.opt.auxWeight * numExperts) / batch.length;
      for (let i = 0; i < xs.length; i++) {
        this.model.auxGradStep(xs[i]!, probsList[i]!, fVec, scale);
      }

      this.adam.step();

      epochLoss += batchLoss;
      lastAux = MoETrainer.balanceLoss(fVec, probsList, numExperts);
    }

    return { loss: epochLoss / Math.max(1, samples.length), auxLoss: lastAux };
  }

  /**
   * Resolves the configured batch size to a usable integer step.
   * A configured value that is not greater than 0 selects the full batch;
   * fractional values are floored but never below 1, so a minibatch is never
   * empty (an empty batch would scale gradients by 1 / 0).
   */
  private effectiveBatchSize(sampleCount: number): number {
    const requested = this.opt.batchSize;
    return requested > 0 ? Math.max(1, Math.floor(requested)) : sampleCount;
  }

  /**
   * Load-balancing loss for one batch: numExperts · Σ_e fVec[e] · mean_i probs_i[e].
   * Returns 0 for an empty batch.
   */
  private static balanceLoss(
    fVec: Float32Array,
    probsList: Float32Array[],
    numExperts: number,
  ): number {
    if (probsList.length === 0) return 0;
    let weighted = 0;
    for (let e = 0; e < fVec.length; e++) {
      let probSum = 0;
      for (const probs of probsList) probSum += probs[e]!;
      weighted += fVec[e]! * (probSum / probsList.length);
    }
    return numExperts * weighted;
  }

  /** Multiplies every accumulated model gradient by `k` in place (no-op for k = 1). */
  private scaleGradients(k: number): void {
    if (k === 1) return;
    for (const g of this.model.gradients()) {
      for (let i = 0; i < g.data.length; i++) g.data[i] = g.data[i]! * k;
    }
  }
}
@@ -0,0 +1,72 @@
1
+ /**
2
+ * adamw.ts — AdamW optimiser over a model's flat parameter list.
3
+ *
4
+ * Shared by every CPU-reference trainer in the engine (MoE FFN, the full
5
+ * EvermindLM) so the optimiser maths lives in exactly one place. Operates on any
6
+ * object exposing index-aligned `parameters()` / `gradients()` Float32Arrays.
7
+ */
8
+
9
/** A flat buffer of values: either a parameter tensor or its gradient. */
export interface OptimParam {
  data: Float32Array;
}

/**
 * Anything the optimiser can train. `parameters()` and `gradients()` must be
 * index-aligned: entry `p` of one corresponds to entry `p` of the other.
 */
export interface OptimTarget {
  parameters(): OptimParam[];
  gradients(): OptimParam[];
}

/** AdamW hyper-parameters; every field is optional. */
export interface AdamWOptions {
  /** Learning rate. Default 0.01. */
  lr?: number;
  /** First-moment decay β1. Default 0.9. */
  beta1?: number;
  /** Second-moment decay β2. Default 0.999. */
  beta2?: number;
  /** Denominator stabiliser ε. Default 1e-8. */
  eps?: number;
  /** Decoupled weight-decay coefficient. Default 0. */
  weightDecay?: number;
}

/**
 * AdamW optimiser over an {@link OptimTarget}'s flat parameter list.
 *
 * Moment buffers (m, v) are allocated per parameter at construction time, keyed
 * by parameter index, and persist across {@link AdamW.step} calls together with
 * the step counter used for bias correction.
 */
export class AdamW {
  private readonly m: Float32Array[] = [];
  private readonly v: Float32Array[] = [];
  private t = 0;
  private readonly opt: Required<AdamWOptions>;

  /**
   * @param target  Object whose parameters are updated in place.
   * @param options Hyper-parameters; omitted fields use the documented defaults.
   */
  constructor(
    private readonly target: OptimTarget,
    options: AdamWOptions = {},
  ) {
    this.opt = {
      lr: options.lr ?? 0.01,
      beta1: options.beta1 ?? 0.9,
      beta2: options.beta2 ?? 0.999,
      eps: options.eps ?? 1e-8,
      weightDecay: options.weightDecay ?? 0,
    };
    for (const p of target.parameters()) {
      this.m.push(new Float32Array(p.data.length));
      this.v.push(new Float32Array(p.data.length));
    }
  }

  /**
   * One optimiser step from the currently-accumulated gradients.
   *
   * @throws RangeError if a parameter has no matching gradient or moment
   *         buffer, or if either buffer is shorter than the parameter. Nothing
   *         is modified (weights, moments, step counter) when this is thrown.
   */
  step(): void {
    const params = this.target.parameters();
    const grads = this.target.gradients();
    // Validate first so a misaligned target can never half-apply an update.
    this.assertAligned(params, grads);

    this.t++;
    const { lr, beta1, beta2, eps, weightDecay } = this.opt;
    // Bias-correction denominators for the first and second moments.
    const bc1 = 1 - Math.pow(beta1, this.t);
    const bc2 = 1 - Math.pow(beta2, this.t);
    for (let p = 0; p < params.length; p++) {
      const w = params[p]!.data;
      const g = grads[p]!.data;
      const m = this.m[p]!;
      const v = this.v[p]!;
      for (let i = 0; i < w.length; i++) {
        const gi = g[i]!;
        m[i] = beta1 * m[i]! + (1 - beta1) * gi;
        v[i] = beta2 * v[i]! + (1 - beta2) * gi * gi;
        const mh = m[i]! / bc1;
        const vh = v[i]! / bc2;
        // Decoupled weight decay: applied to the weight itself, not via the gradient.
        w[i] = w[i]! - lr * (mh / (Math.sqrt(vh) + eps) + weightDecay * w[i]!);
      }
    }
  }

  /**
   * Rejects layouts that would otherwise read out-of-range typed-array slots
   * (yielding `undefined` → NaN weights) or dereference a missing entry.
   */
  private assertAligned(params: OptimParam[], grads: OptimParam[]): void {
    if (params.length > this.m.length) {
      throw new RangeError(
        `AdamW: target exposes ${params.length} parameters but optimiser state was built for ${this.m.length}`,
      );
    }
    if (grads.length < params.length) {
      throw new RangeError(
        `AdamW: target exposes ${params.length} parameters but only ${grads.length} gradients`,
      );
    }
    for (let p = 0; p < params.length; p++) {
      const size = params[p]!.data.length;
      if (grads[p]!.data.length < size) {
        throw new RangeError(
          `AdamW: gradient ${p} has ${grads[p]!.data.length} values, parameter has ${size}`,
        );
      }
      if (this.m[p]!.length < size) {
        throw new RangeError(
          `AdamW: parameter ${p} grew from ${this.m[p]!.length} to ${size} values after construction`,
        );
      }
    }
  }
}