@seanhogg/builderforce-memory-engine 2026.6.20 → 2026.6.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +9 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -1
- package/dist/kernels/limbic_affect.d.ts +2 -0
- package/dist/kernels/limbic_affect.d.ts.map +1 -0
- package/dist/kernels/limbic_affect.js +74 -0
- package/dist/kernels/limbic_affect.js.map +1 -0
- package/dist/limbic/index.d.ts +14 -0
- package/dist/limbic/index.d.ts.map +1 -0
- package/dist/limbic/index.js +11 -0
- package/dist/limbic/index.js.map +1 -0
- package/dist/limbic/limbic_model.d.ts +111 -0
- package/dist/limbic/limbic_model.d.ts.map +1 -0
- package/dist/limbic/limbic_model.js +299 -0
- package/dist/limbic/limbic_model.js.map +1 -0
- package/dist/limbic/limbic_trainer.d.ts +62 -0
- package/dist/limbic/limbic_trainer.d.ts.map +1 -0
- package/dist/limbic/limbic_trainer.js +172 -0
- package/dist/limbic/limbic_trainer.js.map +1 -0
- package/dist/limbic/regions.d.ts +79 -0
- package/dist/limbic/regions.d.ts.map +1 -0
- package/dist/limbic/regions.js +132 -0
- package/dist/limbic/regions.js.map +1 -0
- package/dist/lm/evermind_lm.d.ts +148 -0
- package/dist/lm/evermind_lm.d.ts.map +1 -0
- package/dist/lm/evermind_lm.js +479 -0
- package/dist/lm/evermind_lm.js.map +1 -0
- package/dist/lm/index.d.ts +6 -0
- package/dist/lm/index.d.ts.map +1 -0
- package/dist/lm/index.js +5 -0
- package/dist/lm/index.js.map +1 -0
- package/dist/model/attention_block.js +1 -1
- package/dist/model/attention_block.js.map +1 -1
- package/dist/model/mamba_model.js +1 -1
- package/dist/model/mamba_model.js.map +1 -1
- package/dist/moe/index.d.ts +10 -0
- package/dist/moe/index.d.ts.map +1 -0
- package/dist/moe/index.js +7 -0
- package/dist/moe/index.js.map +1 -0
- package/dist/moe/moe_model.d.ts +134 -0
- package/dist/moe/moe_model.d.ts.map +1 -0
- package/dist/moe/moe_model.js +415 -0
- package/dist/moe/moe_model.js.map +1 -0
- package/dist/moe/moe_package.d.ts +81 -0
- package/dist/moe/moe_package.d.ts.map +1 -0
- package/dist/moe/moe_package.js +157 -0
- package/dist/moe/moe_package.js.map +1 -0
- package/dist/moe/moe_trainer.d.ts +53 -0
- package/dist/moe/moe_trainer.d.ts.map +1 -0
- package/dist/moe/moe_trainer.js +93 -0
- package/dist/moe/moe_trainer.js.map +1 -0
- package/dist/optim/adamw.d.ts +32 -0
- package/dist/optim/adamw.d.ts.map +1 -0
- package/dist/optim/adamw.js +52 -0
- package/dist/optim/adamw.js.map +1 -0
- package/package.json +1 -1
- package/src/index.ts +59 -0
- package/src/kernels/limbic_affect.ts +74 -0
- package/src/limbic/index.ts +28 -0
- package/src/limbic/limbic_model.ts +373 -0
- package/src/limbic/limbic_trainer.ts +253 -0
- package/src/limbic/regions.ts +141 -0
- package/src/lm/evermind_lm.ts +558 -0
- package/src/lm/index.ts +6 -0
- package/src/model/attention_block.ts +1 -1
- package/src/model/mamba_model.ts +1 -1
- package/src/moe/index.ts +23 -0
- package/src/moe/moe_model.ts +475 -0
- package/src/moe/moe_package.ts +205 -0
- package/src/moe/moe_trainer.ts +134 -0
- package/src/optim/adamw.ts +72 -0
package/src/lm/index.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvermindLM — the generative language model (the runnable "AI").
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
export { EvermindLM, EvermindLMTrainer, DEFAULT_LM_CONFIG, DEFAULT_LM_SEED } from "./evermind_lm.js";
|
|
6
|
+
export type { EvermindLMConfig, LMGenerateOptions, TextCodec } from "./evermind_lm.js";
|
|
@@ -114,7 +114,7 @@ export class AttentionBlock implements SequenceLayer {
|
|
|
114
114
|
}
|
|
115
115
|
|
|
116
116
|
private _initWeights(): void {
|
|
117
|
-
const { dModel,
|
|
117
|
+
const { dModel, hasFfn, ffnMult } = this.config;
|
|
118
118
|
|
|
119
119
|
const randn = (n: number, std = 0.02): Float32Array => gaussianArray(n, std);
|
|
120
120
|
|
package/src/model/mamba_model.ts
CHANGED
|
@@ -375,7 +375,7 @@ export class HybridMambaModel {
|
|
|
375
375
|
const { temperature = 1.0, topK = 50, topP = 0.9 } = samplingOpts;
|
|
376
376
|
const { vocabSize } = this.config;
|
|
377
377
|
|
|
378
|
-
|
|
378
|
+
const ids = [...promptIds];
|
|
379
379
|
|
|
380
380
|
for (let step = 0; step < maxNewTokens; step++) {
|
|
381
381
|
const { logits } = await this.forward(new Uint32Array(ids), 1, ids.length);
|
package/src/moe/index.ts
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Mixture-of-Experts — shared-expert hybrid sparsity for the Evermind generator.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
export {
|
|
6
|
+
SharedExpertMoE,
|
|
7
|
+
LoadBalanceAccumulator,
|
|
8
|
+
DEFAULT_MOE_CONFIG,
|
|
9
|
+
DEFAULT_MOE_SEED,
|
|
10
|
+
} from "./moe_model.js";
|
|
11
|
+
export type { MoEConfig, MoEParam, RouteResult } from "./moe_model.js";
|
|
12
|
+
|
|
13
|
+
export { MoETrainer } from "./moe_trainer.js";
|
|
14
|
+
export type { MoESample, MoETrainOptions, MoEEpochResult } from "./moe_trainer.js";
|
|
15
|
+
|
|
16
|
+
export { EvermindModelPackage } from "./moe_package.js";
|
|
17
|
+
export type {
|
|
18
|
+
EvermindModelManifest,
|
|
19
|
+
EvermindModelCard,
|
|
20
|
+
EvermindModelType,
|
|
21
|
+
PackageMeta,
|
|
22
|
+
ValidationResult,
|
|
23
|
+
} from "./moe_package.js";
|
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* moe_model.ts — SharedExpertMoE: a shared-expert hybrid Mixture-of-Experts FFN.
|
|
3
|
+
*
|
|
4
|
+
* The sparsity design behind Evermind's generator. Each token is processed by:
|
|
5
|
+
* • a DENSE shared expert that is ALWAYS active (carries continuous learning;
|
|
6
|
+
* the part the online-distillation signal flows into), plus
|
|
7
|
+
* • the top-k of N routed experts, gated by a learned router and combined by a
|
|
8
|
+
* softmax over the selected experts.
|
|
9
|
+
*
|
|
10
|
+
* y = SharedFFN(x) + Σ_{e ∈ topk(x)} gate_e · Expert_e(x)
|
|
11
|
+
*
|
|
12
|
+
* This is the DeepSeekMoE "shared-expert isolation" pattern: the dense backbone
|
|
13
|
+
* resolves the online-learning attribution problem (you distil into ONE always-on
|
|
14
|
+
* path), while the routed experts add web-pageable capacity (each expert's
|
|
15
|
+
* weights are an independent checkpoint — see {@link SharedExpertMoE.exportExpert}
|
|
16
|
+
* — so a host can stream only the experts a token activates).
|
|
17
|
+
*
|
|
18
|
+
* Pure-TS CPU reference (Float32Array, exact forward + backward), mirroring
|
|
19
|
+
* {@link LimbicModel}'s WebGPU-or-fallback contract — the WGSL kernel path
|
|
20
|
+
* (router gate + expert FFN GEMM) is a numerically-identical future acceleration.
|
|
21
|
+
*
|
|
22
|
+
* Activation is ReLU for an exact, unambiguous gradient in the reference path;
|
|
23
|
+
* production may swap GELU/SwiGLU behind the same shapes.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
import { SeededRng } from "../utils/rng.js";
|
|
27
|
+
import { quantizeFp16, dequantizeFp16 } from "../utils/quantization.js";
|
|
28
|
+
|
|
29
|
+
/**
 * Shape hyper-parameters of a shared-expert Mixture-of-Experts layer.
 * Every field except `seed` has a default in `DEFAULT_MOE_CONFIG`.
 */
export interface MoEConfig {
  /** Token (model) width: the size of the vectors the FFN consumes and produces. Default 64. */
  modelDim: number;
  /** Width of the hidden layer inside every expert FFN. Default 128. */
  hiddenDim: number;
  /** How many routed experts exist. Default 8. */
  numExperts: number;
  /** How many routed experts fire for each token (top-k). Default 2; must not exceed `numExperts`. */
  topK: number;
  /** Optional seed for deterministic weight initialisation, giving reproducible cold-start weights. */
  seed?: number;
}
|
|
41
|
+
|
|
42
|
+
/**
 * Default layer shape used for any `MoEConfig` field the caller leaves out.
 * `seed` is deliberately excluded; it has its own default constant.
 */
export const DEFAULT_MOE_CONFIG: Required<Omit<MoEConfig, "seed">> = {
  modelDim: 64, // token width
  hiddenDim: 128, // per-expert hidden width
  numExperts: 8, // routed experts available
  topK: 2, // routed experts active per token
};
|
|
48
|
+
|
|
49
|
+
/**
 * Seed used when the caller supplies none, so that a cold start is reproducible.
 * The bytes spell "MoE" followed by 0x01.
 */
export const DEFAULT_MOE_SEED = 0x4d6f4501;

/** Magic number in the first header word of a serialised checkpoint; the bytes spell "MoE0". */
const MAGIC = 0x4d6f4530;
|
|
53
|
+
|
|
54
|
+
/**
 * One named tensor of the model, stored flat in row-major order.
 * Used for both trainable weights and their gradient buffers.
 */
export interface MoEParam {
  /** Identifier of the tensor (for example a router or expert weight name). */
  name: string;
  /** The tensor's values as a flat array. */
  data: Float32Array;
  /** Number of scalar elements in `data`. */
  numel: number;
}
|
|
60
|
+
|
|
61
|
+
/**
 * Outcome of routing a single token: the experts chosen for it and the weights
 * used to combine their outputs.
 */
export interface RouteResult {
  /** Selected top-k expert indices, ordered from highest router logit downwards. */
  experts: number[];
  /** Combine weights — a softmax over only the selected logits — aligned index-for-index with `experts`. */
  gates: number[];
  /** Softmax over the logits of ALL experts; this is the signal used for load balancing. */
  probs: Float32Array;
}
|
|
70
|
+
|
|
71
|
+
/**
 * Rectified linear unit: passes strictly positive inputs through unchanged and
 * maps everything else (zero, negatives, NaN) to 0.
 */
function relu(x: number): number {
  if (x > 0) {
    return x;
  }
  return 0;
}
|
|
74
|
+
|
|
75
|
+
/**
 * A two-layer feed-forward expert computing `y = W2·relu(W1·x + b1) + b2`.
 *
 * All tensors are flat, row-major `Float32Array`s. Each weight tensor has a
 * same-sized gradient accumulator that {@link Expert.backward} adds into; the
 * accumulators are never cleared by this class.
 */
class Expert {
  /** First-layer weights, `hiddenDim × modelDim`. */
  w1: Float32Array;
  /** First-layer bias, length `hiddenDim`. */
  b1: Float32Array;
  /** Second-layer weights, `modelDim × hiddenDim`. */
  w2: Float32Array;
  /** Second-layer bias, length `modelDim`. */
  b2: Float32Array;
  /** Gradient accumulator for `w1`. */
  gW1: Float32Array;
  /** Gradient accumulator for `b1`. */
  gB1: Float32Array;
  /** Gradient accumulator for `w2`. */
  gW2: Float32Array;
  /** Gradient accumulator for `b2`. */
  gB2: Float32Array;

  /**
   * @param modelDim  Input/output width.
   * @param hiddenDim Hidden-layer width.
   * @param gauss     Sampler returning `n` values with the given standard deviation.
   *                  It is called for `w1` first and `w2` second.
   */
  constructor(
    private readonly modelDim: number,
    private readonly hiddenDim: number,
    gauss: (n: number, std: number) => Float32Array,
  ) {
    // He-style scale for the layer feeding the ReLU.
    const inputStd = Math.sqrt(2 / modelDim);
    this.w1 = gauss(hiddenDim * modelDim, inputStd);
    // Small output scale so an untrained expert contributes little to the sum.
    this.w2 = gauss(modelDim * hiddenDim, 0.02);

    this.b1 = new Float32Array(hiddenDim);
    this.b2 = new Float32Array(modelDim);

    this.gW1 = new Float32Array(this.w1.length);
    this.gB1 = new Float32Array(this.b1.length);
    this.gW2 = new Float32Array(this.w2.length);
    this.gB2 = new Float32Array(this.b2.length);
  }

  /**
   * Run the expert on one token.
   *
   * @param x Input vector (the first `modelDim` entries are read).
   * @returns `y` (output), plus `pre` (hidden pre-activations) and `h` (hidden
   *          activations), which {@link Expert.backward} needs.
   */
  forward(x: Float32Array): { y: Float32Array; pre: Float32Array; h: Float32Array } {
    const width = this.modelDim;
    const hidden = this.hiddenDim;

    const pre = new Float32Array(hidden);
    const h = new Float32Array(hidden);
    for (let unit = 0; unit < hidden; unit++) {
      const rowStart = unit * width;
      let total = this.b1[unit]!;
      for (let col = 0; col < width; col++) {
        total += this.w1[rowStart + col]! * x[col]!;
      }
      pre[unit] = total;
      h[unit] = total > 0 ? total : 0; // ReLU
    }

    const y = new Float32Array(width);
    for (let outIdx = 0; outIdx < width; outIdx++) {
      const rowStart = outIdx * hidden;
      let total = this.b2[outIdx]!;
      for (let unit = 0; unit < hidden; unit++) {
        total += this.w2[rowStart + unit]! * h[unit]!;
      }
      y[outIdx] = total;
    }

    return { y, pre, h };
  }

  /**
   * Add one token's gradients into the accumulators.
   *
   * @param dy  Gradient of the loss with respect to the expert output.
   * @param x   The input that was passed to {@link Expert.forward}.
   * @param pre Hidden pre-activations from that forward call.
   * @param h   Hidden activations from that forward call.
   * @returns Gradient of the loss with respect to `x`.
   */
  backward(dy: Float32Array, x: Float32Array, pre: Float32Array, h: Float32Array): Float32Array {
    const width = this.modelDim;
    const hidden = this.hiddenDim;

    // Second layer: accumulate dW2/db2 and push the gradient back to the hidden units.
    const gradHidden = new Float32Array(hidden);
    for (let outIdx = 0; outIdx < width; outIdx++) {
      const upstream = dy[outIdx]!;
      const rowStart = outIdx * hidden;
      this.gB2[outIdx] = this.gB2[outIdx]! + upstream;
      for (let unit = 0; unit < hidden; unit++) {
        const at = rowStart + unit;
        this.gW2[at] = this.gW2[at]! + upstream * h[unit]!;
        gradHidden[unit] = gradHidden[unit]! + upstream * this.w2[at]!;
      }
    }

    // First layer: gate by the ReLU derivative, accumulate dW1/db1 and dL/dx.
    const gradInput = new Float32Array(width);
    for (let unit = 0; unit < hidden; unit++) {
      let gradPre = 0;
      if (pre[unit]! > 0) {
        gradPre = gradHidden[unit]!;
      }
      const rowStart = unit * width;
      this.gB1[unit] = this.gB1[unit]! + gradPre;
      for (let col = 0; col < width; col++) {
        const at = rowStart + col;
        this.gW1[at] = this.gW1[at]! + gradPre * x[col]!;
        gradInput[col] = gradInput[col]! + gradPre * this.w1[at]!;
      }
    }

    return gradInput;
  }

  /** The weight tensors in the fixed order `w1, b1, w2, b2`. */
  params(): Float32Array[] {
    const { w1, b1, w2, b2 } = this;
    return [w1, b1, w2, b2];
  }

  /** The gradient accumulators, in the same order as {@link Expert.params}. */
  grads(): Float32Array[] {
    const { gW1, gB1, gW2, gB2 } = this;
    return [gW1, gB1, gW2, gB2];
  }
}
|
|
160
|
+
|
|
161
|
+
/**
 * Intermediates of one token's forward pass that the backward pass reads.
 * The three `expert*` arrays are aligned index-for-index with `route.experts`.
 */
interface MoECache {
  /** The input vector the layer was evaluated on. */
  x: Float32Array;
  /** Routing decision made for this token. */
  route: RouteResult;
  /** Shared expert: hidden pre-activations. */
  sharedPre: Float32Array;
  /** Shared expert: hidden activations. */
  sharedH: Float32Array;
  /** Output of each selected routed expert. */
  expertOut: Float32Array[];
  /** Hidden pre-activations of each selected routed expert. */
  expertPre: Float32Array[];
  /** Hidden activations of each selected routed expert. */
  expertH: Float32Array[];
}
|
|
171
|
+
|
|
172
|
+
/**
 * Accumulates router statistics over a batch and computes the load-balancing
 * auxiliary loss `E · Σ_e f_e · P_e` (Switch/GShard), where `f_e` is the fraction
 * of dispatches that went to expert `e` and `P_e` is the mean full-softmax router
 * probability of `e`. The value is near 1 when dispatch is uniform and grows as
 * the router concentrates on few experts. Add it to the task loss with a small
 * coefficient to keep experts busy.
 *
 * Counters are kept in double precision: a float32 counter stops incrementing
 * once it reaches 2^24, and float32 probability sums drift with every add.
 */
export class LoadBalanceAccumulator {
  /** Number of times each expert was selected. */
  private readonly counts: Float64Array;
  /** Running sum of each expert's full-softmax probability. */
  private readonly probSum: Float64Array;
  /** Number of tokens observed so far. */
  private tokens = 0;

  /** @param numExperts Number of routed experts being tracked. */
  constructor(private readonly numExperts: number) {
    this.counts = new Float64Array(numExperts);
    this.probSum = new Float64Array(numExperts);
  }

  /**
   * Record one token's routing decision.
   *
   * @param route Routing result; `experts` are counted as dispatches and the
   *              first `numExperts` entries of `probs` are added to the sums.
   */
  observe(route: RouteResult): void {
    this.tokens++;
    for (const e of route.experts) this.counts[e] = this.counts[e]! + 1;
    for (let e = 0; e < this.numExperts; e++) this.probSum[e] = this.probSum[e]! + route.probs[e]!;
  }

  /** The load-balance loss over everything observed so far (0 if no tokens). */
  loss(): number {
    if (this.tokens === 0) return 0;
    const E = this.numExperts;
    // Total dispatches (tokens · topK); fall back to 1 to avoid dividing by zero.
    const dispatched = this.counts.reduce((a, b) => a + b, 0) || 1;
    let sum = 0;
    for (let e = 0; e < E; e++) {
      const f = this.counts[e]! / dispatched; // fraction of dispatches to e
      const p = this.probSum[e]! / this.tokens; // mean router prob for e
      sum += f * p;
    }
    return E * sum;
  }

  /** Forget everything observed so far, so the instance can be reused for the next batch. */
  reset(): void {
    this.counts.fill(0);
    this.probSum.fill(0);
    this.tokens = 0;
  }
}
|
|
205
|
+
|
|
206
|
+
/**
 * Shared-expert Mixture-of-Experts feed-forward layer.
 *
 * For each token the output is the always-on shared expert plus a gated sum of
 * the top-k routed experts:
 *
 *   y = Shared(x) + Σ_{e ∈ topk(x)} gate_e · Expert_e(x)
 *
 * The class owns the router matrix, the shared expert and the routed experts,
 * exposes their parameters/gradients for an external optimiser, and can
 * serialise itself to / from the "MoE0" binary checkpoint format.
 */
export class SharedExpertMoE {
  /** Tensor names of one expert, index-aligned with `Expert.params()` / `Expert.grads()`. */
  private static readonly TENSOR_NAMES = ["w1", "b1", "w2", "b2"] as const;
  /** Number of uint32 words in the checkpoint header: magic, version, modelDim, hiddenDim, numExperts, topK. */
  private static readonly HEADER_WORDS = 6;

  /** Resolved layer shape (defaults applied; `seed` is not part of it). */
  readonly config: Required<Omit<MoEConfig, "seed">>;

  /** Router weights: numExperts × modelDim (no bias). */
  wr: Float32Array;
  /** Gradient accumulator for {@link SharedExpertMoE.wr}. */
  private gWr: Float32Array;

  /** The dense expert applied to every token. */
  private readonly shared: Expert;
  /** The routed experts, indexed by expert id. */
  private readonly experts: Expert[];

  /**
   * @param config Partial shape; missing (or `undefined`) fields fall back to
   *               `DEFAULT_MOE_CONFIG`, and a missing seed to `DEFAULT_MOE_SEED`.
   * @throws Error if `topK` exceeds `numExperts`, if `topK` is below 1, or if any
   *               dimension is not a positive integer.
   */
  constructor(config: Partial<MoEConfig> = {}) {
    // Resolve field by field so an explicit `undefined` cannot clobber a default
    // and so `seed` does not leak into `this.config`.
    const cfg = {
      modelDim: config.modelDim ?? DEFAULT_MOE_CONFIG.modelDim,
      hiddenDim: config.hiddenDim ?? DEFAULT_MOE_CONFIG.hiddenDim,
      numExperts: config.numExperts ?? DEFAULT_MOE_CONFIG.numExperts,
      topK: config.topK ?? DEFAULT_MOE_CONFIG.topK,
    };
    if (cfg.topK > cfg.numExperts) {
      throw new Error(`MoE topK (${cfg.topK}) must be ≤ numExperts (${cfg.numExperts})`);
    }
    if (cfg.topK < 1) throw new Error(`MoE topK must be ≥ 1 (got ${cfg.topK})`);
    // NaN and fractional sizes pass the comparisons above but would silently
    // produce mis-sized buffers and NaN outputs, so reject them explicitly.
    for (const key of ["modelDim", "hiddenDim", "numExperts", "topK"] as const) {
      const value = cfg[key];
      if (!Number.isInteger(value) || value < 1) {
        throw new Error(`MoE ${key} must be a positive integer (got ${value})`);
      }
    }
    this.config = cfg;

    // Seed is coerced to uint32; a zero seed is replaced by 1.
    const rng = new SeededRng(((config.seed ?? DEFAULT_MOE_SEED) >>> 0) || 1);
    // Box–Muller sampler; consumes two draws from the RNG per value.
    // NOTE(review): assumes rng.next() yields uniform values in [0, 1) — confirm in utils/rng.
    const gauss = (n: number, std: number): Float32Array => {
      const a = new Float32Array(n);
      for (let i = 0; i < n; i++) {
        const u1 = Math.max(rng.next(), 1e-12); // keep log() finite
        const u2 = rng.next();
        a[i] = std * Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
      }
      return a;
    };

    // Initialisation order (router, shared, experts) fixes how the RNG stream is consumed.
    this.wr = gauss(cfg.numExperts * cfg.modelDim, 0.02);
    this.gWr = new Float32Array(this.wr.length);
    this.shared = new Expert(cfg.modelDim, cfg.hiddenDim, gauss);
    this.experts = Array.from({ length: cfg.numExperts }, () => new Expert(cfg.modelDim, cfg.hiddenDim, gauss));
  }

  /**
   * Route a token: router logits → top-k → combine gates + full softmax probs.
   *
   * @param x Token vector (the first `modelDim` entries are read).
   * @returns The selected experts (highest logit first), their combine gates
   *          (softmax over the selected logits only) and the softmax over all experts.
   */
  route(x: Float32Array): RouteResult {
    const { numExperts, topK, modelDim } = this.config;
    const logits = new Float32Array(numExperts);
    for (let e = 0; e < numExperts; e++) {
      let acc = 0;
      const off = e * modelDim;
      for (let i = 0; i < modelDim; i++) acc += this.wr[off + i]! * x[i]!;
      logits[e] = acc;
    }
    // Full softmax over all experts (load-balancing signal).
    const probs = softmax(logits);
    // Top-k experts by logit, descending.
    const order = Array.from({ length: numExperts }, (_, e) => e).sort((a, b) => logits[b]! - logits[a]!);
    const experts = order.slice(0, topK);
    // Combine gates = softmax over ONLY the selected logits.
    const selLogits = experts.map((e) => logits[e]!);
    const selSoft = softmax(Float32Array.from(selLogits));
    return { experts, gates: Array.from(selSoft), probs };
  }

  /**
   * Forward a single token.
   *
   * @param input Token vector; it is copied into a `modelDim`-long buffer, with
   *              missing entries read as 0 and extra entries ignored.
   * @returns The layer output, the routing decision, and a cache for {@link backward}.
   */
  forward(input: ArrayLike<number>): { output: Float32Array; route: RouteResult; cache: MoECache } {
    const { modelDim } = this.config;
    const x = Float32Array.from({ length: modelDim }, (_, i) => input[i] ?? 0);
    const route = this.route(x);

    // Start from the shared expert's output, then add the gated routed experts.
    const s = this.shared.forward(x);
    const output = Float32Array.from(s.y);

    const expertOut: Float32Array[] = [];
    const expertPre: Float32Array[] = [];
    const expertH: Float32Array[] = [];
    for (let m = 0; m < route.experts.length; m++) {
      const e = this.experts[route.experts[m]!]!;
      const r = e.forward(x);
      const g = route.gates[m]!;
      for (let d = 0; d < modelDim; d++) output[d] = output[d]! + g * r.y[d]!;
      expertOut.push(r.y);
      expertPre.push(r.pre);
      expertH.push(r.h);
    }

    return {
      output,
      route,
      cache: { x, route, sharedPre: s.pre, sharedH: s.h, expertOut, expertPre, expertH },
    };
  }

  /**
   * Accumulate gradients for one token given dL/d(output). Adds into the
   * gradient buffers of the shared expert, the selected routed experts and the
   * router. Call {@link zeroGrad} before a batch and apply an optimiser after.
   * Load balancing is a separate signal (see {@link auxGradStep}).
   *
   * @param dOutput Upstream gradient; missing entries are read as 0.
   * @param cache   The cache returned by the matching {@link forward} call.
   * @returns dL/d(input), so the layer can sit inside a larger differentiable block.
   */
  backward(dOutput: ArrayLike<number>, cache: MoECache): Float32Array {
    const { modelDim } = this.config;
    const dOut = Float32Array.from({ length: modelDim }, (_, d) => dOutput[d] ?? 0);
    const dx = new Float32Array(modelDim);

    // Shared expert (always active) sees the full upstream gradient.
    const dxShared = this.shared.backward(dOut, cache.x, cache.sharedPre, cache.sharedH);
    for (let i = 0; i < modelDim; i++) dx[i] = dx[i]! + dxShared[i]!;

    // Routed experts: each sees the upstream gradient scaled by its gate;
    // dL/dgate_m = <dOut, expertOut_m> is collected for the router.
    const k = cache.route.experts.length;
    const dGate = new Float32Array(k);
    for (let m = 0; m < k; m++) {
      const g = cache.route.gates[m]!;
      const scaled = new Float32Array(modelDim);
      let dg = 0;
      for (let d = 0; d < modelDim; d++) {
        scaled[d] = g * dOut[d]!;
        dg += dOut[d]! * cache.expertOut[m]![d]!;
      }
      const dxe = this.experts[cache.route.experts[m]!]!.backward(
        scaled,
        cache.x,
        cache.expertPre[m]!,
        cache.expertH[m]!,
      );
      for (let i = 0; i < modelDim; i++) dx[i] = dx[i]! + dxe[i]!;
      dGate[m] = dg;
    }

    // Router: gates = softmax(selected logits). Backprop dGate through the
    // softmax Jacobian (dLogit_m = g_m · (dGate_m − Σ g·dGate)), then to Wr and the input.
    const gates = cache.route.gates;
    let dot = 0;
    for (let m = 0; m < k; m++) dot += gates[m]! * dGate[m]!;
    for (let m = 0; m < k; m++) {
      const dLogit = gates[m]! * (dGate[m]! - dot);
      const e = cache.route.experts[m]!;
      const off = e * modelDim;
      for (let i = 0; i < modelDim; i++) {
        this.gWr[off + i] = this.gWr[off + i]! + dLogit * cache.x[i]!;
        dx[i] = dx[i]! + dLogit * this.wr[off + i]!;
      }
    }
    return dx;
  }

  /**
   * Add the load-balancing auxiliary-loss gradient for one token into the router
   * gradient. `L_aux = E·Σ_e f_e·P̄_e` (Switch/GShard); `f` (per-batch dispatch
   * fractions) is treated as a constant, so only the full softmax `P` carries
   * gradient: ∂L_aux/∂logit_j = scale·P_j·(f_j − Σ_e f_e·P_e). Only the router
   * gradient is touched; no input gradient is produced.
   *
   * @param x     The token vector that was routed.
   * @param probs Full softmax over all experts for this token.
   * @param f     Per-expert dispatch fractions for the batch.
   * @param scale Multiplier applied to the gradient (e.g. `auxWeight·E/T`).
   */
  auxGradStep(x: Float32Array, probs: Float32Array, f: Float32Array, scale: number): void {
    const { numExperts, modelDim } = this.config;
    let fp = 0;
    for (let e = 0; e < numExperts; e++) fp += f[e]! * probs[e]!;
    for (let j = 0; j < numExperts; j++) {
      const coeff = scale * probs[j]! * (f[j]! - fp);
      if (coeff === 0) continue; // nothing to add for this expert row
      const off = j * modelDim;
      for (let i = 0; i < modelDim; i++) this.gWr[off + i] = this.gWr[off + i]! + coeff * x[i]!;
    }
  }

  // ── Parameters / checkpoint ────────────────────────────────────────────────

  /** All trainable parameters in canonical order: router, shared, then experts. */
  parameters(): MoEParam[] {
    return this.collect(this.wr, (e) => e.params());
  }

  /** Gradient buffers, index-aligned with {@link parameters} (and carrying the same names). */
  gradients(): MoEParam[] {
    return this.collect(this.gWr, (e) => e.grads());
  }

  /** Reset every gradient accumulator (router, shared expert, routed experts) to zero. */
  zeroGrad(): void {
    this.gWr.fill(0);
    for (const g of this.shared.grads()) g.fill(0);
    for (const e of this.experts) for (const g of e.grads()) g.fill(0);
  }

  /**
   * One routed expert's weights as a standalone checkpoint unit. The returned
   * entries reference the live weight buffers; they are not copies.
   *
   * @throws Error if `index` does not address an existing expert.
   */
  exportExpert(index: number): MoEParam[] {
    const e = this.experts[index];
    if (!e) throw new Error(`exportExpert: index ${index} out of range (0..${this.config.numExperts - 1})`);
    return e.params().map((p, i) => ({ name: SharedExpertMoE.TENSOR_NAMES[i]!, data: p, numel: p.length }));
  }

  /**
   * Serialise all weights to a compact "MoE0" binary. Layout: six uint32 header
   * words (magic, version, modelDim, hiddenDim, numExperts, topK), then the
   * parameters in {@link parameters} order. fp16 (version 2) halves the payload;
   * f32 (version 1) is exact. Values are written through typed-array views, i.e.
   * in the platform's native byte order.
   */
  exportWeights(opts: { fp16?: boolean } = {}): ArrayBuffer {
    const fp16 = opts.fp16 ?? false;
    const params = this.parameters();
    const total = params.reduce((n, p) => n + p.numel, 0);
    const headerEls = SharedExpertMoE.HEADER_WORDS;
    const headerBytes = headerEls * 4;
    const buf = new ArrayBuffer(headerBytes + (fp16 ? total * 2 : total * 4));
    const head = new Uint32Array(buf, 0, headerEls);
    head[0] = MAGIC;
    head[1] = fp16 ? 2 : 1;
    head[2] = this.config.modelDim;
    head[3] = this.config.hiddenDim;
    head[4] = this.config.numExperts;
    head[5] = this.config.topK;

    if (fp16) {
      // The quantiser works on one flat array, so gather the tensors first.
      const flat = new Float32Array(total);
      let o = 0;
      for (const p of params) {
        flat.set(p.data, o);
        o += p.numel;
      }
      new Uint16Array(buf, headerBytes, total).set(quantizeFp16(flat));
    } else {
      // f32 can be written straight into the payload view.
      const payload = new Float32Array(buf, headerBytes, total);
      let o = 0;
      for (const p of params) {
        payload.set(p.data, o);
        o += p.numel;
      }
    }
    return buf;
  }

  /**
   * Load weights from an "MoE0" binary produced by {@link exportWeights}.
   * Validates size, magic, version and dimensions before touching any weight,
   * so a rejected checkpoint leaves the model unchanged.
   *
   * @throws Error if the buffer is too small for the header or payload, if the
   *               magic or version is not recognised, or if the stored
   *               dimensions differ from this instance's config.
   */
  loadWeights(buffer: ArrayBuffer): void {
    const headerBytes = SharedExpertMoE.HEADER_WORDS * 4;
    if (buffer.byteLength < headerBytes) {
      throw new Error(
        `SharedExpertMoE.loadWeights: buffer too small for an MoE0 header (${buffer.byteLength} < ${headerBytes} bytes)`,
      );
    }
    const head = new Uint32Array(buffer, 0, SharedExpertMoE.HEADER_WORDS);
    if (head[0] !== MAGIC) throw new Error("SharedExpertMoE.loadWeights: bad magic (not an MoE0 checkpoint)");
    const version = head[1];
    if (version !== 1 && version !== 2) {
      // Decoding an unknown layout as f32 would load garbage silently.
      throw new Error(`SharedExpertMoE.loadWeights: unsupported checkpoint version ${version}`);
    }
    if (
      head[2] !== this.config.modelDim ||
      head[3] !== this.config.hiddenDim ||
      head[4] !== this.config.numExperts ||
      head[5] !== this.config.topK
    ) {
      throw new Error("SharedExpertMoE.loadWeights: config mismatch with checkpoint");
    }

    const params = this.parameters();
    const total = params.reduce((n, p) => n + p.numel, 0);
    const bytesPerValue = version === 2 ? 2 : 4;
    const needed = headerBytes + total * bytesPerValue;
    if (buffer.byteLength < needed) {
      // A truncated payload would otherwise be applied partially.
      throw new Error(
        `SharedExpertMoE.loadWeights: truncated checkpoint (${buffer.byteLength} bytes, need ${needed})`,
      );
    }

    const flat =
      version === 2
        ? dequantizeFp16(new Uint16Array(buffer, headerBytes, total))
        : new Float32Array(buffer, headerBytes, total);
    let o = 0;
    for (const p of params) {
      p.data.set(flat.subarray(o, o + p.numel));
      o += p.numel;
    }
  }

  /** Build the canonical tensor list: the router tensor, then the shared expert, then each routed expert. */
  private collect(router: Float32Array, pick: (e: Expert) => Float32Array[]): MoEParam[] {
    const out: MoEParam[] = [{ name: "wr", data: router, numel: router.length }];
    const push = (prefix: string, e: Expert): void => {
      pick(e).forEach((t, i) =>
        out.push({ name: `${prefix}.${SharedExpertMoE.TENSOR_NAMES[i]}`, data: t, numel: t.length }),
      );
    };
    push("shared", this.shared);
    this.experts.forEach((e, idx) => push(`expert${idx}`, e));
    return out;
  }
}
|
|
462
|
+
|
|
463
|
+
/**
 * Softmax of a flat array, computed in the numerically stable way: the largest
 * logit is subtracted before exponentiating. An empty input yields an empty output.
 */
function softmax(logits: Float32Array): Float32Array {
  // Largest logit (stays -Infinity for an empty input).
  const peak = logits.reduce((best, v) => (v > best ? v : best), -Infinity);

  // Shifted exponentials, rounded to float32 as they are stored.
  const out = logits.map((v) => Math.exp(v - peak));

  // Normaliser, summed over the stored (rounded) values in index order.
  let total = 0;
  for (const v of out) total += v;

  out.forEach((v, i) => {
    out[i] = v / total;
  });
  return out;
}
|