@seanhogg/builderforce-memory-engine 2026.6.27 → 2026.6.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -1
- package/dist/lm/evermind_lm.d.ts +148 -0
- package/dist/lm/evermind_lm.d.ts.map +1 -0
- package/dist/lm/evermind_lm.js +479 -0
- package/dist/lm/evermind_lm.js.map +1 -0
- package/dist/lm/index.d.ts +6 -0
- package/dist/lm/index.d.ts.map +1 -0
- package/dist/lm/index.js +5 -0
- package/dist/lm/index.js.map +1 -0
- package/dist/model/attention_block.js +1 -1
- package/dist/model/attention_block.js.map +1 -1
- package/dist/model/mamba_model.js +1 -1
- package/dist/model/mamba_model.js.map +1 -1
- package/dist/moe/index.d.ts +10 -0
- package/dist/moe/index.d.ts.map +1 -0
- package/dist/moe/index.js +7 -0
- package/dist/moe/index.js.map +1 -0
- package/dist/moe/moe_model.d.ts +134 -0
- package/dist/moe/moe_model.d.ts.map +1 -0
- package/dist/moe/moe_model.js +415 -0
- package/dist/moe/moe_model.js.map +1 -0
- package/dist/moe/moe_package.d.ts +81 -0
- package/dist/moe/moe_package.d.ts.map +1 -0
- package/dist/moe/moe_package.js +157 -0
- package/dist/moe/moe_package.js.map +1 -0
- package/dist/moe/moe_trainer.d.ts +53 -0
- package/dist/moe/moe_trainer.d.ts.map +1 -0
- package/dist/moe/moe_trainer.js +93 -0
- package/dist/moe/moe_trainer.js.map +1 -0
- package/dist/optim/adamw.d.ts +32 -0
- package/dist/optim/adamw.d.ts.map +1 -0
- package/dist/optim/adamw.js +52 -0
- package/dist/optim/adamw.js.map +1 -0
- package/package.json +1 -1
- package/src/index.ts +28 -0
- package/src/lm/evermind_lm.ts +558 -0
- package/src/lm/index.ts +6 -0
- package/src/model/attention_block.ts +1 -1
- package/src/model/mamba_model.ts +1 -1
- package/src/moe/index.ts +23 -0
- package/src/moe/moe_model.ts +475 -0
- package/src/moe/moe_package.ts +205 -0
- package/src/moe/moe_trainer.ts +134 -0
- package/src/optim/adamw.ts +72 -0
|
@@ -0,0 +1,475 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* moe_model.ts — SharedExpertMoE: a shared-expert hybrid Mixture-of-Experts FFN.
|
|
3
|
+
*
|
|
4
|
+
* The sparsity design behind Evermind's generator. Each token is processed by:
|
|
5
|
+
* • a DENSE shared expert that is ALWAYS active (carries continuous learning;
|
|
6
|
+
* the part the online-distillation signal flows into), plus
|
|
7
|
+
* • the top-k of N routed experts, gated by a learned router and combined by a
|
|
8
|
+
* softmax over the selected experts.
|
|
9
|
+
*
|
|
10
|
+
* y = SharedFFN(x) + Σ_{e ∈ topk(x)} gate_e · Expert_e(x)
|
|
11
|
+
*
|
|
12
|
+
* This is the DeepSeekMoE "shared-expert isolation" pattern: the dense backbone
|
|
13
|
+
* resolves the online-learning attribution problem (you distil into ONE always-on
|
|
14
|
+
* path), while the routed experts add web-pageable capacity (each expert's
|
|
15
|
+
* weights are an independent checkpoint — see {@link SharedExpertMoE.exportExpert}
|
|
16
|
+
* — so a host can stream only the experts a token activates).
|
|
17
|
+
*
|
|
18
|
+
* Pure-TS CPU reference (Float32Array, exact forward + backward), mirroring
|
|
19
|
+
* {@link LimbicModel}'s WebGPU-or-fallback contract — the WGSL kernel path
|
|
20
|
+
* (router gate + expert FFN GEMM) is a numerically-identical future acceleration.
|
|
21
|
+
*
|
|
22
|
+
* Activation is ReLU for an exact, unambiguous gradient in the reference path;
|
|
23
|
+
* production may swap GELU/SwiGLU behind the same shapes.
|
|
24
|
+
*/
|
|
25
|
+
|
|
26
|
+
import { SeededRng } from "../utils/rng.js";
|
|
27
|
+
import { quantizeFp16, dequantizeFp16 } from "../utils/quantization.js";
|
|
28
|
+
|
|
29
|
+
export interface MoEConfig {
|
|
30
|
+
/** Model (token) dimension — FFN input/output width. Default 64. */
|
|
31
|
+
modelDim: number;
|
|
32
|
+
/** Hidden width of each expert FFN. Default 128. */
|
|
33
|
+
hiddenDim: number;
|
|
34
|
+
/** Number of routed experts. Default 8. */
|
|
35
|
+
numExperts: number;
|
|
36
|
+
/** Experts activated per token (top-k). Default 2. Must be ≤ numExperts. */
|
|
37
|
+
topK: number;
|
|
38
|
+
/** Deterministic init seed for reproducible cold-start weights. */
|
|
39
|
+
seed?: number;
|
|
40
|
+
}
|
|
41
|
+
|
|
42
|
+
export const DEFAULT_MOE_CONFIG: Required<Omit<MoEConfig, "seed">> = {
|
|
43
|
+
modelDim: 64,
|
|
44
|
+
hiddenDim: 128,
|
|
45
|
+
numExperts: 8,
|
|
46
|
+
topK: 2,
|
|
47
|
+
};
|
|
48
|
+
|
|
49
|
+
/** Fixed default init seed — reproducible byte-identical cold start across machines. */
|
|
50
|
+
export const DEFAULT_MOE_SEED = 0x4d6f4501; // "MoE\x01"
|
|
51
|
+
|
|
52
|
+
const MAGIC = 0x4d6f4530; // "MoE0"
|
|
53
|
+
|
|
54
|
+
/** A named trainable parameter tensor (flat row-major). */
|
|
55
|
+
export interface MoEParam {
|
|
56
|
+
name: string;
|
|
57
|
+
data: Float32Array;
|
|
58
|
+
numel: number;
|
|
59
|
+
}
|
|
60
|
+
|
|
61
|
+
/** Result of routing a token: which experts fire and with what combine weights. */
|
|
62
|
+
export interface RouteResult {
|
|
63
|
+
/** Indices of the selected top-k experts, highest router logit first. */
|
|
64
|
+
experts: number[];
|
|
65
|
+
/** Combine weights (softmax over the selected logits), index-aligned to `experts`. */
|
|
66
|
+
gates: number[];
|
|
67
|
+
/** Full softmax over ALL experts — the load-balancing signal. */
|
|
68
|
+
probs: Float32Array;
|
|
69
|
+
}
|
|
70
|
+
|
|
71
|
+
/**
 * Rectified linear unit.
 *
 * @param x - Pre-activation value.
 * @returns `x` when it is strictly positive; otherwise `0` (this includes
 *   negative values, zero, and NaN, since `NaN > 0` is false).
 */
function relu(x: number): number {
  if (x > 0) {
    return x;
  }
  return 0;
}
|
|
74
|
+
|
|
75
|
+
/**
 * A two-layer feed-forward expert: y = W2·relu(W1·x + b1) + b2.
 *
 * All tensors are flat, row-major Float32Arrays. Each parameter has a
 * same-sized gradient buffer that {@link Expert.backward} adds into (it never
 * resets them), so gradients accumulate across calls.
 */
class Expert {
  /** First-layer weights, hiddenDim × modelDim. */
  w1: Float32Array;
  /** First-layer bias, hiddenDim. */
  b1: Float32Array;
  /** Second-layer weights, modelDim × hiddenDim. */
  w2: Float32Array;
  /** Second-layer bias, modelDim. */
  b2: Float32Array;

  /** Gradient accumulator for {@link Expert.w1}. */
  gW1: Float32Array;
  /** Gradient accumulator for {@link Expert.b1}. */
  gB1: Float32Array;
  /** Gradient accumulator for {@link Expert.w2}. */
  gW2: Float32Array;
  /** Gradient accumulator for {@link Expert.b2}. */
  gB2: Float32Array;

  /**
   * @param modelDim - Input/output width.
   * @param hiddenDim - Hidden-layer width.
   * @param gauss - Sampler returning `n` values with the given standard
   *   deviation; called once for `w1`, then once for `w2`.
   */
  constructor(
    private readonly modelDim: number,
    private readonly hiddenDim: number,
    gauss: (n: number, std: number) => Float32Array,
  ) {
    // He-style scale for the ReLU layer.
    const heStd = Math.sqrt(2 / modelDim);
    this.w1 = gauss(hiddenDim * modelDim, heStd);
    this.b1 = new Float32Array(hiddenDim);
    // Small output scale so an untrained expert contributes little at first.
    this.w2 = gauss(modelDim * hiddenDim, 0.02);
    this.b2 = new Float32Array(modelDim);

    this.gW1 = new Float32Array(this.w1.length);
    this.gB1 = new Float32Array(this.b1.length);
    this.gW2 = new Float32Array(this.w2.length);
    this.gB2 = new Float32Array(this.b2.length);
  }

  /**
   * Forward pass for one token.
   *
   * @param x - Input vector (the first `modelDim` entries are read).
   * @returns The output `y`, plus the pre-activation `pre` and post-ReLU
   *   hidden vector `h` that {@link Expert.backward} needs.
   */
  forward(x: Float32Array): { y: Float32Array; pre: Float32Array; h: Float32Array } {
    const inDim = this.modelDim;
    const hidDim = this.hiddenDim;

    const pre = new Float32Array(hidDim);
    const h = new Float32Array(hidDim);
    for (let row = 0, base = 0; row < hidDim; row++, base += inDim) {
      let total = this.b1[row]!;
      for (let col = 0; col < inDim; col++) {
        total += this.w1[base + col]! * x[col]!;
      }
      pre[row] = total;
      h[row] = relu(total);
    }

    const y = new Float32Array(inDim);
    for (let row = 0, base = 0; row < inDim; row++, base += hidDim) {
      let total = this.b2[row]!;
      for (let col = 0; col < hidDim; col++) {
        total += this.w2[base + col]! * h[col]!;
      }
      y[row] = total;
    }

    return { y, pre, h };
  }

  /**
   * Backward pass for one token: adds this token's contribution to every
   * gradient accumulator.
   *
   * @param dy - dL/dy for this expert's output.
   * @param x - The input that was given to {@link Expert.forward}.
   * @param pre - Pre-activation cache from {@link Expert.forward}.
   * @param h - Hidden-activation cache from {@link Expert.forward}.
   * @returns dL/dx.
   */
  backward(dy: Float32Array, x: Float32Array, pre: Float32Array, h: Float32Array): Float32Array {
    const inDim = this.modelDim;
    const hidDim = this.hiddenDim;

    // Second layer: bias/weight gradients and the gradient w.r.t. h.
    const gradHidden = new Float32Array(hidDim);
    for (let row = 0, base = 0; row < inDim; row++, base += hidDim) {
      const upstream = dy[row]!;
      this.gB2[row] = this.gB2[row]! + upstream;
      for (let col = 0; col < hidDim; col++) {
        const idx = base + col;
        this.gW2[idx] = this.gW2[idx]! + upstream * h[col]!;
        gradHidden[col] = gradHidden[col]! + upstream * this.w2[idx]!;
      }
    }

    // First layer: gate by relu' (1 where pre > 0, else 0), then propagate.
    const gradInput = new Float32Array(inDim);
    for (let row = 0, base = 0; row < hidDim; row++, base += inDim) {
      let gradPre = 0;
      if (pre[row]! > 0) {
        gradPre = gradHidden[row]!;
      }
      this.gB1[row] = this.gB1[row]! + gradPre;
      for (let col = 0; col < inDim; col++) {
        const idx = base + col;
        this.gW1[idx] = this.gW1[idx]! + gradPre * x[col]!;
        gradInput[col] = gradInput[col]! + gradPre * this.w1[idx]!;
      }
    }
    return gradInput;
  }

  /** The live parameter tensors, in the order w1, b1, w2, b2. */
  params(): Float32Array[] {
    const { w1, b1, w2, b2 } = this;
    return [w1, b1, w2, b2];
  }

  /** The live gradient accumulators, index-aligned with {@link Expert.params}. */
  grads(): Float32Array[] {
    const { gW1, gB1, gW2, gB2 } = this;
    return [gW1, gB1, gW2, gB2];
  }
}
|
|
160
|
+
|
|
161
|
+
/** Per-token forward intermediates retained for the backward pass. */
|
|
162
|
+
interface MoECache {
|
|
163
|
+
x: Float32Array;
|
|
164
|
+
route: RouteResult;
|
|
165
|
+
sharedPre: Float32Array;
|
|
166
|
+
sharedH: Float32Array;
|
|
167
|
+
expertOut: Float32Array[]; // per selected expert, index-aligned to route.experts
|
|
168
|
+
expertPre: Float32Array[];
|
|
169
|
+
expertH: Float32Array[];
|
|
170
|
+
}
|
|
171
|
+
|
|
172
|
+
/**
 * Accumulates router statistics over a batch to compute the load-balancing
 * auxiliary loss `E · Σ_e f_e · P_e` (Switch/GShard). Minimised (→ near 1) when
 * dispatch is uniform; large (→ near E) when the router collapses onto few
 * experts. Add it to the task loss with a small coefficient to keep experts busy.
 *
 * The running sums are kept in float64: a Float32Array counter stops
 * incrementing once it reaches 2^24 and rounds every probability add to 24
 * bits, which skews the loss on long-running accumulators.
 */
export class LoadBalanceAccumulator {
  /** Number of times each expert was selected. */
  private readonly counts: Float64Array;
  /** Sum of each expert's full-softmax router probability. */
  private readonly probSum: Float64Array;
  /** Number of routes observed. */
  private tokens = 0;

  /** @param numExperts - Number of routed experts (length of `route.probs`). */
  constructor(private readonly numExperts: number) {
    this.counts = new Float64Array(numExperts);
    this.probSum = new Float64Array(numExperts);
  }

  /**
   * Record one token's routing decision.
   *
   * @param route - The routing result: `experts` feeds the dispatch counts and
   *   `probs` feeds the mean-probability statistic.
   */
  observe(route: RouteResult): void {
    this.tokens++;
    for (const e of route.experts) this.counts[e] = this.counts[e]! + 1;
    for (let e = 0; e < this.numExperts; e++) this.probSum[e] = this.probSum[e]! + route.probs[e]!;
  }

  /** The load-balance loss over everything observed so far (0 if no tokens). */
  loss(): number {
    if (this.tokens === 0) return 0;
    const E = this.numExperts;
    // Total dispatches (= tokens·topK); `|| 1` avoids dividing by zero when
    // every observed route had an empty expert list.
    const dispatched = this.counts.reduce((a, b) => a + b, 0) || 1;
    let sum = 0;
    for (let e = 0; e < E; e++) {
      const f = this.counts[e]! / dispatched; // fraction of dispatches to e
      const p = this.probSum[e]! / this.tokens; // mean router prob for e
      sum += f * p;
    }
    return E * sum;
  }
}
|
|
205
|
+
|
|
206
|
+
/**
 * Shared-expert Mixture-of-Experts FFN:
 * `y = SharedFFN(x) + Σ_{e ∈ topk(x)} gate_e · Expert_e(x)`.
 *
 * Processes one token at a time. {@link SharedExpertMoE.forward} returns a
 * cache that {@link SharedExpertMoE.backward} consumes; gradients accumulate
 * until {@link SharedExpertMoE.zeroGrad} is called.
 */
export class SharedExpertMoE {
  /** Resolved configuration (defaults merged with the constructor argument). */
  readonly config: Required<Omit<MoEConfig, "seed">>;

  /** Router weights: numExperts × modelDim (no bias). */
  wr: Float32Array;
  /** Gradient accumulator for {@link SharedExpertMoE.wr}. */
  private gWr: Float32Array;

  /** The always-active dense expert. */
  private readonly shared: Expert;
  /** The routed experts, indexed by expert id. */
  private readonly experts: Expert[];

  /**
   * @param config - Overrides for {@link DEFAULT_MOE_CONFIG}; `seed` selects
   *   the deterministic initialisation stream.
   * @throws Error if `topK` is greater than `numExperts` or less than 1.
   */
  constructor(config: Partial<MoEConfig> = {}) {
    const cfg = { ...DEFAULT_MOE_CONFIG, ...config };
    if (cfg.topK > cfg.numExperts) {
      throw new Error(`MoE topK (${cfg.topK}) must be ≤ numExperts (${cfg.numExperts})`);
    }
    if (cfg.topK < 1) throw new Error(`MoE topK must be ≥ 1 (got ${cfg.topK})`);
    this.config = cfg;

    // Seed is coerced to uint32; a zero seed is replaced by 1.
    const rng = new SeededRng(((config.seed ?? DEFAULT_MOE_SEED) >>> 0) || 1);
    // Box–Muller sampler (presumably rng.next() is uniform in [0, 1) — confirm
    // against SeededRng). u1 is clamped away from 0 so log(u1) stays finite.
    const gauss = (n: number, std: number): Float32Array => {
      const a = new Float32Array(n);
      for (let i = 0; i < n; i++) {
        const u1 = Math.max(rng.next(), 1e-12);
        const u2 = rng.next();
        a[i] = std * Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
      }
      return a;
    };

    // Initialisation order (router, shared, experts) fixes the RNG stream and
    // therefore the cold-start weights; do not reorder.
    this.wr = gauss(cfg.numExperts * cfg.modelDim, 0.02);
    this.gWr = new Float32Array(this.wr.length);
    this.shared = new Expert(cfg.modelDim, cfg.hiddenDim, gauss);
    this.experts = Array.from({ length: cfg.numExperts }, () => new Expert(cfg.modelDim, cfg.hiddenDim, gauss));
  }

  /**
   * Route a token: router logits → top-k → combine gates + full softmax probs.
   *
   * @param x - Token vector (the first `modelDim` entries are read).
   * @returns The selected expert ids (highest logit first), their combine
   *   gates, and the softmax over all experts.
   */
  route(x: Float32Array): RouteResult {
    const { numExperts, topK, modelDim } = this.config;
    const logits = new Float32Array(numExperts);
    for (let e = 0; e < numExperts; e++) {
      let acc = 0;
      const off = e * modelDim;
      for (let i = 0; i < modelDim; i++) acc += this.wr[off + i]! * x[i]!;
      logits[e] = acc;
    }
    // Full softmax over all experts (load-balancing signal).
    const probs = softmax(logits);
    // Top-k experts by logit, descending.
    const order = Array.from({ length: numExperts }, (_, e) => e).sort((a, b) => logits[b]! - logits[a]!);
    const experts = order.slice(0, topK);
    // Combine gates = softmax over ONLY the selected logits.
    const selLogits = experts.map((e) => logits[e]!);
    const selSoft = softmax(Float32Array.from(selLogits));
    return { experts, gates: Array.from(selSoft), probs };
  }

  /**
   * Forward a single token.
   *
   * @param input - Token vector; it is copied into a `modelDim`-long
   *   Float32Array, with missing entries read as 0.
   * @returns The output, the routing decision, and a cache for
   *   {@link SharedExpertMoE.backward}.
   */
  forward(input: ArrayLike<number>): { output: Float32Array; route: RouteResult; cache: MoECache } {
    const { modelDim } = this.config;
    const x = Float32Array.from({ length: modelDim }, (_, i) => input[i] ?? 0);
    const route = this.route(x);

    const s = this.shared.forward(x);
    const output = Float32Array.from(s.y);

    const expertOut: Float32Array[] = [];
    const expertPre: Float32Array[] = [];
    const expertH: Float32Array[] = [];
    for (let m = 0; m < route.experts.length; m++) {
      const e = this.experts[route.experts[m]!]!;
      const r = e.forward(x);
      const g = route.gates[m]!;
      for (let d = 0; d < modelDim; d++) output[d] = output[d]! + g * r.y[d]!;
      expertOut.push(r.y);
      expertPre.push(r.pre);
      expertH.push(r.h);
    }

    return {
      output,
      route,
      cache: { x, route, sharedPre: s.pre, sharedH: s.h, expertOut, expertPre, expertH },
    };
  }

  /**
   * Accumulate gradients for one token given dL/d(output). Trains the shared
   * expert, the selected routed experts, and the router (so it learns to weight
   * the experts that reduce loss). Call {@link SharedExpertMoE.zeroGrad} before
   * a batch and apply an optimiser after. Load balancing is a separate signal
   * (see {@link LoadBalanceAccumulator}).
   *
   * @param dOutput - dL/d(output); missing entries are read as 0.
   * @param cache - The cache returned by {@link SharedExpertMoE.forward} for
   *   the same token.
   * @returns dL/d(input), so the FFN can sit inside a residual block.
   */
  backward(dOutput: ArrayLike<number>, cache: MoECache): Float32Array {
    const { modelDim } = this.config;
    const dOut = Float32Array.from({ length: modelDim }, (_, d) => dOutput[d] ?? 0);
    const dx = new Float32Array(modelDim);

    // Shared expert (always active) sees the full upstream gradient.
    const dxShared = this.shared.backward(dOut, cache.x, cache.sharedPre, cache.sharedH);
    for (let i = 0; i < modelDim; i++) dx[i] = dx[i]! + dxShared[i]!;

    // Routed experts: each scaled by its gate; collect dL/dgate for the router.
    const k = cache.route.experts.length;
    const dGate = new Float32Array(k);
    for (let m = 0; m < k; m++) {
      const g = cache.route.gates[m]!;
      const scaled = new Float32Array(modelDim);
      let dg = 0;
      for (let d = 0; d < modelDim; d++) {
        scaled[d] = g * dOut[d]!;
        dg += dOut[d]! * cache.expertOut[m]![d]!; // dL/dgate_m = dOut · Expert_m(x)
      }
      const dxe = this.experts[cache.route.experts[m]!]!.backward(
        scaled,
        cache.x,
        cache.expertPre[m]!,
        cache.expertH[m]!,
      );
      for (let i = 0; i < modelDim; i++) dx[i] = dx[i]! + dxe[i]!;
      dGate[m] = dg;
    }

    // Router: gates = softmax(selected logits). Backprop dGate through the
    // softmax Jacobian to the selected logits, then to Wr and the input.
    const gates = cache.route.gates;
    let dot = 0;
    for (let m = 0; m < k; m++) dot += gates[m]! * dGate[m]!;
    for (let m = 0; m < k; m++) {
      const dLogit = gates[m]! * (dGate[m]! - dot);
      const e = cache.route.experts[m]!;
      const off = e * modelDim;
      for (let i = 0; i < modelDim; i++) {
        this.gWr[off + i] = this.gWr[off + i]! + dLogit * cache.x[i]!;
        dx[i] = dx[i]! + dLogit * this.wr[off + i]!;
      }
    }
    return dx;
  }

  /**
   * Add the load-balancing auxiliary-loss gradient for one token into the router
   * gradient. `L_aux = E·Σ_e f_e·P̄_e` (Switch/GShard); `f` (per-batch dispatch
   * fractions) is treated as a stop-grad constant, so only the full softmax `P`
   * carries gradient: ∂L_aux/∂logit_j = scale·P_j·(f_j − Σ_e f_e·P_e), where the
   * caller passes `scale = auxWeight·E/T`. Keeps the router from collapsing onto a
   * few experts. Call once per token over the batch, after
   * {@link SharedExpertMoE.backward}.
   *
   * Only the router gradient is updated; nothing is returned or propagated to
   * the input.
   *
   * @param x - The token vector that was routed.
   * @param probs - Full softmax over all experts for that token.
   * @param f - Per-expert dispatch fractions for the batch.
   * @param scale - Multiplier applied to every gradient entry.
   */
  auxGradStep(x: Float32Array, probs: Float32Array, f: Float32Array, scale: number): void {
    const { numExperts, modelDim } = this.config;
    let fp = 0;
    for (let e = 0; e < numExperts; e++) fp += f[e]! * probs[e]!;
    for (let j = 0; j < numExperts; j++) {
      const coeff = scale * probs[j]! * (f[j]! - fp);
      if (coeff === 0) continue; // nothing to add for this expert row
      const off = j * modelDim;
      for (let i = 0; i < modelDim; i++) this.gWr[off + i] = this.gWr[off + i]! + coeff * x[i]!;
    }
  }

  // ── Parameters / checkpoint ────────────────────────────────────────────────

  /**
   * All trainable parameters in canonical order: router, shared, then experts.
   * The returned `data` arrays are the live tensors, not copies.
   */
  parameters(): MoEParam[] {
    const out: MoEParam[] = [{ name: "wr", data: this.wr, numel: this.wr.length }];
    const push = (prefix: string, e: Expert) => {
      const names = ["w1", "b1", "w2", "b2"];
      e.params().forEach((p, i) => out.push({ name: `${prefix}.${names[i]}`, data: p, numel: p.length }));
    };
    push("shared", this.shared);
    this.experts.forEach((e, idx) => push(`expert${idx}`, e));
    return out;
  }

  /** Gradient buffers (live, not copies), index-aligned with {@link SharedExpertMoE.parameters}. */
  gradients(): MoEParam[] {
    const out: MoEParam[] = [{ name: "wr", data: this.gWr, numel: this.gWr.length }];
    const push = (prefix: string, e: Expert) => {
      const names = ["w1", "b1", "w2", "b2"];
      e.grads().forEach((g, i) => out.push({ name: `${prefix}.${names[i]}`, data: g, numel: g.length }));
    };
    push("shared", this.shared);
    this.experts.forEach((e, idx) => push(`expert${idx}`, e));
    return out;
  }

  /** Reset every gradient accumulator to zero. */
  zeroGrad(): void {
    for (const g of this.gradients()) g.data.fill(0);
  }

  /**
   * One routed expert's weights as a standalone checkpoint (the web-paging unit).
   *
   * @param index - Expert id.
   * @returns The expert's w1, b1, w2, b2 tensors (live, not copies).
   * @throws Error if no expert exists at `index`.
   */
  exportExpert(index: number): MoEParam[] {
    const e = this.experts[index];
    if (!e) throw new Error(`exportExpert: index ${index} out of range (0..${this.config.numExperts - 1})`);
    const names = ["w1", "b1", "w2", "b2"];
    return e.params().map((p, i) => ({ name: names[i]!, data: p, numel: p.length }));
  }

  /**
   * Serialise all weights to a compact "MoE0" binary. Layout: magic, version,
   * [modelDim, hiddenDim, numExperts, topK] as six uint32 words, then params in
   * {@link SharedExpertMoE.parameters} order. fp16 (v2) halves the size; f32
   * (v1) is exact.
   *
   * @param opts - `fp16: true` writes a version-2 half-precision payload.
   */
  exportWeights(opts: { fp16?: boolean } = {}): ArrayBuffer {
    const fp16 = opts.fp16 ?? false;
    const params = this.parameters();
    const total = params.reduce((n, p) => n + p.numel, 0);
    const headerEls = 6; // magic, version, modelDim, hiddenDim, numExperts, topK
    const headerBytes = headerEls * 4;
    const buf = new ArrayBuffer(headerBytes + (fp16 ? total * 2 : total * 4));
    const head = new Uint32Array(buf, 0, headerEls);
    head[0] = MAGIC;
    head[1] = fp16 ? 2 : 1;
    head[2] = this.config.modelDim;
    head[3] = this.config.hiddenDim;
    head[4] = this.config.numExperts;
    head[5] = this.config.topK;

    const flat = new Float32Array(total);
    let o = 0;
    for (const p of params) {
      flat.set(p.data, o);
      o += p.numel;
    }
    if (fp16) new Uint16Array(buf, headerBytes, total).set(quantizeFp16(flat));
    else new Float32Array(buf, headerBytes, total).set(flat);
    return buf;
  }

  /**
   * Load weights from an "MoE0" binary, overwriting the current parameters.
   *
   * Everything is validated before any parameter is written, so a rejected
   * checkpoint leaves the model untouched. Trailing bytes after the payload
   * are ignored.
   *
   * @param buffer - A checkpoint produced by {@link SharedExpertMoE.exportWeights}.
   * @throws Error if the header is truncated, the magic is wrong, the version
   *   is not 1 (f32) or 2 (fp16), the stored dims differ from this model's
   *   config, or the payload is shorter than the parameter count requires.
   */
  loadWeights(buffer: ArrayBuffer): void {
    const headerEls = 6;
    const headerBytes = headerEls * 4;
    if (buffer.byteLength < headerBytes) {
      throw new Error("SharedExpertMoE.loadWeights: truncated checkpoint (no header)");
    }
    const head = new Uint32Array(buffer, 0, headerEls);
    if (head[0] !== MAGIC) throw new Error("SharedExpertMoE.loadWeights: bad magic (not an MoE0 checkpoint)");
    const [, version, modelDim, hiddenDim, numExperts, topK] = head;
    if (version !== 1 && version !== 2) {
      throw new Error(`SharedExpertMoE.loadWeights: unsupported checkpoint version ${version}`);
    }
    if (
      modelDim !== this.config.modelDim ||
      hiddenDim !== this.config.hiddenDim ||
      numExperts !== this.config.numExperts ||
      topK !== this.config.topK
    ) {
      throw new Error("SharedExpertMoE.loadWeights: config mismatch with checkpoint");
    }
    const params = this.parameters();
    const total = params.reduce((n, p) => n + p.numel, 0);
    const payloadBytes = total * (version === 2 ? 2 : 4);
    // ArrayBuffer.slice clamps to the buffer end, so without this check a
    // truncated f32 payload would silently overwrite only some parameters.
    if (buffer.byteLength < headerBytes + payloadBytes) {
      throw new Error(
        `SharedExpertMoE.loadWeights: truncated checkpoint (need ${headerBytes + payloadBytes} bytes, got ${buffer.byteLength})`,
      );
    }
    const flat =
      version === 2
        ? dequantizeFp16(new Uint16Array(buffer, headerBytes, total))
        : new Float32Array(buffer.slice(headerBytes, headerBytes + payloadBytes));
    let o = 0;
    for (const p of params) {
      p.data.set(flat.subarray(o, o + p.numel));
      o += p.numel;
    }
  }
}
|
|
462
|
+
|
|
463
|
+
/**
 * Numerically-stable softmax over a flat array.
 *
 * The maximum logit is subtracted before exponentiating so the largest
 * exponent is `exp(0) = 1`.
 *
 * @param logits - Input scores.
 * @returns A new Float32Array of the same length holding the normalised
 *   exponentials (empty input yields an empty array).
 */
function softmax(logits: Float32Array): Float32Array {
  const count = logits.length;
  const peak = logits.reduce((best, v) => (v > best ? v : best), -Infinity);

  const result = new Float32Array(count);
  let total = 0;
  for (let k = 0; k < count; k++) {
    result[k] = Math.exp(logits[k]! - peak);
    // Sum the float32-rounded value that was actually stored.
    total += result[k]!;
  }
  for (let k = 0; k < count; k++) {
    result[k] = result[k]! / total;
  }
  return result;
}
|
|
@@ -0,0 +1,205 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* moe_package.ts — EvermindModelPackage: the portable, publishable AI artifact.
|
|
3
|
+
*
|
|
4
|
+
* This is the unit a creator publishes to the marketplace and a buyer downloads
|
|
5
|
+
* and runs: a self-describing manifest (name, version, config, model card,
|
|
6
|
+
* integrity checksum) bundled with the trained checkpoint into one `.evermind`
|
|
7
|
+
* blob. It is the contract every downstream consumer (marketplace listing,
|
|
8
|
+
* purchase entitlement, workflow generator) reads — define it once, here.
|
|
9
|
+
*
|
|
10
|
+
* Zero-dep and isomorphic: serialises via TextEncoder/TextDecoder, runs in the
|
|
11
|
+
* browser (where models are trained) and in Node/Workers (where they execute).
|
|
12
|
+
*/
|
|
13
|
+
|
|
14
|
+
import { SharedExpertMoE, type MoEConfig } from "./moe_model.js";
|
|
15
|
+
import { EvermindLM, type EvermindLMConfig } from "../lm/evermind_lm.js";
|
|
16
|
+
|
|
17
|
+
/** First 4 bytes of a serialised package: "EVM1". */
|
|
18
|
+
const PKG_MAGIC = 0x45564d31;
|
|
19
|
+
const PKG_VERSION = 1;
|
|
20
|
+
|
|
21
|
+
/** The kinds of model an `.evermind` package can carry. */
|
|
22
|
+
export type EvermindModelType = "shared-expert-moe" | "evermind-lm";
|
|
23
|
+
|
|
24
|
+
/** Human-facing description published with the model (the "model card"). */
|
|
25
|
+
export interface EvermindModelCard {
|
|
26
|
+
description: string;
|
|
27
|
+
/** What it was trained on / intended for. */
|
|
28
|
+
trainingSummary?: string;
|
|
29
|
+
/** SPDX id or free text (e.g. "MIT", "proprietary"). */
|
|
30
|
+
license?: string;
|
|
31
|
+
author?: string;
|
|
32
|
+
tags?: string[];
|
|
33
|
+
}
|
|
34
|
+
|
|
35
|
+
/** Self-describing header for a published Evermind model. */
|
|
36
|
+
export interface EvermindModelManifest {
|
|
37
|
+
schema: "evermind.model/1";
|
|
38
|
+
name: string;
|
|
39
|
+
version: string;
|
|
40
|
+
modelType: EvermindModelType;
|
|
41
|
+
/** Flat numeric model config (the constructor args), serialised verbatim. */
|
|
42
|
+
config: Record<string, number>;
|
|
43
|
+
/** Total trainable scalar parameters (for sizing / pricing / display). */
|
|
44
|
+
paramCount: number;
|
|
45
|
+
checkpointFormat: "MoE0" | "EVL0";
|
|
46
|
+
checkpointFp16: boolean;
|
|
47
|
+
/** 32-bit FNV-1a over the checkpoint bytes — integrity for download/purchase. */
|
|
48
|
+
checksum: number;
|
|
49
|
+
card: EvermindModelCard;
|
|
50
|
+
/** ISO timestamp, caller-supplied (the engine avoids Date for determinism). */
|
|
51
|
+
createdAt?: string;
|
|
52
|
+
}
|
|
53
|
+
|
|
54
|
+
export interface PackageMeta {
|
|
55
|
+
name: string;
|
|
56
|
+
version: string;
|
|
57
|
+
card: EvermindModelCard;
|
|
58
|
+
/** Store the checkpoint in fp16 (half the size). Default false. */
|
|
59
|
+
fp16?: boolean;
|
|
60
|
+
createdAt?: string;
|
|
61
|
+
}
|
|
62
|
+
|
|
63
|
+
export interface ValidationResult {
|
|
64
|
+
ok: boolean;
|
|
65
|
+
errors: string[];
|
|
66
|
+
}
|
|
67
|
+
|
|
68
|
+
/**
 * 32-bit FNV-1a hash — a dependency-free integrity check over the checkpoint.
 *
 * @param bytes - The bytes to hash.
 * @returns The hash as an unsigned 32-bit integer (the offset basis
 *   `0x811c9dc5` for empty input).
 */
function fnv1a(bytes: Uint8Array): number {
  const FNV_PRIME = 0x01000193;
  let hash = 0x811c9dc5;
  for (const byte of bytes) {
    // XOR the byte in, then multiply modulo 2^32 (Math.imul keeps it 32-bit).
    hash = Math.imul(hash ^ byte, FNV_PRIME);
  }
  return hash >>> 0;
}
|
|
77
|
+
|
|
78
|
+
/**
 * A trained model packaged for publishing: manifest + checkpoint. Build one with
 * {@link EvermindModelPackage.fromModel} or {@link EvermindModelPackage.fromLM},
 * ship {@link EvermindModelPackage.toBlob}, and a buyer reconstitutes it with
 * {@link EvermindModelPackage.fromBlob} → {@link EvermindModelPackage.validate}
 * → {@link EvermindModelPackage.loadModel} / {@link EvermindModelPackage.loadLM}.
 */
export class EvermindModelPackage {
  /**
   * @param manifest - Self-describing header for the model.
   * @param checkpoint - Serialised weights the manifest's checksum refers to.
   */
  constructor(
    readonly manifest: EvermindModelManifest,
    readonly checkpoint: ArrayBuffer,
  ) {}

  /** True only for finite integers ≥ 1; rejects undefined, NaN and non-numbers. */
  private static isPositiveInt(value: unknown): value is number {
    return typeof value === "number" && Number.isInteger(value) && value > 0;
  }

  /**
   * Package a trained model with its publishing metadata.
   *
   * @param model - The MoE layer to export.
   * @param meta - Name, version, model card and storage options.
   */
  static fromModel(model: SharedExpertMoE, meta: PackageMeta): EvermindModelPackage {
    const fp16 = meta.fp16 ?? false;
    const checkpoint = model.exportWeights({ fp16 });
    const manifest: EvermindModelManifest = {
      schema: "evermind.model/1",
      name: meta.name,
      version: meta.version,
      modelType: "shared-expert-moe",
      config: model.config,
      paramCount: model.parameters().reduce((n, p) => n + p.numel, 0),
      checkpointFormat: "MoE0",
      checkpointFp16: fp16,
      checksum: fnv1a(new Uint8Array(checkpoint)),
      card: meta.card,
      // Only emit createdAt when the caller supplied one.
      ...(meta.createdAt ? { createdAt: meta.createdAt } : {}),
    };
    return new EvermindModelPackage(manifest, checkpoint);
  }

  /**
   * Package a trained generative {@link EvermindLM} — the runnable marketplace AI.
   *
   * @param lm - The language model to export.
   * @param meta - Name, version, model card and storage options.
   */
  static fromLM(lm: EvermindLM, meta: PackageMeta): EvermindModelPackage {
    const fp16 = meta.fp16 ?? false;
    const checkpoint = lm.exportWeights({ fp16 });
    const manifest: EvermindModelManifest = {
      schema: "evermind.model/1",
      name: meta.name,
      version: meta.version,
      modelType: "evermind-lm",
      config: lm.config as unknown as Record<string, number>,
      paramCount: lm.parameters().reduce((n, p) => n + p.data.length, 0),
      checkpointFormat: "EVL0",
      checkpointFp16: fp16,
      checksum: fnv1a(new Uint8Array(checkpoint)),
      card: meta.card,
      // Only emit createdAt when the caller supplied one.
      ...(meta.createdAt ? { createdAt: meta.createdAt } : {}),
    };
    return new EvermindModelPackage(manifest, checkpoint);
  }

  /**
   * Serialise to a single `.evermind` blob: three uint32 header words (magic,
   * version, manifest byte length), the UTF-8 JSON manifest, then the checkpoint.
   */
  toBlob(): ArrayBuffer {
    const manifestBytes = new TextEncoder().encode(JSON.stringify(this.manifest));
    const headerBytes = 12; // magic, version, manifestLen
    const out = new ArrayBuffer(headerBytes + manifestBytes.byteLength + this.checkpoint.byteLength);
    const head = new Uint32Array(out, 0, 3);
    head[0] = PKG_MAGIC;
    head[1] = PKG_VERSION;
    head[2] = manifestBytes.byteLength;
    new Uint8Array(out, headerBytes, manifestBytes.byteLength).set(manifestBytes);
    new Uint8Array(out, headerBytes + manifestBytes.byteLength).set(new Uint8Array(this.checkpoint));
    return out;
  }

  /**
   * Parse a `.evermind` blob.
   *
   * The checkpoint is copied out of `buffer`, so the returned package does not
   * alias the input. Only the container is checked here; call
   * {@link EvermindModelPackage.validate} before trusting the contents.
   *
   * @throws Error on a truncated blob, bad magic, an unsupported container
   *   version, or a manifest that is not a JSON object.
   * @throws SyntaxError if the manifest bytes are not valid JSON.
   */
  static fromBlob(buffer: ArrayBuffer): EvermindModelPackage {
    if (buffer.byteLength < 12) throw new Error("EvermindModelPackage.fromBlob: truncated (no header)");
    const head = new Uint32Array(buffer, 0, 3);
    if (head[0] !== PKG_MAGIC) throw new Error("EvermindModelPackage.fromBlob: bad magic (not an .evermind package)");
    if (head[1] !== PKG_VERSION) {
      throw new Error(`EvermindModelPackage.fromBlob: unsupported package version ${head[1]}`);
    }
    const manifestLen = head[2]!;
    const headerBytes = 12;
    if (headerBytes + manifestLen > buffer.byteLength) {
      throw new Error("EvermindModelPackage.fromBlob: truncated (manifest length exceeds blob)");
    }
    const manifestBytes = new Uint8Array(buffer, headerBytes, manifestLen);
    const parsed: unknown = JSON.parse(new TextDecoder().decode(manifestBytes));
    // JSON such as `null`, `42` or `[]` parses fine but cannot be a manifest;
    // reject it here so validate() never dereferences a non-object.
    if (typeof parsed !== "object" || parsed === null || Array.isArray(parsed)) {
      throw new Error("EvermindModelPackage.fromBlob: manifest is not a JSON object");
    }
    const checkpoint = buffer.slice(headerBytes + manifestLen);
    return new EvermindModelPackage(parsed as EvermindModelManifest, checkpoint);
  }

  /**
   * Verify integrity + structural sanity before trusting a downloaded package.
   *
   * Checks the schema id, that the config fields needed by the declared model
   * type are present positive integers with `topK ≤ numExperts`, and that the
   * FNV-1a checksum of the checkpoint matches the manifest.
   *
   * @returns `ok: true` with no errors, or `ok: false` with one message per
   *   failed check. Never throws for a bad config.
   */
  validate(): ValidationResult {
    const errors: string[] = [];
    const { manifest } = this;
    if (manifest.schema !== "evermind.model/1") errors.push(`unknown schema: ${manifest.schema}`);

    // Treat a missing / non-object config as empty so every field check fails.
    const cfg: Record<string, unknown> =
      typeof manifest.config === "object" && manifest.config !== null ? manifest.config : {};
    // `undefined <= 0` and `NaN <= 0` are both false, so a plain range test
    // would accept missing fields; require real positive integers instead.
    const positive = (key: string): boolean => EvermindModelPackage.isPositiveInt(cfg[key]);
    const routingOk =
      positive("numExperts") && positive("topK") && (cfg["topK"] as number) <= (cfg["numExperts"] as number);

    if (manifest.modelType === "shared-expert-moe") {
      if (!(positive("modelDim") && positive("hiddenDim") && routingOk)) {
        errors.push("invalid config");
      }
    } else if (manifest.modelType === "evermind-lm") {
      if (!(positive("vocabSize") && positive("dModel") && positive("numLayers") && routingOk)) {
        errors.push("invalid config");
      }
    } else {
      errors.push(`unsupported modelType: ${String(manifest.modelType)}`);
    }

    const actual = fnv1a(new Uint8Array(this.checkpoint));
    if (actual !== manifest.checksum) {
      errors.push(`checksum mismatch (manifest ${manifest.checksum}, actual ${actual}) — corrupt or tampered checkpoint`);
    }
    return { ok: errors.length === 0, errors };
  }

  /**
   * Reconstruct the bare MoE layer.
   *
   * @throws Error if {@link EvermindModelPackage.validate} fails or the
   *   package is not of type `shared-expert-moe`.
   */
  loadModel(): SharedExpertMoE {
    const v = this.validate();
    if (!v.ok) throw new Error(`EvermindModelPackage.loadModel: ${v.errors.join("; ")}`);
    if (this.manifest.modelType !== "shared-expert-moe") {
      throw new Error(`loadModel: package is '${this.manifest.modelType}', use loadLM()`);
    }
    const model = new SharedExpertMoE(this.manifest.config as Partial<MoEConfig>);
    model.loadWeights(this.checkpoint);
    return model;
  }

  /**
   * Reconstruct the runnable generative model — what a marketplace buyer runs.
   *
   * @throws Error if {@link EvermindModelPackage.validate} fails or the
   *   package is not of type `evermind-lm`.
   */
  loadLM(): EvermindLM {
    const v = this.validate();
    if (!v.ok) throw new Error(`EvermindModelPackage.loadLM: ${v.errors.join("; ")}`);
    if (this.manifest.modelType !== "evermind-lm") {
      throw new Error(`loadLM: package is '${this.manifest.modelType}', use loadModel()`);
    }
    const lm = new EvermindLM(this.manifest.config as unknown as EvermindLMConfig);
    lm.loadWeights(this.checkpoint);
    return lm;
  }
}
|