@seanhogg/builderforce-memory-engine 2026.6.27 → 2026.6.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +6 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +5 -0
- package/dist/index.js.map +1 -1
- package/dist/lm/evermind_lm.d.ts +148 -0
- package/dist/lm/evermind_lm.d.ts.map +1 -0
- package/dist/lm/evermind_lm.js +479 -0
- package/dist/lm/evermind_lm.js.map +1 -0
- package/dist/lm/index.d.ts +6 -0
- package/dist/lm/index.d.ts.map +1 -0
- package/dist/lm/index.js +5 -0
- package/dist/lm/index.js.map +1 -0
- package/dist/model/attention_block.js +1 -1
- package/dist/model/attention_block.js.map +1 -1
- package/dist/model/mamba_model.js +1 -1
- package/dist/model/mamba_model.js.map +1 -1
- package/dist/moe/index.d.ts +10 -0
- package/dist/moe/index.d.ts.map +1 -0
- package/dist/moe/index.js +7 -0
- package/dist/moe/index.js.map +1 -0
- package/dist/moe/moe_model.d.ts +134 -0
- package/dist/moe/moe_model.d.ts.map +1 -0
- package/dist/moe/moe_model.js +415 -0
- package/dist/moe/moe_model.js.map +1 -0
- package/dist/moe/moe_package.d.ts +81 -0
- package/dist/moe/moe_package.d.ts.map +1 -0
- package/dist/moe/moe_package.js +157 -0
- package/dist/moe/moe_package.js.map +1 -0
- package/dist/moe/moe_trainer.d.ts +53 -0
- package/dist/moe/moe_trainer.d.ts.map +1 -0
- package/dist/moe/moe_trainer.js +93 -0
- package/dist/moe/moe_trainer.js.map +1 -0
- package/dist/optim/adamw.d.ts +32 -0
- package/dist/optim/adamw.d.ts.map +1 -0
- package/dist/optim/adamw.js +52 -0
- package/dist/optim/adamw.js.map +1 -0
- package/package.json +1 -1
- package/src/index.ts +28 -0
- package/src/lm/evermind_lm.ts +558 -0
- package/src/lm/index.ts +6 -0
- package/src/model/attention_block.ts +1 -1
- package/src/model/mamba_model.ts +1 -1
- package/src/moe/index.ts +23 -0
- package/src/moe/moe_model.ts +475 -0
- package/src/moe/moe_package.ts +205 -0
- package/src/moe/moe_trainer.ts +134 -0
- package/src/optim/adamw.ts +72 -0
|
@@ -0,0 +1,558 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evermind_lm.ts — EvermindLM: a small but complete generative language model.
|
|
3
|
+
*
|
|
4
|
+
* This is what turns a trained checkpoint into an *AI that generates text* (the
|
|
5
|
+
* thing a marketplace buyer actually runs). Architecture (Mamba-flavoured, the
|
|
6
|
+
* minimal exact-gradient CPU reference):
|
|
7
|
+
*
|
|
8
|
+
* x_t = Embed[token_t]
|
|
9
|
+
* per layer:
|
|
10
|
+
* x_t += DepthwiseCausalConv(RMSNorm(x))_t // temporal mixing (pre-norm short conv)
|
|
11
|
+
* x_t += SharedExpertMoE(RMSNorm(x_t)) // per-position channel mixing (pre-norm, sparse)
|
|
12
|
+
* logits_t = x_t · Embedᵀ // tied output head
|
|
13
|
+
*
|
|
14
|
+
* The token mixer is a depthwise causal convolution (each channel sees a short
|
|
15
|
+
* window of its own past — Mamba's pre-conv) and the channel mixer is the
|
|
16
|
+
* shared-expert MoE, so the model is genuinely sparse. Embeddings are tied
|
|
17
|
+
* (input lookup == output head), which the gradient code accounts for.
|
|
18
|
+
*
|
|
19
|
+
* Pure CPU, exact forward + backward (finite-difference checked), reusing the
|
|
20
|
+
* engine's MoE, cross-entropy, and AdamW. The WGSL/WebGPU path is a future
|
|
21
|
+
* acceleration with the same shapes.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import { SharedExpertMoE } from "../moe/moe_model.js";
|
|
25
|
+
import { crossEntropyLoss, crossEntropyGrad } from "../training/autograd.js";
|
|
26
|
+
import { AdamW, type AdamWOptions } from "../optim/adamw.js";
|
|
27
|
+
import { SeededRng } from "../utils/rng.js";
|
|
28
|
+
import { quantizeFp16, dequantizeFp16 } from "../utils/quantization.js";
|
|
29
|
+
|
|
30
|
+
/**
 * Construction options for {@link EvermindLM}.
 *
 * Only `vocabSize` is mandatory. Every other field may be omitted; the
 * constructor then substitutes the default documented on that field.
 */
export interface EvermindLMConfig {
  /** Vocabulary size. */
  vocabSize: number;
  /** Model (channel) dimension. Default 64. */
  dModel?: number;
  /** Number of (conv + MoE) blocks. Default 2. */
  numLayers?: number;
  /** Causal conv kernel width. Default 3. */
  convKernel?: number;
  /** Hidden width of each MoE expert FFN. Default 2·dModel. */
  hiddenDim?: number;
  /** Routed experts per MoE layer. Default 4. */
  numExperts?: number;
  /** Experts activated per token. Default 2. */
  topK?: number;
  /** Deterministic init seed. Default {@link DEFAULT_LM_SEED}. */
  seed?: number;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Fallback hyper-parameters used when a config field is omitted. The type
 * excludes `vocabSize` and `seed`, so neither has an entry here.
 */
export const DEFAULT_LM_CONFIG: Required<Omit<EvermindLMConfig, "seed" | "vocabSize">> = {
  dModel: 64,
  numLayers: 2,
  convKernel: 3,
  hiddenDim: 128,
  numExperts: 4,
  topK: 2,
};

/** Default init seed: the ASCII bytes "EVLM" packed into one 32-bit value. */
export const DEFAULT_LM_SEED = 0x45564c4d;

/** Checkpoint magic number: the ASCII bytes "EVL0" packed into one 32-bit value. */
const MAGIC = 0x45564c30;
|
|
60
|
+
|
|
61
|
+
/**
 * Local structural description of one per-token MoE cache entry. `forward`
 * casts whatever the MoE layer returns as its cache to this shape and hands it
 * back untouched to the MoE's `backward`; this file never reads the fields.
 */
interface MoECacheLike {
  x: Float32Array;
  route: {
    experts: number[];
    gates: number[];
    probs: Float32Array;
  };
  sharedPre: Float32Array;
  sharedH: Float32Array;
  expertOut: Array<Float32Array>;
  expertPre: Array<Float32Array>;
  expertH: Array<Float32Array>;
}
|
|
70
|
+
|
|
71
|
+
/** Activations one (conv + MoE) layer saves during `forward` for the backward pass. */
interface LayerCache {
  /** Layer input per position; also the residual base of the conv sub-block. */
  layerIn: Array<Float32Array>;
  /** RMSNorm(layerIn) per position, i.e. what the conv actually reads. */
  normedConv: Array<Float32Array>;
  /** RMS denominator of the conv pre-norm, one per position. */
  rmsConv: number[];
  /** Conv sub-block output per position; also the residual base of the MoE sub-block. */
  afterConv: Array<Float32Array>;
  /** RMS denominator of the MoE pre-norm, one per position. */
  rmsMoe: number[];
  /** MoE cache entry, one per position. */
  moeCache: MoECacheLike[];
}
|
|
79
|
+
|
|
80
|
+
/** Everything `forward` hands to `lossAndBackward`. */
interface ForwardCache {
  /** The input token ids, as passed to `forward`. */
  tokens: number[];
  /** One entry per layer, in forward order. */
  layers: LayerCache[];
  /** Final hidden state per position, i.e. the input to the tied output head. */
  finalX: Array<Float32Array>;
}
|
|
85
|
+
|
|
86
|
+
/**
 * Minimal tokenizer contract used by text-level generation: anything that can
 * turn text into token ids and back (the engine's `BPETokenizer` fits).
 */
export interface TextCodec {
  /** Convert text to token ids. */
  encode(text: string): number[];
  /** Convert token ids back to text. */
  decode(ids: number[]): string;
}
|
|
91
|
+
|
|
92
|
+
/** Options controlling autoregressive generation. */
export interface LMGenerateOptions {
  /** Upper bound on the number of tokens to generate. */
  maxNewTokens: number;
  /** Sampling temperature; ≤0 ⇒ greedy argmax. Default 0 (greedy). */
  temperature?: number;
  /** Deterministic sampler seed (only used when temperature > 0). */
  seed?: number;
  /** Stop generating when this token id is produced. */
  stopToken?: number;
}
|
|
101
|
+
|
|
102
|
+
export class EvermindLM {
|
|
103
|
+
/** Fully resolved hyper-parameters (defaults applied; `seed` is not part of the type). */
readonly config: Required<Omit<EvermindLMConfig, "seed">>;

/** Tied token embedding / output head: vocabSize × dModel (row-major). */
emb: Float32Array;
/** Gradient accumulator for `emb`; assigned once in the constructor and then only mutated in place. */
private readonly gEmb: Float32Array;
/** Per-layer depthwise causal conv kernels: dModel × convKernel. */
private readonly conv: Float32Array[];
/** Gradient accumulators matching `conv`, one per layer. */
private readonly gConv: Float32Array[];
/** Per-layer pre-conv RMSNorm gains (dModel each). */
private readonly nConv: Float32Array[];
/** Gradient accumulators matching `nConv`. */
private readonly gNConv: Float32Array[];
/** Per-layer pre-MoE RMSNorm gains (dModel each). */
private readonly nMoe: Float32Array[];
/** Gradient accumulators matching `nMoe`. */
private readonly gNMoe: Float32Array[];
/** Per-layer channel mixer. */
private readonly moe: SharedExpertMoE[];
|
|
118
|
+
|
|
119
|
+
/**
 * Build a freshly initialised model.
 *
 * Omitted hyper-parameters fall back to `DEFAULT_LM_CONFIG`, except
 * `hiddenDim`, which defaults to `2 · dModel`. Initialisation is driven by a
 * seeded RNG, so the same config (including seed) reproduces the same weights.
 *
 * @param config Model hyper-parameters and optional init seed.
 * @throws Error if `vocabSize` is not > 0, if `topK` exceeds `numExperts`, or
 *   if any size is not an integer at or above its minimum.
 */
constructor(config: EvermindLMConfig) {
  const dModel = config.dModel ?? DEFAULT_LM_CONFIG.dModel;
  const cfg: Required<Omit<EvermindLMConfig, "seed">> = {
    vocabSize: config.vocabSize,
    dModel,
    numLayers: config.numLayers ?? DEFAULT_LM_CONFIG.numLayers,
    convKernel: config.convKernel ?? DEFAULT_LM_CONFIG.convKernel,
    hiddenDim: config.hiddenDim ?? dModel * 2,
    numExperts: config.numExperts ?? DEFAULT_LM_CONFIG.numExperts,
    topK: config.topK ?? DEFAULT_LM_CONFIG.topK,
  };
  if (cfg.vocabSize <= 0) throw new Error("EvermindLM: vocabSize must be > 0");
  if (cfg.topK > cfg.numExperts) throw new Error("EvermindLM: topK must be ≤ numExperts");

  // Sizes become typed-array lengths and index strides. NaN or fractional
  // values would slip past the comparisons above and yield a silently
  // malformed model, so reject them here.
  const requireInt = (name: string, value: number, min: number): void => {
    if (!Number.isInteger(value) || value < min) {
      throw new Error(`EvermindLM: ${name} must be an integer ≥ ${min} (got ${value})`);
    }
  };
  requireInt("vocabSize", cfg.vocabSize, 1);
  requireInt("dModel", cfg.dModel, 1);
  requireInt("numLayers", cfg.numLayers, 0);
  requireInt("convKernel", cfg.convKernel, 1);
  requireInt("hiddenDim", cfg.hiddenDim, 1);
  requireInt("numExperts", cfg.numExperts, 0);
  requireInt("topK", cfg.topK, 0);
  this.config = cfg;

  // Coerce to uint32; a zero seed is replaced by 1.
  const seed = (config.seed ?? DEFAULT_LM_SEED) >>> 0 || 1;
  const rng = new SeededRng(seed);
  // Box–Muller: two uniform draws per Gaussian sample. u1 is clamped away
  // from 0 so log(u1) stays finite.
  const gauss = (n: number, std: number): Float32Array => {
    const a = new Float32Array(n);
    for (let i = 0; i < n; i++) {
      const u1 = Math.max(rng.next(), 1e-12);
      const u2 = rng.next();
      a[i] = std * Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
    }
    return a;
  };

  this.emb = gauss(cfg.vocabSize * cfg.dModel, 0.02);
  this.gEmb = new Float32Array(this.emb.length);
  this.conv = [];
  this.gConv = [];
  this.nConv = [];
  this.gNConv = [];
  this.nMoe = [];
  this.gNMoe = [];
  this.moe = [];
  for (let l = 0; l < cfg.numLayers; l++) {
    // Tap 0 (the current position) starts at 1 and every history tap at 0,
    // so the conv begins as an identity map on its (normalised) input.
    const k = new Float32Array(cfg.dModel * cfg.convKernel);
    for (let c = 0; c < cfg.dModel; c++) k[c * cfg.convKernel] = 1;
    this.conv.push(k);
    this.gConv.push(new Float32Array(k.length));
    // RMSNorm gains start at 1 (identity scale).
    this.nConv.push(new Float32Array(cfg.dModel).fill(1));
    this.gNConv.push(new Float32Array(cfg.dModel));
    this.nMoe.push(new Float32Array(cfg.dModel).fill(1));
    this.gNMoe.push(new Float32Array(cfg.dModel));
    // Each MoE layer gets a distinct seed for varied expert init.
    this.moe.push(
      new SharedExpertMoE({
        modelDim: cfg.dModel,
        hiddenDim: cfg.hiddenDim,
        numExperts: cfg.numExperts,
        topK: cfg.topK,
        seed: seed + 1 + l,
      }),
    );
  }
}
|
|
179
|
+
|
|
180
|
+
// ── Forward ────────────────────────────────────────────────────────────────
|
|
181
|
+
|
|
182
|
+
/**
 * Run the model over a token sequence.
 *
 * Each layer applies two pre-norm residual sub-blocks:
 * `x += conv(RMSNorm(x))` (depthwise, causal) followed by
 * `x += MoE(RMSNorm(x))` (per position). The final hidden states are
 * projected through the tied embedding to produce logits.
 *
 * @param tokens Token ids; each must be an integer in `[0, vocabSize)`.
 * @returns Per-position logits (`vocabSize` values each) and the activation
 *   cache that `lossAndBackward` consumes.
 * @throws RangeError if a token id is not an integer in `[0, vocabSize)`.
 */
forward(tokens: number[]): { logits: Float32Array[]; cache: ForwardCache } {
  const { dModel, convKernel, numLayers, vocabSize } = this.config;
  const T = tokens.length;

  // Embed. Ids are checked first: an id outside the table would read past
  // `emb` and silently fill the row (and everything downstream) with NaN.
  let x: Float32Array[] = tokens.map((tok, t) => {
    if (!Number.isInteger(tok) || tok < 0 || tok >= vocabSize) {
      throw new RangeError(
        `EvermindLM.forward: token id ${tok} at position ${t} is not an integer in [0, ${vocabSize})`,
      );
    }
    // slice() copies, so later residual updates never alias the embedding table.
    return this.emb.slice(tok * dModel, (tok + 1) * dModel);
  });

  const layers: LayerCache[] = [];
  for (let l = 0; l < numLayers; l++) {
    const layerIn = x;
    const ker = this.conv[l]!;
    const nConv = this.nConv[l]!;
    const nMoe = this.nMoe[l]!;
    const moe = this.moe[l]!;

    // Pre-norm → depthwise causal conv → residual.
    const normedConv: Float32Array[] = [];
    const rmsConv: number[] = [];
    for (let t = 0; t < T; t++) {
      const { y, r } = rmsNorm(layerIn[t]!, nConv);
      normedConv.push(y);
      rmsConv.push(r);
    }
    const afterConv: Float32Array[] = [];
    for (let t = 0; t < T; t++) {
      const out = Float32Array.from(layerIn[t]!); // residual base
      for (let c = 0; c < dModel; c++) {
        let acc = 0;
        // Tap j reads position t − j; taps reaching before the sequence start are skipped.
        for (let j = 0; j < convKernel; j++) {
          const ti = t - j;
          if (ti >= 0) acc += ker[c * convKernel + j]! * normedConv[ti]![c]!;
        }
        out[c] = out[c]! + acc;
      }
      afterConv.push(out);
    }

    // Pre-norm → MoE channel mixer → residual.
    const rmsMoe: number[] = [];
    const moeCache: MoECacheLike[] = [];
    const afterMoe: Float32Array[] = [];
    for (let t = 0; t < T; t++) {
      const { y, r } = rmsNorm(afterConv[t]!, nMoe);
      rmsMoe.push(r);
      const out = Float32Array.from(afterConv[t]!); // residual base
      const mr = moe.forward(y);
      for (let c = 0; c < dModel; c++) out[c] = out[c]! + mr.output[c]!;
      afterMoe.push(out);
      // The cache is opaque here; it is only handed back to the MoE's backward().
      moeCache.push(mr.cache as unknown as MoECacheLike);
    }

    layers.push({ layerIn, normedConv, rmsConv, afterConv, rmsMoe, moeCache });
    x = afterMoe;
  }

  // Tied head: logits_t[v] = x_t · emb[v].
  const logits: Float32Array[] = x.map((xt) => {
    const lg = new Float32Array(vocabSize);
    for (let v = 0; v < vocabSize; v++) {
      let acc = 0;
      const off = v * dModel;
      for (let c = 0; c < dModel; c++) acc += xt[c]! * this.emb[off + c]!;
      lg[v] = acc;
    }
    return lg;
  });

  return { logits, cache: { tokens, layers, finalX: x } };
}
|
|
256
|
+
|
|
257
|
+
// ── Loss + backward ──────────────────────────────────────────────────────────
|
|
258
|
+
|
|
259
|
+
/**
 * Next-token cross-entropy with gradient accumulation.
 *
 * Position `t` predicts `tokens[t + 1]`, so a sequence of length T contributes
 * T − 1 terms and the returned value is their mean. Gradients are ADDED to the
 * model's gradient buffers: call `zeroGrad` beforehand and run an optimiser
 * step afterwards.
 *
 * @param tokens Training sequence of token ids.
 * @returns Mean loss over the predicted positions; 0 (with no gradient
 *   accumulated) when the sequence has fewer than two tokens.
 */
lossAndBackward(tokens: number[]): number {
  const seqLen = tokens.length;
  // A lone token has no successor to predict.
  if (seqLen < 2) return 0;

  const { dModel, convKernel, numLayers, vocabSize } = this.config;
  const { logits, cache } = this.forward(tokens);

  const numTargets = seqLen - 1; // positions 0..T-2 each predict their successor
  const scale = 1 / numTargets;

  // Head: gradient w.r.t. the final hidden states, plus the head's share of
  // the tied-embedding gradient. The last position has no target, so its row stays zero.
  let grad: Float32Array[] = Array.from({ length: seqLen }, () => new Float32Array(dModel));
  let meanLoss = 0;
  for (let pos = 0; pos < numTargets; pos++) {
    const target = tokens[pos + 1]!;
    const posLogits = logits[pos]!;
    meanLoss += crossEntropyLoss(posLogits, target) * scale;
    const dLogits = crossEntropyGrad(posLogits, target); // probs - onehot
    const hidden = cache.finalX[pos]!;
    const gradRow = grad[pos]!;
    for (let v = 0; v < vocabSize; v++) {
      const g = dLogits[v]! * scale;
      if (g === 0) continue;
      const base = v * dModel;
      for (let c = 0; c < dModel; c++) {
        this.gEmb[base + c] = this.gEmb[base + c]! + g * hidden[c]!; // head → emb
        gradRow[c] = gradRow[c]! + g * this.emb[base + c]!; // head → x_t
      }
    }
  }

  // Walk the layers backwards; `grad` always holds dL/d(output of layer l).
  for (let l = numLayers - 1; l >= 0; l--) {
    const saved = cache.layers[l]!;
    const kernel = this.conv[l]!;
    const kernelGrad = this.gConv[l]!;
    const convGain = this.nConv[l]!;
    const convGainGrad = this.gNConv[l]!;
    const moeGain = this.nMoe[l]!;
    const moeGainGrad = this.gNMoe[l]!;
    const moe = this.moe[l]!;

    // MoE sub-block: afterMoe = afterConv + MoE(RMSNorm(afterConv, nMoe)).
    const dAfterConv: Float32Array[] = grad.map((dOut, t) => {
      const dNormed = moe.backward(dOut, saved.moeCache[t] as never);
      const { dx, dgain } = rmsNormBackward(dNormed, saved.afterConv[t]!, saved.rmsMoe[t]!, moeGain);
      for (let c = 0; c < dModel; c++) moeGainGrad[c] = moeGainGrad[c]! + dgain[c]!;
      // Residual passthrough plus the branch through norm + MoE.
      return dOut.map((g, c) => g + dx[c]!);
    });

    // Conv sub-block: afterConv = layerIn + conv(RMSNorm(layerIn, nConv)).
    const dNormedConv: Float32Array[] = Array.from({ length: seqLen }, () => new Float32Array(dModel));
    for (let t = 0; t < seqLen; t++) {
      const dOutRow = dAfterConv[t]!;
      for (let c = 0; c < dModel; c++) {
        const dmix = dOutRow[c]!;
        if (dmix === 0) continue;
        // Only taps whose source position t − j exists took part in the forward pass.
        const taps = Math.min(convKernel, t + 1);
        for (let j = 0; j < taps; j++) {
          const src = t - j;
          const k = c * convKernel + j;
          kernelGrad[k] = kernelGrad[k]! + dmix * saved.normedConv[src]![c]!;
          dNormedConv[src]![c] = dNormedConv[src]![c]! + dmix * kernel[k]!;
        }
      }
    }
    grad = dAfterConv.map((residual, t) => {
      const { dx, dgain } = rmsNormBackward(dNormedConv[t]!, saved.layerIn[t]!, saved.rmsConv[t]!, convGain);
      for (let c = 0; c < dModel; c++) convGainGrad[c] = convGainGrad[c]! + dgain[c]!;
      // Residual passthrough plus the branch through norm + conv.
      return residual.map((g, c) => g + dx[c]!);
    });
  }

  // Embedding lookup: the layer-0 input gradient lands on each token's row.
  tokens.forEach((tok, t) => {
    const base = tok * dModel;
    const row = grad[t]!;
    for (let c = 0; c < dModel; c++) this.gEmb[base + c] = this.gEmb[base + c]! + row[c]!;
  });

  return meanLoss;
}
|
|
346
|
+
|
|
347
|
+
// ── Generation ───────────────────────────────────────────────────────────────
|
|
348
|
+
|
|
349
|
+
/**
 * Text-in / text-out generation: tokenise the prompt with `codec`, generate
 * new token ids, and decode ONLY those new ids back to text. The model's
 * `vocabSize` must match the codec's vocabulary.
 *
 * @param prompt Text to condition on.
 * @param codec Any tokenizer exposing encode/decode (the engine's `BPETokenizer` satisfies it).
 * @param opts Generation options, passed through to `generate`.
 * @returns The decoded continuation (the prompt itself is not included).
 */
generateText(prompt: string, codec: TextCodec, opts: LMGenerateOptions): string {
  const promptIds = codec.encode(prompt);
  const newIds = this.generate(promptIds, opts);
  return codec.decode(newIds);
}
|
|
358
|
+
|
|
359
|
+
/**
 * Greedy / temperature-sampled autoregressive generation.
 *
 * Each step re-runs `forward` over the whole context and picks the next token
 * from the logits at the last position: argmax when `temperature` ≤ 0 (the
 * default), otherwise a seeded temperature sample.
 *
 * @param prompt Token ids to condition on. If empty, token id 0 is used as the
 *   context so there is a position to read logits from.
 * @param opts Generation options; see `LMGenerateOptions`.
 * @returns Only the NEW token ids (including the stop token, if one was hit).
 */
generate(prompt: number[], opts: LMGenerateOptions): number[] {
  const temperature = opts.temperature ?? 0;
  const rng = temperature > 0 ? new SeededRng((opts.seed ?? 1) >>> 0 || 1) : null;
  // The placeholder for an empty prompt stays in the context for every step.
  // Dropping it after the first token would make later steps condition on a
  // different prefix than the one that produced that first token.
  const context = prompt.length > 0 ? [...prompt] : [0];
  const produced: number[] = [];
  for (let n = 0; n < opts.maxNewTokens; n++) {
    const { logits } = this.forward(context);
    const last = logits[logits.length - 1]!;
    const next = rng ? sampleTemperature(last, temperature, rng) : argmax(last);
    produced.push(next);
    context.push(next);
    if (opts.stopToken !== undefined && next === opts.stopToken) break;
  }
  return produced;
}
|
|
375
|
+
|
|
376
|
+
// ── Parameters / checkpoint ──────────────────────────────────────────────────
|
|
377
|
+
|
|
378
|
+
/**
 * Every trainable tensor wrapped as `{ data }` (the shape AdamW expects), in
 * canonical order: the tied embedding first, then per layer the conv kernel,
 * the conv-norm gain, the MoE-norm gain and finally the MoE's own parameters.
 * The wrappers reference the live buffers, not copies.
 */
parameters(): { data: Float32Array }[] {
  const list: { data: Float32Array }[] = [{ data: this.emb }];
  for (let layer = 0; layer < this.config.numLayers; layer++) {
    list.push({ data: this.conv[layer]! });
    list.push({ data: this.nConv[layer]! });
    list.push({ data: this.nMoe[layer]! });
    for (const moeParam of this.moe[layer]!.parameters()) {
      list.push({ data: moeParam.data });
    }
  }
  return list;
}
|
|
387
|
+
|
|
388
|
+
/**
 * Gradient buffers in exactly the order `parameters` uses, so entry i here is
 * the gradient of entry i there. The wrappers reference the live buffers.
 */
gradients(): { data: Float32Array }[] {
  const list: { data: Float32Array }[] = [{ data: this.gEmb }];
  for (let layer = 0; layer < this.config.numLayers; layer++) {
    list.push({ data: this.gConv[layer]! });
    list.push({ data: this.gNConv[layer]! });
    list.push({ data: this.gNMoe[layer]! });
    for (const moeGrad of this.moe[layer]!.gradients()) {
      list.push({ data: moeGrad.data });
    }
  }
  return list;
}
|
|
397
|
+
|
|
398
|
+
/** Reset every gradient accumulator (embedding, conv, norm gains, MoE) to zero in place. */
zeroGrad(): void {
  this.gEmb.fill(0);
  for (let layer = 0; layer < this.config.numLayers; layer++) {
    const layerBuffers = [this.gConv[layer]!, this.gNConv[layer]!, this.gNMoe[layer]!];
    for (const buffer of layerBuffers) buffer.fill(0);
    this.moe[layer]!.zeroGrad();
  }
}
|
|
407
|
+
|
|
408
|
+
/**
 * Serialise the model to an "EVL0" binary.
 *
 * Layout: nine uint32 header words (magic, version, vocabSize, dModel,
 * numLayers, convKernel, hiddenDim, numExperts, topK) followed by all
 * parameters flattened in `parameters` order. Version 1 stores f32 weights,
 * version 2 stores fp16.
 *
 * @param opts `fp16: true` selects the half-precision payload; default is f32.
 * @returns A new buffer holding header + payload.
 */
exportWeights(opts: { fp16?: boolean } = {}): ArrayBuffer {
  const useFp16 = opts.fp16 ?? false;
  const tensors = this.parameters();
  let weightCount = 0;
  for (const tensor of tensors) weightCount += tensor.data.length;

  // numExperts and topK get distinct slots (an earlier *16 packing collided once
  // numExperts ≥ 16 — e.g. (20,20) and (21,4) both packed to 340).
  const { vocabSize, dModel, numLayers, convKernel, hiddenDim, numExperts, topK } = this.config;
  const header = [
    MAGIC,
    useFp16 ? 2 : 1,
    vocabSize,
    dModel,
    numLayers,
    convKernel,
    hiddenDim,
    numExperts,
    topK,
  ];
  const headerBytes = header.length * 4;
  const bytesPerWeight = useFp16 ? 2 : 4;

  const out = new ArrayBuffer(headerBytes + weightCount * bytesPerWeight);
  new Uint32Array(out, 0, header.length).set(header);

  // Flatten every tensor into one contiguous f32 run first.
  const flat = new Float32Array(weightCount);
  let cursor = 0;
  for (const tensor of tensors) {
    flat.set(tensor.data, cursor);
    cursor += tensor.data.length;
  }

  if (useFp16) {
    new Uint16Array(out, headerBytes, weightCount).set(quantizeFp16(flat));
  } else {
    new Float32Array(out, headerBytes, weightCount).set(flat);
  }
  return out;
}
|
|
439
|
+
|
|
440
|
+
/**
 * Load weights from an "EVL0" binary, overwriting the model's parameters in place.
 *
 * The header (magic, version, and all seven dimensions) and the payload size
 * are validated before any parameter is touched, so a rejected checkpoint
 * leaves the model unchanged.
 *
 * @param buffer Checkpoint in the nine-word-header "EVL0" layout (version 1 = f32, 2 = fp16).
 * @throws Error if the buffer is too small for the header, the magic is wrong,
 *   the version is unsupported, the stored dimensions differ from this
 *   model's config, or the payload is shorter than the parameter count requires.
 */
loadWeights(buffer: ArrayBuffer): void {
  const headerEls = 9;
  const headerBytes = headerEls * 4;
  if (buffer.byteLength < headerBytes) {
    throw new Error("EvermindLM.loadWeights: buffer too small for an EVL0 header");
  }
  const head = new Uint32Array(buffer, 0, headerEls);
  if (head[0] !== MAGIC) throw new Error("EvermindLM.loadWeights: bad magic (not an EVL0 checkpoint)");
  const version = head[1]!;
  // Any other version would otherwise be read as f32 by default.
  if (version !== 1 && version !== 2) {
    throw new Error(`EvermindLM.loadWeights: unsupported checkpoint version ${version}`);
  }
  if (
    head[2] !== this.config.vocabSize ||
    head[3] !== this.config.dModel ||
    head[4] !== this.config.numLayers ||
    head[5] !== this.config.convKernel ||
    head[6] !== this.config.hiddenDim ||
    head[7] !== this.config.numExperts ||
    head[8] !== this.config.topK
  ) {
    throw new Error("EvermindLM.loadWeights: config mismatch with checkpoint");
  }

  const params = this.parameters();
  const total = params.reduce((n, p) => n + p.data.length, 0);
  const requiredBytes = headerBytes + total * (version === 2 ? 2 : 4);
  // A truncated payload must fail loudly; clamped slice/subarray reads would
  // otherwise load only the leading tensors and leave the rest untouched.
  if (buffer.byteLength < requiredBytes) {
    throw new Error(
      `EvermindLM.loadWeights: truncated checkpoint (need ${requiredBytes} bytes, got ${buffer.byteLength})`,
    );
  }

  // headerBytes (36) is a multiple of both 2 and 4, so typed-array views can
  // sit directly on the payload; set() below copies out of the view.
  const flat =
    version === 2
      ? dequantizeFp16(new Uint16Array(buffer, headerBytes, total))
      : new Float32Array(buffer, headerBytes, total);
  let o = 0;
  for (const p of params) {
    p.data.set(flat.subarray(o, o + p.data.length));
    o += p.data.length;
  }
}
|
|
469
|
+
}
|
|
470
|
+
|
|
471
|
+
/**
 * Minimal sequence trainer: one AdamW step per training sequence, minimising
 * next-token cross-entropy.
 */
export class EvermindLMTrainer {
  private readonly adam: AdamW;

  /**
   * @param model The model to optimise.
   * @param opts AdamW options, plus `epochs` (number of passes over the data; default 1).
   */
  constructor(
    private readonly model: EvermindLM,
    private readonly opts: AdamWOptions & { epochs?: number } = {},
  ) {
    this.adam = new AdamW(model, opts);
  }

  /**
   * Train on a set of token sequences.
   *
   * Sequences with fewer than two tokens carry no next-token target and are
   * skipped; they do not count towards the epoch mean.
   *
   * @param sequences Token-id sequences to train on.
   * @returns One mean loss per epoch (0 for an epoch with no usable sequence).
   */
  fit(sequences: number[][]): number[] {
    const epochCount = this.opts.epochs ?? 1;
    const history: number[] = [];
    for (let epoch = 0; epoch < epochCount; epoch++) {
      let lossSum = 0;
      let used = 0;
      for (const seq of sequences) {
        if (seq.length >= 2) {
          this.model.zeroGrad();
          lossSum += this.model.lossAndBackward(seq);
          this.adam.step();
          used += 1;
        }
      }
      history.push(used > 0 ? lossSum / used : 0);
    }
    return history;
  }
}
|
|
499
|
+
|
|
500
|
+
/** Constant added under the square root so an all-zero input does not divide by zero. */
const RMS_EPS = 1e-5;

/**
 * RMSNorm over one vector: `y[c] = gain[c] · x[c] / r` with
 * `r = sqrt(mean(x²) + RMS_EPS)`.
 *
 * @param x Input vector.
 * @param gain Per-channel scale, same length as `x`.
 * @returns The normalised vector `y` and the denominator `r` (kept for the backward pass).
 */
function rmsNorm(x: Float32Array, gain: Float32Array): { y: Float32Array; r: number } {
  const dim = x.length;
  let sumSquares = 0;
  for (const value of x) sumSquares += value * value;
  const r = Math.sqrt(sumSquares / dim + RMS_EPS);
  const y = x.map((value, c) => (gain[c]! * value) / r);
  return { y, r };
}
|
|
512
|
+
|
|
513
|
+
/**
 * Backward pass of RMSNorm for one vector.
 *
 * With `A = Σ_c dy_c · gain_c · x_c` and D the vector length:
 *   dx_j    = gain_j · dy_j / r − x_j · A / (D · r³)
 *   dgain_c = dy_c · x_c / r
 *
 * @param dy Gradient w.r.t. the normalised output.
 * @param x The input that was normalised in the forward pass.
 * @param r The denominator returned by the forward pass for `x`.
 * @param gain The gain used in the forward pass.
 * @returns Gradients w.r.t. the input (`dx`) and the gain (`dgain`).
 */
function rmsNormBackward(
  dy: Float32Array,
  x: Float32Array,
  r: number,
  gain: Float32Array,
): { dx: Float32Array; dgain: Float32Array } {
  const dim = x.length;

  // Shared projection term that couples every channel through r.
  let projection = 0;
  for (let c = 0; c < dim; c++) projection += dy[c]! * gain[c]! * x[c]!;

  const rCubed = r * r * r;
  const dx = x.map((xc, c) => (gain[c]! * dy[c]!) / r - (xc * projection) / (dim * rCubed));
  const dgain = x.map((xc, c) => (dy[c]! * xc) / r);
  return { dx, dgain };
}
|
|
536
|
+
|
|
537
|
+
/**
 * Index of the largest element. Ties go to the earliest index, and an empty
 * input yields 0.
 */
function argmax(v: Float32Array): number {
  let bestIndex = 0;
  v.forEach((value, i) => {
    // Strict ">" keeps the first of several equal maxima.
    if (value > v[bestIndex]!) bestIndex = i;
  });
  return bestIndex;
}
|
|
542
|
+
|
|
543
|
+
/**
 * Draw one index from softmax(logits / temperature) using a single `rng.next()` draw.
 *
 * The scaled logits are shifted by their maximum before exponentiation so the
 * largest term is exp(0) = 1. The draw is scaled by the unnormalised total, so
 * the weights never need dividing.
 *
 * @param logits Unnormalised scores.
 * @param temperature Divisor applied to every logit; the caller only passes values > 0.
 * @param rng Source of the draw; `next()` is called exactly once.
 * @returns The sampled index; `logits.length - 1` if rounding leaves the
 *   running threshold positive after the last weight.
 */
function sampleTemperature(logits: Float32Array, temperature: number, rng: SeededRng): number {
  const scaled = Array.from(logits, (logit) => logit / temperature);

  let peak = -Infinity;
  for (const s of scaled) {
    if (s > peak) peak = s;
  }

  const weights = new Float32Array(logits.length);
  let total = 0;
  scaled.forEach((s, i) => {
    weights[i] = Math.exp(s - peak);
    total += weights[i]!;
  });

  // Inverse-CDF walk: subtract weights until the threshold is used up.
  let threshold = rng.next() * total;
  for (const [i, w] of weights.entries()) {
    threshold -= w;
    if (threshold <= 0) return i;
  }
  return weights.length - 1;
}
|
package/src/lm/index.ts
ADDED
|
@@ -0,0 +1,6 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* EvermindLM — the generative language model (the runnable "AI").
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
export { EvermindLM, EvermindLMTrainer, DEFAULT_LM_CONFIG, DEFAULT_LM_SEED } from "./evermind_lm.js";
|
|
6
|
+
export type { EvermindLMConfig, LMGenerateOptions, TextCodec } from "./evermind_lm.js";
|
|
@@ -114,7 +114,7 @@ export class AttentionBlock implements SequenceLayer {
|
|
|
114
114
|
}
|
|
115
115
|
|
|
116
116
|
private _initWeights(): void {
|
|
117
|
-
const { dModel,
|
|
117
|
+
const { dModel, hasFfn, ffnMult } = this.config;
|
|
118
118
|
|
|
119
119
|
const randn = (n: number, std = 0.02): Float32Array => gaussianArray(n, std);
|
|
120
120
|
|
package/src/model/mamba_model.ts
CHANGED
|
@@ -375,7 +375,7 @@ export class HybridMambaModel {
|
|
|
375
375
|
const { temperature = 1.0, topK = 50, topP = 0.9 } = samplingOpts;
|
|
376
376
|
const { vocabSize } = this.config;
|
|
377
377
|
|
|
378
|
-
|
|
378
|
+
const ids = [...promptIds];
|
|
379
379
|
|
|
380
380
|
for (let step = 0; step < maxNewTokens; step++) {
|
|
381
381
|
const { logits } = await this.forward(new Uint32Array(ids), 1, ids.length);
|
package/src/moe/index.ts
ADDED
|
@@ -0,0 +1,23 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* Mixture-of-Experts — shared-expert hybrid sparsity for the Evermind generator.
|
|
3
|
+
*/
|
|
4
|
+
|
|
5
|
+
export {
|
|
6
|
+
SharedExpertMoE,
|
|
7
|
+
LoadBalanceAccumulator,
|
|
8
|
+
DEFAULT_MOE_CONFIG,
|
|
9
|
+
DEFAULT_MOE_SEED,
|
|
10
|
+
} from "./moe_model.js";
|
|
11
|
+
export type { MoEConfig, MoEParam, RouteResult } from "./moe_model.js";
|
|
12
|
+
|
|
13
|
+
export { MoETrainer } from "./moe_trainer.js";
|
|
14
|
+
export type { MoESample, MoETrainOptions, MoEEpochResult } from "./moe_trainer.js";
|
|
15
|
+
|
|
16
|
+
export { EvermindModelPackage } from "./moe_package.js";
|
|
17
|
+
export type {
|
|
18
|
+
EvermindModelManifest,
|
|
19
|
+
EvermindModelCard,
|
|
20
|
+
EvermindModelType,
|
|
21
|
+
PackageMeta,
|
|
22
|
+
ValidationResult,
|
|
23
|
+
} from "./moe_package.js";
|