@seanhogg/builderforce-memory-engine 2026.6.20 → 2026.6.28
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/index.d.ts +9 -0
- package/dist/index.d.ts.map +1 -1
- package/dist/index.js +8 -0
- package/dist/index.js.map +1 -1
- package/dist/kernels/limbic_affect.d.ts +2 -0
- package/dist/kernels/limbic_affect.d.ts.map +1 -0
- package/dist/kernels/limbic_affect.js +74 -0
- package/dist/kernels/limbic_affect.js.map +1 -0
- package/dist/limbic/index.d.ts +14 -0
- package/dist/limbic/index.d.ts.map +1 -0
- package/dist/limbic/index.js +11 -0
- package/dist/limbic/index.js.map +1 -0
- package/dist/limbic/limbic_model.d.ts +111 -0
- package/dist/limbic/limbic_model.d.ts.map +1 -0
- package/dist/limbic/limbic_model.js +299 -0
- package/dist/limbic/limbic_model.js.map +1 -0
- package/dist/limbic/limbic_trainer.d.ts +62 -0
- package/dist/limbic/limbic_trainer.d.ts.map +1 -0
- package/dist/limbic/limbic_trainer.js +172 -0
- package/dist/limbic/limbic_trainer.js.map +1 -0
- package/dist/limbic/regions.d.ts +79 -0
- package/dist/limbic/regions.d.ts.map +1 -0
- package/dist/limbic/regions.js +132 -0
- package/dist/limbic/regions.js.map +1 -0
- package/dist/lm/evermind_lm.d.ts +148 -0
- package/dist/lm/evermind_lm.d.ts.map +1 -0
- package/dist/lm/evermind_lm.js +479 -0
- package/dist/lm/evermind_lm.js.map +1 -0
- package/dist/lm/index.d.ts +6 -0
- package/dist/lm/index.d.ts.map +1 -0
- package/dist/lm/index.js +5 -0
- package/dist/lm/index.js.map +1 -0
- package/dist/model/attention_block.js +1 -1
- package/dist/model/attention_block.js.map +1 -1
- package/dist/model/mamba_model.js +1 -1
- package/dist/model/mamba_model.js.map +1 -1
- package/dist/moe/index.d.ts +10 -0
- package/dist/moe/index.d.ts.map +1 -0
- package/dist/moe/index.js +7 -0
- package/dist/moe/index.js.map +1 -0
- package/dist/moe/moe_model.d.ts +134 -0
- package/dist/moe/moe_model.d.ts.map +1 -0
- package/dist/moe/moe_model.js +415 -0
- package/dist/moe/moe_model.js.map +1 -0
- package/dist/moe/moe_package.d.ts +81 -0
- package/dist/moe/moe_package.d.ts.map +1 -0
- package/dist/moe/moe_package.js +157 -0
- package/dist/moe/moe_package.js.map +1 -0
- package/dist/moe/moe_trainer.d.ts +53 -0
- package/dist/moe/moe_trainer.d.ts.map +1 -0
- package/dist/moe/moe_trainer.js +93 -0
- package/dist/moe/moe_trainer.js.map +1 -0
- package/dist/optim/adamw.d.ts +32 -0
- package/dist/optim/adamw.d.ts.map +1 -0
- package/dist/optim/adamw.js +52 -0
- package/dist/optim/adamw.js.map +1 -0
- package/package.json +1 -1
- package/src/index.ts +59 -0
- package/src/kernels/limbic_affect.ts +74 -0
- package/src/limbic/index.ts +28 -0
- package/src/limbic/limbic_model.ts +373 -0
- package/src/limbic/limbic_trainer.ts +253 -0
- package/src/limbic/regions.ts +141 -0
- package/src/lm/evermind_lm.ts +558 -0
- package/src/lm/index.ts +6 -0
- package/src/model/attention_block.ts +1 -1
- package/src/model/mamba_model.ts +1 -1
- package/src/moe/index.ts +23 -0
- package/src/moe/moe_model.ts +475 -0
- package/src/moe/moe_package.ts +205 -0
- package/src/moe/moe_trainer.ts +134 -0
- package/src/optim/adamw.ts +72 -0
|
@@ -0,0 +1,141 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* regions.ts – Limbic-system region map and affective state schema.
|
|
3
|
+
*
|
|
4
|
+
* The limbic model is the *dynamic* affective/motivational layer that rides on
|
|
5
|
+
* top of the (static) psychometric personality. Where the hippocampus
|
|
6
|
+
* (HybridMambaModel + MemoryStore) holds *what the agent knows*, the limbic
|
|
7
|
+
* model holds *how the agent currently feels and what it is driven toward* —
|
|
8
|
+
* and learns the dynamics of that, in WebGPU, from experience.
|
|
9
|
+
*
|
|
10
|
+
* This file is the single source of truth for the affective state vector that
|
|
11
|
+
* every other limbic module (model, trainer, runtime service) indexes into.
|
|
12
|
+
* Keep the dimension ids in sync with the runtime compiler in
|
|
13
|
+
* `agent-runtime/src/builderforce/limbic.ts` — the two are coupled solely by
|
|
14
|
+
* these string ids and the {@link LIMBIC_DIM} indices.
|
|
15
|
+
*
|
|
16
|
+
* Region → state mapping (mirrors the labelled diagram):
|
|
17
|
+
* • Amygdala → salience / threat appraisal → drives valence + arousal
|
|
18
|
+
* • Hypothalamus → homeostatic drives (curiosity, caution, effort, social)
|
|
19
|
+
* • Thalamus → attention gate (how much incoming signal is admitted)
|
|
20
|
+
* • Basal ganglia → action selection bias (explore vs. exploit)
|
|
21
|
+
* • Hippocampus → reused (existing SSM memory); feeds the experience input
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
/**
 * String ids for the limbic regions this module refers to. Five ids are
 * listed; the hippocampus entry is reused (the existing memory subsystem),
 * not re-modelled by the limbic layer.
 *
 * Note that the `basalGanglia` key maps to the snake_case id "basal_ganglia".
 */
export const REGION = {
  amygdala: "amygdala",
  hypothalamus: "hypothalamus",
  thalamus: "thalamus",
  basalGanglia: "basal_ganglia",
  hippocampus: "hippocampus",
} as const;
/** Union of the string ids held in {@link REGION}. */
export type Region = (typeof REGION)[keyof typeof REGION];
|
|
33
|
+
|
|
34
|
+
/**
 * Canonical indices into the affective state vector, a dense Float32Array of
 * length {@link LIMBIC_STATE_DIM}. Indices 0–1 (valence, arousal) are the 2D
 * core-affect summary; indices 2–7 are the per-region drives the agent's
 * behaviour is modulated by.
 */
export const LIMBIC_DIM = {
  /** Core affect — pleasantness. Range [-1, +1] (negative .. positive). */
  valence: 0,
  /** Core affect — activation. Range [0, 1] (calm .. activated). */
  arousal: 1,
  /** Hypothalamus drive — appetite for novelty/exploration. Range [0, 1]. */
  driveCuriosity: 2,
  /** Hypothalamus drive — appetite for safety/guardrails. Range [0, 1]. */
  driveCaution: 3,
  /** Hypothalamus drive — available energy. Range [0, 1] (fatigued .. fresh). */
  driveEffort: 4,
  /** Hypothalamus drive — appetite for communication/collaboration. Range [0, 1]. */
  driveSocial: 5,
  /** Thalamus — attention gain on incoming signal. Range [0, 1]. */
  attention: 6,
  /** Basal ganglia — explore(1) vs. exploit(0) action-selection bias. Range [0, 1]. */
  exploration: 7,
} as const;
/** Name of any dimension of the affective state vector (a key of {@link LIMBIC_DIM}). */
export type LimbicDimName = keyof typeof LIMBIC_DIM;
|
|
59
|
+
|
|
60
|
+
/**
 * Length of the affective state vector. Maintained by hand: it must equal the
 * number of entries in {@link LIMBIC_DIM} (currently indices 0..7).
 */
export const LIMBIC_STATE_DIM = 8;

/**
 * Ordered dim names, index-aligned with {@link LIMBIC_DIM}: the name at array
 * position i is the key whose LIMBIC_DIM value is i. The alignment is kept
 * manually, so update this list together with LIMBIC_DIM.
 */
export const LIMBIC_DIM_NAMES: LimbicDimName[] = [
  "valence",
  "arousal",
  "driveCuriosity",
  "driveCaution",
  "driveEffort",
  "driveSocial",
  "attention",
  "exploration",
];
|
|
74
|
+
|
|
75
|
+
/**
 * Inclusive [min, max] bounds per state dim, index-aligned with the state
 * vector. Valence is the only signed dim ([-1, 1]); every other dim is [0, 1].
 */
export const LIMBIC_BOUNDS: ReadonlyArray<readonly [number, number]> = [
  [-1, 1], // valence
  [0, 1], // arousal
  [0, 1], // driveCuriosity
  [0, 1], // driveCaution
  [0, 1], // driveEffort
  [0, 1], // driveSocial
  [0, 1], // attention
  [0, 1], // exploration
];
|
|
86
|
+
|
|
87
|
+
/**
 * Neutral resting state — the default homeostatic setpoint before personality
 * pulls it anywhere. Neutral valence (0), calm (low arousal), mid-range
 * curiosity / caution / social drives, high available effort, fairly high
 * attention, and a centred explore-vs-exploit bias.
 */
export const NEUTRAL_STATE: ReadonlyArray<number> = [
  0.0, // valence
  0.2, // arousal
  0.5, // driveCuriosity
  0.5, // driveCaution
  0.8, // driveEffort
  0.5, // driveSocial
  0.7, // attention
  0.5, // exploration (centred → resting state is behaviourally inert)
];
|
|
102
|
+
|
|
103
|
+
/**
 * Clamp one state dimension into its inclusive bounds.
 *
 * @param index Position in the state vector; selects the bounds pair.
 * @param value Candidate value for that dimension.
 * @returns `value` unchanged when `index` has no bounds entry; the lower bound
 *   when `value` is NaN; otherwise `value` limited to [min, max].
 */
export function clampDim(index: number, value: number): number {
  const bounds = LIMBIC_BOUNDS[index];
  if (bounds === undefined) return value;
  const [lo, hi] = bounds;
  // NaN cannot be ordered, so it collapses to the lower bound.
  return Number.isNaN(value) ? lo : Math.max(lo, Math.min(hi, value));
}
|
|
110
|
+
|
|
111
|
+
/**
 * Clamp every known dimension of a state vector, mutating it in place.
 * Entries beyond {@link LIMBIC_STATE_DIM} are left untouched.
 *
 * @param state Vector to clamp (modified).
 * @returns The same `state` instance, for chaining.
 */
export function clampState(state: Float32Array): Float32Array {
  const limit = Math.min(state.length, LIMBIC_STATE_DIM);
  for (let dim = 0; dim < limit; dim += 1) {
    state[dim] = clampDim(dim, state[dim]!);
  }
  return state;
}
|
|
118
|
+
|
|
119
|
+
/**
 * Allocate a new state vector initialised to {@link NEUTRAL_STATE}.
 * Each call returns an independent copy that the caller may mutate.
 */
export function neutralState(): Float32Array {
  return new Float32Array(NEUTRAL_STATE);
}
|
|
123
|
+
|
|
124
|
+
/**
 * Convert a dense state vector into a name → value record (for logging /
 * transport). Dimensions missing from a short `state` are reported as 0.
 *
 * @param state Dense vector indexed like {@link LIMBIC_DIM_NAMES}.
 * @returns A record with one entry per dimension name.
 */
export function stateToRecord(state: ArrayLike<number>): Record<LimbicDimName, number> {
  const labelled = {} as Record<LimbicDimName, number>;
  LIMBIC_DIM_NAMES.forEach((name, dim) => {
    labelled[name] = state[dim] ?? 0;
  });
  return labelled;
}
|
|
132
|
+
|
|
133
|
+
/**
 * Convert a (possibly partial) name → value record into a dense state vector.
 * Starts from the neutral state; each entry that is a non-NaN number
 * overrides its dimension after being clamped to that dimension's bounds.
 * Missing, non-numeric and NaN entries keep the neutral value.
 *
 * @param rec Labelled values to apply.
 * @returns A freshly allocated state vector.
 */
export function recordToState(rec: Partial<Record<LimbicDimName, number>>): Float32Array {
  const vec = neutralState();
  LIMBIC_DIM_NAMES.forEach((name, dim) => {
    const supplied = rec[name];
    if (typeof supplied !== "number" || Number.isNaN(supplied)) return;
    vec[dim] = clampDim(dim, supplied);
  });
  return vec;
}
|
|
@@ -0,0 +1,558 @@
|
|
|
1
|
+
/**
|
|
2
|
+
* evermind_lm.ts — EvermindLM: a small but complete generative language model.
|
|
3
|
+
*
|
|
4
|
+
* This is what turns a trained checkpoint into an *AI that generates text* (the
|
|
5
|
+
* thing a marketplace buyer actually runs). Architecture (Mamba-flavoured, the
|
|
6
|
+
* minimal exact-gradient CPU reference):
|
|
7
|
+
*
|
|
8
|
+
* x_t = Embed[token_t]
|
|
9
|
+
* per layer:
|
|
10
|
+
* x_t += DepthwiseCausalConv(RMSNorm(x))_t // temporal mixing (pre-norm, short conv)
|
|
11
|
+
* x_t += SharedExpertMoE(RMSNorm(x_t)) // per-position channel mixing (pre-norm, sparse)
|
|
12
|
+
* logits_t = x_t · Embedᵀ // tied output head
|
|
13
|
+
*
|
|
14
|
+
* The token mixer is a depthwise causal convolution (each channel sees a short
|
|
15
|
+
* window of its own past — Mamba's pre-conv) and the channel mixer is the
|
|
16
|
+
* shared-expert MoE, so the model is genuinely sparse. Embeddings are tied
|
|
17
|
+
* (input lookup == output head), which the gradient code accounts for.
|
|
18
|
+
*
|
|
19
|
+
* Pure CPU, exact forward + backward (finite-difference checked), reusing the
|
|
20
|
+
* engine's MoE, cross-entropy, and AdamW. The WGSL/WebGPU path is a future
|
|
21
|
+
* acceleration with the same shapes.
|
|
22
|
+
*/
|
|
23
|
+
|
|
24
|
+
import { SharedExpertMoE } from "../moe/moe_model.js";
|
|
25
|
+
import { crossEntropyLoss, crossEntropyGrad } from "../training/autograd.js";
|
|
26
|
+
import { AdamW, type AdamWOptions } from "../optim/adamw.js";
|
|
27
|
+
import { SeededRng } from "../utils/rng.js";
|
|
28
|
+
import { quantizeFp16, dequantizeFp16 } from "../utils/quantization.js";
|
|
29
|
+
|
|
30
|
+
/**
 * Hyper-parameters accepted by the `EvermindLM` constructor. Only `vocabSize`
 * is mandatory; every other field falls back to the documented default when
 * omitted (the constructor resolves them with `??`).
 */
export interface EvermindLMConfig {
  /** Vocabulary size. Required; the constructor rejects values ≤ 0. */
  vocabSize: number;
  /** Model (channel) dimension. Default 64. */
  dModel?: number;
  /** Number of (conv + MoE) blocks. Default 2. */
  numLayers?: number;
  /** Causal conv kernel width. Default 3. */
  convKernel?: number;
  /** Hidden width of each MoE expert FFN. Default 2·dModel. */
  hiddenDim?: number;
  /** Routed experts per MoE layer. Default 4. */
  numExperts?: number;
  /** Experts activated per token. Default 2. Must be ≤ numExperts. */
  topK?: number;
  /** Deterministic init seed. */
  seed?: number;
}
|
|
48
|
+
|
|
49
|
+
/**
 * Default hyper-parameters (everything except `vocabSize` and `seed`).
 * `hiddenDim` here is 2 × the default `dModel`; note that the EvermindLM
 * constructor derives its hiddenDim fallback from the *resolved* dModel
 * (`dModel * 2`) rather than reading this field.
 */
export const DEFAULT_LM_CONFIG: Required<Omit<EvermindLMConfig, "seed" | "vocabSize">> = {
  dModel: 64,
  numLayers: 2,
  convKernel: 3,
  hiddenDim: 128,
  numExperts: 4,
  topK: 2,
};

/** Seed used for weight init when the config does not supply one (ASCII "EVLM"). */
export const DEFAULT_LM_SEED = 0x45564c4d; // "EVLM"
/** First uint32 of a serialised checkpoint; checked by `loadWeights`. */
const MAGIC = 0x45564c30; // "EVL0"
|
|
60
|
+
|
|
61
|
+
/**
 * Local structural view of the per-position cache returned by
 * `SharedExpertMoE.forward`. EvermindLM only stores these objects and hands
 * them back to `SharedExpertMoE.backward`; it never reads the fields itself.
 * NOTE(review): field meanings presumably mirror the MoE implementation in
 * ../moe/moe_model — confirm there before relying on them.
 */
interface MoECacheLike {
  x: Float32Array;
  route: { experts: number[]; gates: number[]; probs: Float32Array };
  sharedPre: Float32Array;
  sharedH: Float32Array;
  expertOut: Float32Array[];
  expertPre: Float32Array[];
  expertH: Float32Array[];
}
|
|
70
|
+
|
|
71
|
+
/**
 * Activations recorded for one layer during `EvermindLM.forward`, kept so that
 * `lossAndBackward` can backpropagate through it. Every array is indexed by
 * sequence position t.
 */
interface LayerCache {
  layerIn: Float32Array[]; // residual base for the conv sub-block (the layer input)
  normedConv: Float32Array[]; // RMSNorm(layerIn) — the conv input
  rmsConv: number[]; // per-position RMS denom for the conv norm
  afterConv: Float32Array[]; // residual base for the MoE sub-block
  rmsMoe: number[]; // per-position RMS denom for the MoE norm
  moeCache: MoECacheLike[]; // per position
}
|
|
79
|
+
|
|
80
|
+
/**
 * Everything `EvermindLM.forward` records for a later backward pass.
 * `tokens` is the caller's input array stored by reference (not copied).
 */
interface ForwardCache {
  tokens: number[];
  layers: LayerCache[];
  finalX: Float32Array[]; // per position, fed to the tied head
}
|
|
85
|
+
|
|
86
|
+
/**
 * A tokenizer the LM can read/write text through (the engine's `BPETokenizer`
 * fits). Used only by `EvermindLM.generateText`, which encodes the prompt and
 * decodes the generated ids.
 */
export interface TextCodec {
  /** Convert text into token ids. */
  encode(text: string): number[];
  /** Convert token ids back into text. */
  decode(ids: number[]): string;
}
|
|
91
|
+
|
|
92
|
+
/** Options controlling `EvermindLM.generate` / `generateText`. */
export interface LMGenerateOptions {
  /** Upper bound on the number of tokens produced. */
  maxNewTokens: number;
  /** Sampling temperature; ≤0 ⇒ greedy argmax. Default 0 (greedy). */
  temperature?: number;
  /** Deterministic sampler seed (only used when temperature > 0). Defaults to 1. */
  seed?: number;
  /** Stop generating when this token id is produced (the stop token is included in the output). */
  stopToken?: number;
}
|
|
101
|
+
|
|
102
|
+
/**
 * EvermindLM — a tied-embedding language model made of `numLayers` blocks, each
 * a pre-norm depthwise causal conv with residual followed by a pre-norm
 * shared-expert MoE with residual. Forward and backward are hand-written on
 * the CPU; gradients accumulate into private buffers exposed via
 * {@link gradients}.
 */
export class EvermindLM {
  /** Resolved hyper-parameters (defaults applied; the seed is not retained). */
  readonly config: Required<Omit<EvermindLMConfig, "seed">>;

  /** Tied token embedding / output head: vocabSize × dModel (row-major). */
  emb: Float32Array;
  private gEmb: Float32Array;
  /** Per-layer depthwise causal conv kernels: dModel × convKernel. */
  private readonly conv: Float32Array[];
  private readonly gConv: Float32Array[];
  /** Per-layer pre-conv / pre-MoE RMSNorm gains (dModel each). */
  private readonly nConv: Float32Array[];
  private readonly gNConv: Float32Array[];
  private readonly nMoe: Float32Array[];
  private readonly gNMoe: Float32Array[];
  /** Per-layer channel mixer. */
  private readonly moe: SharedExpertMoE[];

  /**
   * Build a model with deterministically initialised weights.
   *
   * @param config Hyper-parameters; omitted fields take their defaults.
   * @throws Error if `vocabSize` ≤ 0 or `topK` > `numExperts`.
   */
  constructor(config: EvermindLMConfig) {
    const dModel = config.dModel ?? DEFAULT_LM_CONFIG.dModel;
    const cfg: Required<Omit<EvermindLMConfig, "seed">> = {
      vocabSize: config.vocabSize,
      dModel,
      numLayers: config.numLayers ?? DEFAULT_LM_CONFIG.numLayers,
      convKernel: config.convKernel ?? DEFAULT_LM_CONFIG.convKernel,
      hiddenDim: config.hiddenDim ?? dModel * 2,
      numExperts: config.numExperts ?? DEFAULT_LM_CONFIG.numExperts,
      topK: config.topK ?? DEFAULT_LM_CONFIG.topK,
    };
    if (cfg.vocabSize <= 0) throw new Error("EvermindLM: vocabSize must be > 0");
    if (cfg.topK > cfg.numExperts) throw new Error("EvermindLM: topK must be ≤ numExperts");
    this.config = cfg;

    // Coerce to uint32; a resulting 0 is replaced by 1.
    const seed = (config.seed ?? DEFAULT_LM_SEED) >>> 0 || 1;
    const rng = new SeededRng(seed);
    // Box–Muller (cosine branch only): two uniform draws per Gaussian sample.
    const gauss = (n: number, std: number): Float32Array => {
      const a = new Float32Array(n);
      for (let i = 0; i < n; i++) {
        const u1 = Math.max(rng.next(), 1e-12); // keep log(u1) finite
        const u2 = rng.next();
        a[i] = std * Math.sqrt(-2 * Math.log(u1)) * Math.cos(2 * Math.PI * u2);
      }
      return a;
    };

    this.emb = gauss(cfg.vocabSize * cfg.dModel, 0.02);
    this.gEmb = new Float32Array(this.emb.length);
    this.conv = [];
    this.gConv = [];
    this.nConv = [];
    this.gNConv = [];
    this.nMoe = [];
    this.gNMoe = [];
    this.moe = [];
    for (let l = 0; l < cfg.numLayers; l++) {
      // Conv init near an identity passthrough (current tap ≈ 1, history ≈ 0) so
      // an untrained block is close to a residual no-op.
      const k = new Float32Array(cfg.dModel * cfg.convKernel);
      for (let c = 0; c < cfg.dModel; c++) k[c * cfg.convKernel] = 1;
      this.conv.push(k);
      this.gConv.push(new Float32Array(k.length));
      // RMSNorm gains start at 1 (identity scale).
      this.nConv.push(new Float32Array(cfg.dModel).fill(1));
      this.gNConv.push(new Float32Array(cfg.dModel));
      this.nMoe.push(new Float32Array(cfg.dModel).fill(1));
      this.gNMoe.push(new Float32Array(cfg.dModel));
      // Each MoE layer gets a distinct seed for varied expert init.
      this.moe.push(
        new SharedExpertMoE({
          modelDim: cfg.dModel,
          hiddenDim: cfg.hiddenDim,
          numExperts: cfg.numExperts,
          topK: cfg.topK,
          seed: seed + 1 + l,
        }),
      );
    }
  }

  // ── Forward ────────────────────────────────────────────────────────────────

  /**
   * Run the model over a token sequence; returns per-position logits + a cache.
   *
   * @param tokens Token ids, each an integer in [0, vocabSize).
   * @returns `logits[t]` (length vocabSize) for every position t, and the
   *   activations needed by {@link lossAndBackward}.
   * @throws RangeError if any token id is not an integer in [0, vocabSize);
   *   such ids would otherwise read past the embedding table and yield NaN.
   */
  forward(tokens: number[]): { logits: Float32Array[]; cache: ForwardCache } {
    const { dModel, convKernel, numLayers, vocabSize } = this.config;
    const T = tokens.length;

    // Reject ids that do not address a row of the embedding table.
    for (let t = 0; t < T; t++) {
      const tok = tokens[t]!;
      if (!Number.isInteger(tok) || tok < 0 || tok >= vocabSize) {
        throw new RangeError(
          `EvermindLM.forward: token id ${tok} at position ${t} is outside [0, ${vocabSize})`,
        );
      }
    }

    // Embed.
    let x: Float32Array[] = tokens.map((tok) => {
      const row = new Float32Array(dModel);
      const off = tok * dModel;
      for (let c = 0; c < dModel; c++) row[c] = this.emb[off + c]!;
      return row;
    });

    const layers: LayerCache[] = [];
    for (let l = 0; l < numLayers; l++) {
      const layerIn = x;
      const ker = this.conv[l]!;
      const nConv = this.nConv[l]!;
      const nMoe = this.nMoe[l]!;

      // Pre-norm → depthwise causal conv → residual.
      const normedConv: Float32Array[] = [];
      const rmsConv: number[] = [];
      for (let t = 0; t < T; t++) {
        const { y, r } = rmsNorm(layerIn[t]!, nConv);
        normedConv.push(y);
        rmsConv.push(r);
      }
      const afterConv: Float32Array[] = [];
      for (let t = 0; t < T; t++) {
        const out = Float32Array.from(layerIn[t]!); // residual base
        for (let c = 0; c < dModel; c++) {
          let acc = 0;
          // Tap j looks j positions back; positions before the start are skipped.
          for (let j = 0; j < convKernel; j++) {
            const ti = t - j;
            if (ti >= 0) acc += ker[c * convKernel + j]! * normedConv[ti]![c]!;
          }
          out[c] = out[c]! + acc;
        }
        afterConv.push(out);
      }

      // Pre-norm → MoE channel mixer → residual.
      const rmsMoe: number[] = [];
      const moeCache: MoECacheLike[] = [];
      const afterMoe: Float32Array[] = [];
      for (let t = 0; t < T; t++) {
        const { y, r } = rmsNorm(afterConv[t]!, nMoe);
        rmsMoe.push(r);
        const out = Float32Array.from(afterConv[t]!); // residual base
        const mr = this.moe[l]!.forward(y);
        for (let c = 0; c < dModel; c++) out[c] = out[c]! + mr.output[c]!;
        afterMoe.push(out);
        moeCache.push(mr.cache as unknown as MoECacheLike);
      }

      layers.push({ layerIn, normedConv, rmsConv, afterConv, rmsMoe, moeCache });
      x = afterMoe;
    }

    // Tied head: logits_t[v] = x_t · emb[v].
    const logits: Float32Array[] = x.map((xt) => {
      const lg = new Float32Array(vocabSize);
      for (let v = 0; v < vocabSize; v++) {
        let acc = 0;
        const off = v * dModel;
        for (let c = 0; c < dModel; c++) acc += xt[c]! * this.emb[off + c]!;
        lg[v] = acc;
      }
      return lg;
    });

    return { logits, cache: { tokens, layers, finalX: x } };
  }

  // ── Loss + backward ──────────────────────────────────────────────────────────

  /**
   * Next-token cross-entropy over the sequence (predict tokens[t+1] from
   * position t), accumulating exact gradients. Returns the mean loss. Call
   * {@link zeroGrad} before and an optimiser step after.
   *
   * @param tokens Training sequence; sequences shorter than 2 return 0 and
   *   leave the gradient buffers untouched.
   * @returns Mean cross-entropy over the T−1 predicted positions.
   * @throws RangeError (from {@link forward}) on an out-of-range token id.
   */
  lossAndBackward(tokens: number[]): number {
    const { dModel, convKernel, numLayers, vocabSize } = this.config;
    const T = tokens.length;
    if (T < 2) return 0;
    const { logits, cache } = this.forward(tokens);

    const predPositions = T - 1; // positions 0..T-2 predict the next token
    const inv = 1 / predPositions;

    // dL/d(finalX_t) and head gradient into the tied embedding.
    const dX: Float32Array[] = Array.from({ length: T }, () => new Float32Array(dModel));
    let loss = 0;
    for (let t = 0; t < predPositions; t++) {
      const target = tokens[t + 1]!;
      loss += crossEntropyLoss(logits[t]!, target) * inv;
      const dLogit = crossEntropyGrad(logits[t]!, target); // probs - onehot
      const xt = cache.finalX[t]!;
      for (let v = 0; v < vocabSize; v++) {
        const g = dLogit[v]! * inv;
        if (g === 0) continue;
        const off = v * dModel;
        for (let c = 0; c < dModel; c++) {
          this.gEmb[off + c] = this.gEmb[off + c]! + g * xt[c]!; // head → emb
          dX[t]![c] = dX[t]![c]! + g * this.emb[off + c]!; // head → x_t
        }
      }
    }

    // Backprop through layers in reverse.
    for (let l = numLayers - 1; l >= 0; l--) {
      const lc = cache.layers[l]!;
      const ker = this.conv[l]!;
      const gker = this.gConv[l]!;
      const nConv = this.nConv[l]!;
      const gNConv = this.gNConv[l]!;
      const nMoe = this.nMoe[l]!;
      const gNMoe = this.gNMoe[l]!;

      // MoE sub-block: afterMoe = afterConv + MoE(RMSNorm(afterConv, nMoe)).
      const dAfterConv: Float32Array[] = [];
      for (let t = 0; t < T; t++) {
        const dMoeNormed = this.moe[l]!.backward(dX[t]!, lc.moeCache[t] as never);
        const { dx, dgain } = rmsNormBackward(dMoeNormed, lc.afterConv[t]!, lc.rmsMoe[t]!, nMoe);
        for (let c = 0; c < dModel; c++) gNMoe[c] = gNMoe[c]! + dgain[c]!;
        const d = Float32Array.from(dX[t]!); // residual passthrough
        for (let c = 0; c < dModel; c++) d[c] = d[c]! + dx[c]!;
        dAfterConv.push(d);
      }

      // Conv sub-block: afterConv = layerIn + conv(RMSNorm(layerIn, nConv)).
      const dNormedConv: Float32Array[] = Array.from({ length: T }, () => new Float32Array(dModel));
      const dLayerIn: Float32Array[] = dAfterConv.map((v) => Float32Array.from(v)); // residual passthrough
      for (let t = 0; t < T; t++) {
        for (let c = 0; c < dModel; c++) {
          const dmix = dAfterConv[t]![c]!;
          if (dmix === 0) continue;
          for (let j = 0; j < convKernel; j++) {
            const ti = t - j;
            if (ti < 0) continue;
            gker[c * convKernel + j] = gker[c * convKernel + j]! + dmix * lc.normedConv[ti]![c]!;
            dNormedConv[ti]![c] = dNormedConv[ti]![c]! + dmix * ker[c * convKernel + j]!;
          }
        }
      }
      for (let t = 0; t < T; t++) {
        const { dx, dgain } = rmsNormBackward(dNormedConv[t]!, lc.layerIn[t]!, lc.rmsConv[t]!, nConv);
        for (let c = 0; c < dModel; c++) {
          gNConv[c] = gNConv[c]! + dgain[c]!;
          dLayerIn[t]![c] = dLayerIn[t]![c]! + dx[c]!;
        }
      }
      for (let t = 0; t < T; t++) dX[t] = dLayerIn[t]!;
    }

    // Embedding lookup: dX at layer-0 input flows into the row for token_t.
    for (let t = 0; t < T; t++) {
      const off = tokens[t]! * dModel;
      for (let c = 0; c < dModel; c++) this.gEmb[off + c] = this.gEmb[off + c]! + dX[t]![c]!;
    }

    return loss;
  }

  // ── Generation ───────────────────────────────────────────────────────────────

  /**
   * Text-level generation: encode the prompt, generate, decode. `codec` is any
   * tokenizer exposing encode/decode (the engine's `BPETokenizer` satisfies it),
   * so the LM consumes and emits real text rather than raw token ids. The model's
   * `vocabSize` must match the codec's vocabulary.
   *
   * @returns The decoded text of the newly generated tokens only (the prompt
   *   is not echoed).
   */
  generateText(prompt: string, codec: TextCodec, opts: LMGenerateOptions): string {
    return codec.decode(this.generate(codec.encode(prompt), opts));
  }

  /**
   * Greedy / temperature-sampled autoregressive generation. Returns NEW token ids.
   *
   * Each step re-runs {@link forward} over the whole sequence so far. An empty
   * prompt is seeded with token 0 for the forward pass. When `stopToken` is
   * produced it is included in the result and generation ends.
   */
  generate(prompt: number[], opts: LMGenerateOptions): number[] {
    const temperature = opts.temperature ?? 0;
    const rng = temperature > 0 ? new SeededRng((opts.seed ?? 1) >>> 0 || 1) : null;
    const tokens = [...prompt];
    const produced: number[] = [];
    for (let n = 0; n < opts.maxNewTokens; n++) {
      const { logits } = this.forward(tokens.length > 0 ? tokens : [0]);
      const last = logits[logits.length - 1]!;
      const next = rng ? sampleTemperature(last, temperature, rng) : argmax(last);
      produced.push(next);
      tokens.push(next);
      if (opts.stopToken !== undefined && next === opts.stopToken) break;
    }
    return produced;
  }

  // ── Parameters / checkpoint ──────────────────────────────────────────────────

  /**
   * All trainable parameters as {data} (AdamW-compatible), canonical order:
   * embedding, then per layer conv kernel, conv-norm gain, MoE-norm gain and
   * the MoE's own parameters. The arrays are the live buffers, not copies.
   */
  parameters(): { data: Float32Array }[] {
    const out: { data: Float32Array }[] = [{ data: this.emb }];
    for (let l = 0; l < this.config.numLayers; l++) {
      out.push({ data: this.conv[l]! }, { data: this.nConv[l]! }, { data: this.nMoe[l]! });
      for (const p of this.moe[l]!.parameters()) out.push({ data: p.data });
    }
    return out;
  }

  /** Gradient buffers, index-aligned with {@link parameters}. */
  gradients(): { data: Float32Array }[] {
    const out: { data: Float32Array }[] = [{ data: this.gEmb }];
    for (let l = 0; l < this.config.numLayers; l++) {
      out.push({ data: this.gConv[l]! }, { data: this.gNConv[l]! }, { data: this.gNMoe[l]! });
      for (const g of this.moe[l]!.gradients()) out.push({ data: g.data });
    }
    return out;
  }

  /** Reset every gradient buffer (including each MoE layer's) to zero. */
  zeroGrad(): void {
    this.gEmb.fill(0);
    for (let l = 0; l < this.config.numLayers; l++) {
      this.gConv[l]!.fill(0);
      this.gNConv[l]!.fill(0);
      this.gNMoe[l]!.fill(0);
      this.moe[l]!.zeroGrad();
    }
  }

  /**
   * Serialise to an "EVL0" binary (fp16 or f32), params in {@link parameters} order.
   *
   * Layout: nine uint32 header words followed by the flattened parameters.
   * Header word 1 is the version: 1 = float32 payload, 2 = fp16 payload.
   */
  exportWeights(opts: { fp16?: boolean } = {}): ArrayBuffer {
    const fp16 = opts.fp16 ?? false;
    const params = this.parameters();
    const total = params.reduce((n, p) => n + p.data.length, 0);
    // magic, version, vocab, dModel, numLayers, convKernel, hiddenDim, numExperts, topK.
    // numExperts and topK get distinct slots (an earlier *16 packing collided once
    // numExperts ≥ 16 — e.g. (20,20) and (21,4) both packed to 340).
    const headerEls = 9;
    const headerBytes = headerEls * 4;
    const buf = new ArrayBuffer(headerBytes + (fp16 ? total * 2 : total * 4));
    const head = new Uint32Array(buf, 0, headerEls);
    head[0] = MAGIC;
    head[1] = fp16 ? 2 : 1;
    head[2] = this.config.vocabSize;
    head[3] = this.config.dModel;
    head[4] = this.config.numLayers;
    head[5] = this.config.convKernel;
    head[6] = this.config.hiddenDim;
    head[7] = this.config.numExperts;
    head[8] = this.config.topK;
    const flat = new Float32Array(total);
    let o = 0;
    for (const p of params) {
      flat.set(p.data, o);
      o += p.data.length;
    }
    if (fp16) new Uint16Array(buf, headerBytes, total).set(quantizeFp16(flat));
    else new Float32Array(buf, headerBytes, total).set(flat);
    return buf;
  }

  /**
   * Load weights from an "EVL0" binary. Validates magic, version, dims and
   * payload size before any parameter is overwritten, so a rejected buffer
   * leaves the model unchanged.
   *
   * @param buffer Checkpoint bytes as produced by {@link exportWeights}.
   *   Trailing bytes beyond the expected payload are ignored.
   * @throws Error if the buffer is shorter than the header, the magic or
   *   version is unrecognised, the header dims differ from this model's
   *   config, or the payload is truncated.
   */
  loadWeights(buffer: ArrayBuffer): void {
    const headerEls = 9;
    const headerBytes = headerEls * 4;
    if (buffer.byteLength < headerBytes) {
      throw new Error("EvermindLM.loadWeights: buffer too small to hold an EVL0 header");
    }
    const head = new Uint32Array(buffer, 0, headerEls);
    if (head[0] !== MAGIC) throw new Error("EvermindLM.loadWeights: bad magic (not an EVL0 checkpoint)");
    const version = head[1]!;
    // Only the two encodings written by exportWeights are understood.
    if (version !== 1 && version !== 2) {
      throw new Error(`EvermindLM.loadWeights: unsupported checkpoint version ${version}`);
    }
    if (
      head[2] !== this.config.vocabSize ||
      head[3] !== this.config.dModel ||
      head[4] !== this.config.numLayers ||
      head[5] !== this.config.convKernel ||
      head[6] !== this.config.hiddenDim ||
      head[7] !== this.config.numExperts ||
      head[8] !== this.config.topK
    ) {
      throw new Error("EvermindLM.loadWeights: config mismatch with checkpoint");
    }
    const params = this.parameters();
    const total = params.reduce((n, p) => n + p.data.length, 0);
    const payloadBytes = total * (version === 2 ? 2 : 4);
    // A short payload would otherwise overwrite only a prefix of the weights.
    if (buffer.byteLength < headerBytes + payloadBytes) {
      throw new Error("EvermindLM.loadWeights: checkpoint is truncated (payload shorter than the model's parameters)");
    }
    const flat =
      version === 2
        ? dequantizeFp16(new Uint16Array(buffer, headerBytes, total))
        : new Float32Array(buffer.slice(headerBytes, headerBytes + payloadBytes));
    let o = 0;
    for (const p of params) {
      p.data.set(flat.subarray(o, o + p.data.length));
      o += p.data.length;
    }
  }
}
|
|
470
|
+
|
|
471
|
+
/**
 * Minimal sequence trainer: one AdamW step per training sequence on the
 * next-token cross-entropy reported by the model.
 */
export class EvermindLMTrainer {
  private readonly adam: AdamW;

  /**
   * @param model Model whose parameters are optimised in place.
   * @param opts AdamW options plus an optional `epochs` count (default 1).
   */
  constructor(
    private readonly model: EvermindLM,
    private readonly opts: AdamWOptions & { epochs?: number } = {},
  ) {
    this.adam = new AdamW(model, opts);
  }

  /**
   * Train on a set of token sequences; returns per-epoch mean loss.
   * Sequences with fewer than two tokens are skipped. An epoch in which no
   * sequence was usable reports a loss of 0.
   */
  fit(sequences: number[][]): number[] {
    const epochCount = this.opts.epochs ?? 1;
    const meanLosses: number[] = [];
    for (let epoch = 0; epoch < epochCount; epoch += 1) {
      meanLosses.push(this.runEpoch(sequences));
    }
    return meanLosses;
  }

  /** One pass over `sequences`: zero grads, backprop, optimiser step — per sequence. */
  private runEpoch(sequences: number[][]): number {
    let lossSum = 0;
    let steps = 0;
    for (const sequence of sequences) {
      if (sequence.length >= 2) {
        this.model.zeroGrad();
        lossSum += this.model.lossAndBackward(sequence);
        this.adam.step();
        steps += 1;
      }
    }
    return steps === 0 ? 0 : lossSum / steps;
  }
}
|
|
499
|
+
|
|
500
|
+
/** Stabiliser added to the mean square before the square root. */
const RMS_EPS = 1e-5;

/**
 * RMSNorm over one vector: y[c] = gain[c]·x[c] / r with
 * r = sqrt(mean(x²) + RMS_EPS).
 *
 * @param x Input vector.
 * @param gain Per-channel scale, indexed like `x`.
 * @returns The normalised vector `y` (new array) and the denominator `r`,
 *   which the backward pass reuses.
 */
function rmsNorm(x: Float32Array, gain: Float32Array): { y: Float32Array; r: number } {
  let sumSquares = 0;
  x.forEach((value) => {
    sumSquares += value * value;
  });
  const denom = Math.sqrt(sumSquares / x.length + RMS_EPS);
  const normed = x.map((value, c) => (gain[c]! * value) / denom);
  return { y: normed, r: denom };
}
|
|
512
|
+
|
|
513
|
+
/**
 * Backward pass of RMSNorm for one vector.
 *
 * With A = Σ_c dy_c·gain_c·x_c and D = x.length:
 *   dx_j    = gain_j·dy_j / r − x_j·A / (D·r³)
 *   dgain_c = dy_c·x_c / r
 *
 * @param dy Upstream gradient dL/dy.
 * @param x Input that was normalised in the forward pass.
 * @param r Denominator returned by the forward pass for this `x`.
 * @param gain Per-channel scale used in the forward pass.
 * @returns Newly allocated dL/dx and dL/dgain.
 */
function rmsNormBackward(
  dy: Float32Array,
  x: Float32Array,
  r: number,
  gain: Float32Array,
): { dx: Float32Array; dgain: Float32Array } {
  let weightedDot = 0;
  x.forEach((xc, c) => {
    weightedDot += dy[c]! * gain[c]! * xc;
  });
  const cubeScale = x.length * (r * r * r);
  const dx = x.map((xc, c) => (gain[c]! * dy[c]!) / r - (xc * weightedDot) / cubeScale);
  const dgain = x.map((xc, c) => (dy[c]! * xc) / r);
  return { dx, dgain };
}
|
|
536
|
+
|
|
537
|
+
/**
 * Index of the largest element. Ties resolve to the earliest index; an empty
 * input yields 0. Comparisons against NaN are false, so a NaN never displaces
 * the current winner.
 */
function argmax(v: Float32Array): number {
  let winner = 0;
  v.forEach((candidate, idx) => {
    if (idx > 0 && candidate > v[winner]!) winner = idx;
  });
  return winner;
}
|
|
542
|
+
|
|
543
|
+
/**
 * Draw one index from softmax(logits / temperature).
 *
 * The scaled logits are shifted by their maximum before exponentiation, the
 * unnormalised weights are stored as float32, and a single `rng.next()` draw
 * scaled by their sum is walked down through the weights to pick the index.
 * Falls back to the last index if the walk never reaches zero.
 *
 * @param logits Unnormalised scores.
 * @param temperature Divisor applied to every logit (callers pass > 0).
 * @param rng Source of the uniform draw; exactly one value is consumed.
 */
function sampleTemperature(logits: Float32Array, temperature: number, rng: SeededRng): number {
  const scaled = Array.from(logits, (logit) => logit / temperature);
  let peak = -Infinity;
  for (const s of scaled) {
    if (s > peak) peak = s;
  }
  const weights = new Float32Array(scaled.length);
  let mass = 0;
  scaled.forEach((s, i) => {
    weights[i] = Math.exp(s - peak);
    mass += weights[i]!;
  });
  let remaining = rng.next() * mass;
  for (let i = 0; i < weights.length; i += 1) {
    remaining -= weights[i]!;
    if (remaining <= 0) return i;
  }
  return weights.length - 1;
}
|