npm - @dniskav/neuron - Versions diffs - 0.2.5 → 0.2.6 - Mend

@dniskav/neuron 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/dist/index.js CHANGED Viewed

@@ -23,6 +23,7 @@ __export(index_exports, {
   Adam: () => Adam,
   AttentionHead: () => AttentionHead,
   BatchNorm: () => BatchNorm,
+  BiasVector: () => BiasVector,
   ClipOptimizer: () => ClipOptimizer,
   ClippedOptimizerFactory: () => ClippedOptimizerFactory,
   Conv1D: () => Conv1D,
@@ -51,6 +52,7 @@ __export(index_exports, {
   crossEntropy: () => crossEntropy,
   crossEntropyDelta: () => crossEntropyDelta,
   crossEntropyDeltaRaw: () => crossEntropyDeltaRaw,
+  defaultOptimizer: () => defaultOptimizer,
   elu: () => elu,
   leakyRelu: () => leakyRelu,
   linear: () => linear,
@@ -201,6 +203,7 @@ function makeElu(alpha = 1) {
 var elu = makeElu(1);
 // src/optimizers.ts
+var defaultOptimizer = () => new SGD();
 var SGD = class {
   step(weight, gradient, lr) {
     return weight + lr * gradient;
@@ -249,7 +252,6 @@ var Adam = class {
 };
 // src/NeuronN.ts
-var defaultOptimizer = () => new SGD();
 var NeuronN = class {
   constructor(nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
     const limit = Math.sqrt(1 / nInputs);
@@ -278,9 +280,8 @@ var NeuronN = class {
 };
 // src/Layer.ts
-var defaultOptimizer2 = () => new SGD();
 var Layer = class {
-  constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer2) {
+  constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
     this.neurons = Array.from(
       { length: nNeurons },
       () => new NeuronN(nInputs, activation, optimizerFactory)
@@ -300,7 +301,7 @@ var Network = class {
   predict(inputs) {
     validateArray(inputs, this.hiddenLayer.neurons[0].weights.length, "Network.predict");
     const hiddenOut = this.hiddenLayer.predict(inputs);
-    return this.outputLayer.predict(hiddenOut)[0];
+    return this.outputLayer.predict(hiddenOut);
   }
   // Trains on a single example. Returns the squared error.
   train(inputs, target, lr) {
@@ -309,22 +310,17 @@ var Network = class {
     validateNumber(lr, "Network.train");
     const hiddenOut = this.hiddenLayer.predict(inputs);
     const prediction = this.outputLayer.predict(hiddenOut)[0];
-    const outputError = target - prediction;
-    const outputDelta = outputError * prediction * (1 - prediction);
     const outputNeuron = this.outputLayer.neurons[0];
+    const outputError = target - prediction;
+    const outputDelta = outputError * outputNeuron.activation.dfn(prediction);
     const hiddenDeltas = this.hiddenLayer.neurons.map((neuron, i) => {
-      const hiddenOut_i = hiddenOut[i];
       const hiddenError = outputDelta * outputNeuron.weights[i];
-      return hiddenError * hiddenOut_i * (1 - hiddenOut_i);
+      return hiddenError * neuron.activation.dfn(hiddenOut[i]);
     });
     this.hiddenLayer.neurons.forEach((neuron, i) => {
-      neuron.weights = neuron.weights.map((w, j) => w + lr * hiddenDeltas[i] * inputs[j]);
-      neuron.bias += lr * hiddenDeltas[i];
+      neuron._update(inputs.map((inp) => hiddenDeltas[i] * inp), hiddenDeltas[i], lr);
     });
-    outputNeuron.weights = outputNeuron.weights.map(
-      (w, i) => w + lr * outputDelta * hiddenOut[i]
-    );
-    outputNeuron.bias += lr * outputDelta;
+    outputNeuron._update(hiddenOut.map((h) => outputDelta * h), outputDelta, lr);
     return outputError * outputError;
   }
   // ── Flat weight serialization ─────────────────────────────────────────────
@@ -394,13 +390,12 @@ var Dropout = class {
 };
 // src/NetworkN.ts
-var defaultOptimizer3 = () => new SGD();
 var NetworkN = class {
   constructor(structure, options = {}) {
     this.structure = structure;
     const nLayers = structure.length - 1;
     const activations = options.activations ?? Array.from({ length: nLayers }, () => sigmoid2);
-    const optimizer = options.optimizer ?? defaultOptimizer3;
+    const optimizer = options.optimizer ?? defaultOptimizer;
     const dropoutRate = options.dropoutRate ?? 0;
     if (activations.length !== nLayers) {
       throw new Error(`Expected ${nLayers} activations, got ${activations.length}`);
@@ -453,73 +448,69 @@ var NetworkN = class {
   train(inputs, targets, lr) {
     validateArray(inputs, this.structure[0], "NetworkN.train");
     validateArray(targets, this.structure[this.structure.length - 1], "NetworkN.train");
-    const act = [inputs];
-    for (let i = 0; i < this.layers.length; i++) {
-      const layerInput = act[act.length - 1];
-      const layerOutput = this.layers[i].predict(layerInput);
-      let current;
-      if (this._shouldResidual(i)) {
-        if (this.structure[i] === this.structure[i + 1]) {
-          current = layerOutput.map((v, j) => v + layerInput[j]);
-        } else {
-          current = [...layerOutput];
-        }
-      } else {
-        current = [...layerOutput];
-      }
-      if (i < this._dropouts.length) {
-        current = this._dropouts[i].forward(current, true);
-      }
-      act.push(current);
-    }
+    const act = this._forwardAll(inputs, true);
     const pred = act[act.length - 1];
     const outAct = this.layers[this.layers.length - 1].neurons[0].activation;
-    let deltas = pred.map((p, i) => (targets[i] - p) * outAct.dfn(p));
-    for (let l = this.layers.length - 1; l >= 0; l--) {
-      const layer = this.layers[l];
-      if (l < this._dropouts.length) {
-        deltas = this._dropouts[l].backward(deltas);
-      }
-      const layerIn = act[l];
-      const prevAct = l > 0 ? this.layers[l - 1].neurons[0].activation : null;
-      const prevDeltas = layerIn.map((out, j) => {
-        const errProp = layer.neurons.reduce((s, n, k) => s + deltas[k] * n.weights[j], 0);
-        return prevAct ? errProp * prevAct.dfn(out) : errProp;
-      });
-      if (this._shouldResidual(l) && this.structure[l] === this.structure[l + 1]) {
-        for (let j = 0; j < prevDeltas.length; j++) {
-          prevDeltas[j] += deltas[j];
-        }
-      }
-      layer.neurons.forEach((n, k) => {
-        n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
-      });
-      deltas = prevDeltas;
-    }
+    const deltas = pred.map((p, i) => (targets[i] - p) * outAct.dfn(p));
+    this._backpropLayers(act, deltas, lr);
     return pred.reduce((s, p, i) => s + (targets[i] - p) ** 2, 0) / pred.length;
   }
   // Backprop with externally provided output-layer deltas.
   // Useful for custom loss functions (e.g. physics-based gradients).
   trainWithDeltas(inputs, outputDeltas, lr) {
+    const act = this._forwardAll(inputs, true);
+    this._backpropLayers(act, outputDeltas, lr);
+  }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: layer 0 (all neurons), layer 1, ..., layer N.
+  getWeights() {
+    for (const d of this._dropouts) d.resetMask();
+    const w = [];
+    for (const layer of this.layers) {
+      for (const n of layer.neurons) {
+        w.push(...n.weights, n.bias);
+      }
+    }
+    return w;
+  }
+  setWeights(weights) {
+    for (const d of this._dropouts) d.resetMask();
+    let idx = 0;
+    for (const layer of this.layers) {
+      for (const n of layer.neurons) {
+        for (let j = 0; j < n.weights.length; j++) n.weights[j] = weights[idx++];
+        n.bias = weights[idx++];
+      }
+    }
+  }
+  // ── Private helpers ──────────────────────────────────────────────────────
+  _shouldResidual(layerIndex) {
+    if (typeof this._residual === "function") return this._residual(layerIndex);
+    return this._residual;
+  }
+  // Forward pass storing activations at every layer boundary.
+  // Used by train(), trainWithDeltas(), and predict() shares the same logic.
+  _forwardAll(inputs, training) {
     const act = [inputs];
     for (let i = 0; i < this.layers.length; i++) {
       const layerInput = act[act.length - 1];
       const layerOutput = this.layers[i].predict(layerInput);
       let current;
-      if (this._shouldResidual(i)) {
-        if (this.structure[i] === this.structure[i + 1]) {
-          current = layerOutput.map((v, j) => v + layerInput[j]);
-        } else {
-          current = [...layerOutput];
-        }
+      if (this._shouldResidual(i) && this.structure[i] === this.structure[i + 1]) {
+        current = layerOutput.map((v, j) => v + layerInput[j]);
       } else {
-        current = [...layerOutput];
+        current = layerOutput;
       }
       if (i < this._dropouts.length) {
-        current = this._dropouts[i].forward(current, true);
+        current = this._dropouts[i].forward(current, training);
       }
       act.push(current);
     }
+    return act;
+  }
+  // Backward pass: updates all layer weights given the pre-computed activations
+  // and the initial output-layer deltas.
+  _backpropLayers(act, outputDeltas, lr) {
     let deltas = outputDeltas;
     for (let l = this.layers.length - 1; l >= 0; l--) {
       const layer = this.layers[l];
@@ -533,9 +524,7 @@ var NetworkN = class {
         return prevAct ? errProp * prevAct.dfn(out) : errProp;
       });
       if (this._shouldResidual(l) && this.structure[l] === this.structure[l + 1]) {
-        for (let j = 0; j < prevDeltas.length; j++) {
-          prevDeltas[j] += deltas[j];
-        }
+        for (let j = 0; j < prevDeltas.length; j++) prevDeltas[j] += deltas[j];
       }
       layer.neurons.forEach((n, k) => {
         n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
@@ -543,33 +532,6 @@ var NetworkN = class {
       deltas = prevDeltas;
     }
   }
-  // ── Flat weight serialization ─────────────────────────────────────────────
-  // Order: layer 0 (all neurons), layer 1, ..., layer N.
-  getWeights() {
-    for (const d of this._dropouts) d.resetMask();
-    const w = [];
-    for (const layer of this.layers) {
-      for (const n of layer.neurons) {
-        w.push(...n.weights, n.bias);
-      }
-    }
-    return w;
-  }
-  setWeights(weights) {
-    for (const d of this._dropouts) d.resetMask();
-    let idx = 0;
-    for (const layer of this.layers) {
-      for (const n of layer.neurons) {
-        for (let j = 0; j < n.weights.length; j++) n.weights[j] = weights[idx++];
-        n.bias = weights[idx++];
-      }
-    }
-  }
-  // ── Helper ───────────────────────────────────────────────────────────────
-  _shouldResidual(layerIndex) {
-    if (typeof this._residual === "function") return this._residual(layerIndex);
-    return this._residual;
-  }
 };
 // src/LSTMLayer.ts
@@ -584,7 +546,7 @@ var Gate = class {
   // shape: [hSize]
   constructor(inputSize, hSize, initBias = 0) {
     const n = inputSize + hSize;
-    const limit = Math.sqrt(2 / n);
+    const limit = Math.sqrt(2 / (n + hSize));
     this.W = Array.from(
       { length: hSize },
       () => Array.from({ length: n }, () => (Math.random() * 2 - 1) * limit)
@@ -783,7 +745,6 @@ var LSTMLayer = class {
 };
 // src/NetworkLSTM.ts
-var defaultOptimizer4 = () => new SGD();
 var NetworkLSTM = class {
   // [T][layer+1][neuron]
   constructor(inputSize, hiddenSize, denseStructure, options = {}) {
@@ -791,7 +752,7 @@ var NetworkLSTM = class {
     this.hiddenSize = hiddenSize;
     this.lstm = new LSTMLayer(inputSize, hiddenSize);
     const activation = options.denseActivation ?? sigmoid2;
-    const optimizer = options.optimizer ?? defaultOptimizer4;
+    const optimizer = options.optimizer ?? defaultOptimizer;
     this.denseLayers = [];
     const sizes = [hiddenSize, ...denseStructure];
     for (let i = 1; i < sizes.length; i++) {
@@ -978,6 +939,22 @@ var WeightMatrix = class {
       for (let j = 0; j < this.W[i].length; j++) this.W[i][j] = weights[idx++];
   }
 };
+var BiasVector = class {
+  constructor(size) {
+    this.values = new Array(size).fill(0);
+    this.opts = Array.from({ length: size }, () => new Adam());
+  }
+  update(grad, lr) {
+    for (let i = 0; i < this.values.length; i++)
+      this.values[i] = this.opts[i].step(this.values[i], grad[i], lr);
+  }
+  getWeights() {
+    return [...this.values];
+  }
+  setWeights(weights) {
+    for (let i = 0; i < this.values.length; i++) this.values[i] = weights[i];
+  }
+};
 var EmbeddingMatrix = class {
   constructor(vocabSize, d_model) {
     const limit = Math.sqrt(1 / d_model);
@@ -1063,6 +1040,7 @@ var AttentionHead = class {
   //   5. dWq  = dQ^T @ X,      dWk = dK^T @ X,  dWv = dV^T @ X
   //   6. dX   = dQ @ Wq  +  dK @ Wk  +  dV @ Wv
   backward(dOut, lr) {
+    if (!this.cache) throw new Error("AttentionHead.backward() called before predict()");
     const { X, Q, K, V, attn } = this.cache;
     const seqLen = X.length;
     const d_model = X[0].length;
@@ -1190,6 +1168,7 @@ var MultiHeadAttention = class {
   // ── Backward ──────────────────────────────────────────────────────────────
   // dOut: seqLen × d_model  →  dX: seqLen × d_model
   backward(dOut, lr) {
+    if (!this._concat) throw new Error("MultiHeadAttention.backward() called before predict()");
     const seqLen = dOut.length;
     const concatD = this.nHeads * this.d_k;
     const d_model = this.d_model;
@@ -1294,11 +1273,12 @@ var LayerNorm = class {
   backwardOne(dOut, pos, lr) {
     const { x_norm, std } = this._cache[pos];
     const N = dOut.length;
+    const gammaOld = this.gamma.slice();
     for (let i = 0; i < N; i++) {
       this.gamma[i] += lr * dOut[i] * x_norm[i];
       this.beta[i] += lr * dOut[i];
     }
-    const D = dOut.map((d, i) => d * this.gamma[i]);
+    const D = dOut.map((d, i) => d * gammaOld[i]);
     const mD = D.reduce((s, v) => s + v, 0) / N;
     const mDxn = D.reduce((s, d, i) => s + d * x_norm[i], 0) / N;
     return D.map((d, i) => (d - mD - x_norm[i] * mDxn) / std);
@@ -1318,6 +1298,7 @@ var LayerNorm = class {
 // src/TransformerBlock.ts
 var TransformerBlock = class {
   constructor({ d_model, nHeads, d_ff, causal = false }) {
+    // d_model
     // Forward caches (needed for backprop)
     this._X = null;
     this._attnOut = null;
@@ -1334,10 +1315,8 @@ var TransformerBlock = class {
     this.norm2 = new LayerNorm(d_model);
     this.ff1 = new WeightMatrix(d_ff, d_model);
     this.ff2 = new WeightMatrix(d_model, d_ff);
-    this.b1 = new Array(d_ff).fill(0);
-    this.b2 = new Array(d_model).fill(0);
-    this.b1Opts = Array.from({ length: d_ff }, () => new Adam());
-    this.b2Opts = Array.from({ length: d_model }, () => new Adam());
+    this.b1 = new BiasVector(d_ff);
+    this.b2 = new BiasVector(d_model);
   }
   // ── Forward ───────────────────────────────────────────────────────────────
   // X: seqLen × d_model  →  out: seqLen × d_model
@@ -1350,11 +1329,11 @@ var TransformerBlock = class {
       return this.norm1.predictOne(added, i);
     });
     const ff1Pre = h1.map(
-      (h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1[k]))
+      (h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1.values[k]))
     );
     const ff1Out = ff1Pre.map((pre) => pre.map((v) => Math.max(0, v)));
     const ff2Out = ff1Out.map(
-      (h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2[k]))
+      (h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2.values[k]))
     );
     this.norm2.resetCache(seqLen);
     const out = h1.map((h, i) => {
@@ -1372,6 +1351,9 @@ var TransformerBlock = class {
   // ── Backward ──────────────────────────────────────────────────────────────
   // dOut: seqLen × d_model  →  dX: seqLen × d_model
   backward(dOut, lr) {
+    if (!this._h1 || !this._ff1Out || !this._ff1Pre) {
+      throw new Error("TransformerBlock.backward() called before predict()");
+    }
     const seqLen = dOut.length;
     const d_model = this.d_model;
     const h1 = this._h1;
@@ -1396,8 +1378,7 @@ var TransformerBlock = class {
       (_, m) => dAdded2.reduce((s, da) => s + da[m], 0)
     );
     this.ff2.update(dW2, lr);
-    for (let m = 0; m < d_model; m++)
-      this.b2[m] = this.b2Opts[m].step(this.b2[m], db2[m], lr);
+    this.b2.update(db2, lr);
     const dFf1Pre = dFf1Out.map(
       (d, i) => d.map((v, k) => ff1Pre[i][k] > 0 ? v : 0)
     );
@@ -1419,8 +1400,7 @@ var TransformerBlock = class {
       (_, k) => dFf1Pre.reduce((s, dp) => s + dp[k], 0)
     );
     this.ff1.update(dW1, lr);
-    for (let k = 0; k < this.d_ff; k++)
-      this.b1[k] = this.b1Opts[k].step(this.b1[k], db1[k], lr);
+    this.b1.update(db1, lr);
     const dH1 = Array.from(
       { length: seqLen },
       (_, i) => dH1_fromFf[i].map((v, m) => v + dAdded2[i][m])
@@ -1449,9 +1429,9 @@ var TransformerBlock = class {
     w.push(...this.attn.getWeights());
     w.push(...this.norm1.gamma, ...this.norm1.beta);
     for (const row of this.ff1.W) w.push(...row);
-    w.push(...this.b1);
+    w.push(...this.b1.values);
     for (const row of this.ff2.W) w.push(...row);
-    w.push(...this.b2);
+    w.push(...this.b2.values);
     w.push(...this.norm2.gamma, ...this.norm2.beta);
     return w;
   }
@@ -1460,16 +1440,17 @@ var TransformerBlock = class {
     const attnLen = this.attn.getWeights().length;
     this.attn.setWeights(weights.slice(idx, idx + attnLen));
     idx += attnLen;
-    for (let i = 0; i < this.norm1.gamma.length; i++) this.norm1.gamma[i] = weights[idx++];
-    for (let i = 0; i < this.norm1.beta.length; i++) this.norm1.beta[i] = weights[idx++];
-    for (let i = 0; i < this.ff1.W.length; i++)
-      for (let j = 0; j < this.ff1.W[i].length; j++) this.ff1.W[i][j] = weights[idx++];
-    for (let i = 0; i < this.b1.length; i++) this.b1[i] = weights[idx++];
-    for (let i = 0; i < this.ff2.W.length; i++)
-      for (let j = 0; j < this.ff2.W[i].length; j++) this.ff2.W[i][j] = weights[idx++];
-    for (let i = 0; i < this.b2.length; i++) this.b2[i] = weights[idx++];
-    for (let i = 0; i < this.norm2.gamma.length; i++) this.norm2.gamma[i] = weights[idx++];
-    for (let i = 0; i < this.norm2.beta.length; i++) this.norm2.beta[i] = weights[idx++];
+    this.norm1.setWeights(weights.slice(idx, idx + this.norm1.getWeights().length));
+    idx += this.norm1.getWeights().length;
+    this.ff1.setWeights(weights.slice(idx, idx + this.ff1.getWeights().length));
+    idx += this.ff1.getWeights().length;
+    this.b1.setWeights(weights.slice(idx, idx + this.b1.values.length));
+    idx += this.b1.values.length;
+    this.ff2.setWeights(weights.slice(idx, idx + this.ff2.getWeights().length));
+    idx += this.ff2.getWeights().length;
+    this.b2.setWeights(weights.slice(idx, idx + this.b2.values.length));
+    idx += this.b2.values.length;
+    this.norm2.setWeights(weights.slice(idx, idx + this.norm2.getWeights().length));
   }
 };
@@ -1495,8 +1476,7 @@ var NetworkTransformer = class {
       () => new TransformerBlock({ d_model, nHeads, d_ff })
     );
     this.outputProj = new WeightMatrix(nClasses, d_model);
-    this.outputBias = new Array(nClasses).fill(0);
-    this.outBiasOpts = Array.from({ length: nClasses }, () => new Adam());
+    this.outputBias = new BiasVector(nClasses);
   }
   // ── Forward pass ──────────────────────────────────────────────────────────
   // tokens: seqLen integer ids  →  seqLen * nClasses logits (flattened)
@@ -1504,7 +1484,7 @@ var NetworkTransformer = class {
     const h = this._forward(tokens);
     return h.flatMap(
       (hi) => this.outputProj.W.map(
-        (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
+        (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
       )
     );
   }
@@ -1518,7 +1498,7 @@ var NetworkTransformer = class {
     const h = this._forward(tokens);
     const logits = h.map(
       (hi) => this.outputProj.W.map(
-        (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
+        (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
       )
     );
     let loss = 0;
@@ -1553,8 +1533,7 @@ var NetworkTransformer = class {
       (_, c) => dLogits.reduce((s, dl) => s + dl[c], 0)
     );
     this.outputProj.update(dWout, lr);
-    for (let c = 0; c < this.nClasses; c++)
-      this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
+    this.outputBias.update(dBout, lr);
     let dX = dH;
     for (let b = this.blocks.length - 1; b >= 0; b--)
       dX = this.blocks[b].backward(dX, lr);
@@ -1573,27 +1552,30 @@ var NetworkTransformer = class {
   // Order: tokenEmb, posEmb, block0, block1, ..., blockN, outputProj, outputBias.
   getWeights() {
     const w = [];
-    for (const row of this.tokenEmb.W) w.push(...row);
-    for (const row of this.posEmb.W) w.push(...row);
+    w.push(...this.tokenEmb.getWeights());
+    w.push(...this.posEmb.getWeights());
     for (const block of this.blocks) w.push(...block.getWeights());
-    for (const row of this.outputProj.W) w.push(...row);
-    w.push(...this.outputBias);
+    w.push(...this.outputProj.getWeights());
+    w.push(...this.outputBias.getWeights());
     return w;
   }
   setWeights(weights) {
     let idx = 0;
-    for (let i = 0; i < this.tokenEmb.W.length; i++)
-      for (let j = 0; j < this.tokenEmb.W[i].length; j++) this.tokenEmb.W[i][j] = weights[idx++];
-    for (let i = 0; i < this.posEmb.W.length; i++)
-      for (let j = 0; j < this.posEmb.W[i].length; j++) this.posEmb.W[i][j] = weights[idx++];
+    const tokenEmbLen = this.tokenEmb.getWeights().length;
+    this.tokenEmb.setWeights(weights.slice(idx, idx + tokenEmbLen));
+    idx += tokenEmbLen;
+    const posEmbLen = this.posEmb.getWeights().length;
+    this.posEmb.setWeights(weights.slice(idx, idx + posEmbLen));
+    idx += posEmbLen;
     for (const block of this.blocks) {
       const blockLen = block.getWeights().length;
       block.setWeights(weights.slice(idx, idx + blockLen));
       idx += blockLen;
     }
-    for (let i = 0; i < this.outputProj.W.length; i++)
-      for (let j = 0; j < this.outputProj.W[i].length; j++) this.outputProj.W[i][j] = weights[idx++];
-    for (let i = 0; i < this.outputBias.length; i++) this.outputBias[i] = weights[idx++];
+    const outProjLen = this.outputProj.getWeights().length;
+    this.outputProj.setWeights(weights.slice(idx, idx + outProjLen));
+    idx += outProjLen;
+    this.outputBias.setWeights(weights.slice(idx, idx + this.outputBias.values.length));
   }
   // ── Internal ──────────────────────────────────────────────────────────────
   // Shared embedding + block forward pass.
@@ -1635,8 +1617,7 @@ var NetworkTransformerRL = class {
       () => new TransformerBlock({ d_model, nHeads, d_ff, causal: true })
     );
     this.outputProj = new WeightMatrix(nActions, d_model);
-    this.outputBias = new Array(nActions).fill(0);
-    this.outBiasOpts = Array.from({ length: nActions }, () => new Adam());
+    this.outputBias = new BiasVector(nActions);
   }
   // ── Forward ────────────────────────────────────────────────────────────────
   // sequence: seqLen × inputDim → nActions Q-values
@@ -1644,7 +1625,7 @@ var NetworkTransformerRL = class {
     const h = this._forward(sequence);
     const pooled = this._pool(h);
     return this.outputProj.W.map(
-      (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
+      (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
     );
   }
   // ── Training ────────────────────────────────────────────────────────────────
@@ -1656,7 +1637,7 @@ var NetworkTransformerRL = class {
     const h = this._forward(sequence);
     const pooled = this._pool(h);
     const pred = this.outputProj.W.map(
-      (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
+      (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
     );
     const n = this.nActions;
     let loss = 0;
@@ -1679,8 +1660,7 @@ var NetworkTransformerRL = class {
     );
     const dBout = dPred.slice();
     this.outputProj.update(dWout, lr);
-    for (let c = 0; c < this.nActions; c++)
-      this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
+    this.outputBias.update(dBout, lr);
     let dH = this._distributePoolGradient(dPooled);
     for (let b = this.blocks.length - 1; b >= 0; b--)
       dH = this.blocks[b].backward(dH, lr);
@@ -1704,24 +1684,26 @@ var NetworkTransformerRL = class {
   // Order: inputProj, block0, block1, ..., blockN, outputProj, outputBias.
   getWeightsFlat() {
     const w = [];
-    for (const row of this.inputProj.W) w.push(...row);
+    w.push(...this.inputProj.getWeights());
     for (const block of this.blocks) w.push(...block.getWeights());
-    for (const row of this.outputProj.W) w.push(...row);
-    w.push(...this.outputBias);
+    w.push(...this.outputProj.getWeights());
+    w.push(...this.outputBias.getWeights());
     return w;
   }
   setWeightsFlat(weights) {
     let idx = 0;
-    for (let i = 0; i < this.inputProj.W.length; i++)
-      for (let j = 0; j < this.inputProj.W[i].length; j++) this.inputProj.W[i][j] = weights[idx++];
+    const inputProjLen = this.inputProj.getWeights().length;
+    this.inputProj.setWeights(weights.slice(idx, idx + inputProjLen));
+    idx += inputProjLen;
     for (const block of this.blocks) {
       const blockLen = block.getWeights().length;
       block.setWeights(weights.slice(idx, idx + blockLen));
       idx += blockLen;
     }
-    for (let i = 0; i < this.outputProj.W.length; i++)
-      for (let j = 0; j < this.outputProj.W[i].length; j++) this.outputProj.W[i][j] = weights[idx++];
-    for (let i = 0; i < this.outputBias.length; i++) this.outputBias[i] = weights[idx++];
+    const outProjLen = this.outputProj.getWeights().length;
+    this.outputProj.setWeights(weights.slice(idx, idx + outProjLen));
+    idx += outProjLen;
+    this.outputBias.setWeights(weights.slice(idx, idx + this.outputBias.values.length));
   }
   getWeightsStructured() {
     return {
@@ -1739,17 +1721,15 @@ var NetworkTransformerRL = class {
         norm2: { gamma: [...b.norm2.gamma], beta: [...b.norm2.beta] },
         ff1: b.ff1.W.map((r) => [...r]),
         ff2: b.ff2.W.map((r) => [...r]),
-        b1: [...b.b1],
-        b2: [...b.b2]
+        b1: [...b.b1.values],
+        b2: [...b.b2.values]
       })),
       outputProj: this.outputProj.W.map((r) => [...r]),
-      outputBias: [...this.outputBias]
+      outputBias: [...this.outputBias.values]
     };
   }
   setWeightsStructured(data) {
-    data.inputProj.forEach((row, i) => {
-      this.inputProj.W[i] = [...row];
-    });
+    this.inputProj.setWeights(data.inputProj.flat());
     data.blocks.forEach((bd, b) => {
       const blk = this.blocks[b];
       bd.attn.heads.forEach((hd, h) => {
@@ -1764,11 +1744,11 @@ var NetworkTransformerRL = class {
       blk.norm2.beta = [...bd.norm2.beta];
       blk.ff1.W = bd.ff1.map((r) => [...r]);
       blk.ff2.W = bd.ff2.map((r) => [...r]);
-      blk.b1 = [...bd.b1];
-      blk.b2 = [...bd.b2];
+      blk.b1.setWeights(bd.b1);
+      blk.b2.setWeights(bd.b2);
     });
     this.outputProj.W = data.outputProj.map((r) => [...r]);
-    this.outputBias = [...data.outputBias];
+    this.outputBias.setWeights(data.outputBias);
   }
   // ── Serializable interface (flat array) ────────────────────────────────────
   // These satisfy the Serializable interface from ModelSaver, which requires
@@ -1927,7 +1907,7 @@ function tanhFn(x) {
 var Gate2 = class {
   constructor(inputSize, hSize, initBias = 0) {
     const n = inputSize + hSize;
-    const limit = Math.sqrt(2 / n);
+    const limit = Math.sqrt(2 / (n + hSize));
     this.W = Array.from(
       { length: hSize },
       () => Array.from({ length: n }, () => (Math.random() * 2 - 1) * limit)
@@ -2699,6 +2679,7 @@ var ModelSaver = class _ModelSaver {
   Adam,
   AttentionHead,
   BatchNorm,
+  BiasVector,
   ClipOptimizer,
   ClippedOptimizerFactory,
   Conv1D,
@@ -2727,6 +2708,7 @@ var ModelSaver = class _ModelSaver {
   crossEntropy,
   crossEntropyDelta,
   crossEntropyDeltaRaw,
+  defaultOptimizer,
   elu,
   leakyRelu,
   linear,