npm - @dniskav/neuron - Versions diffs - 0.2.3 → 0.2.6 - Mend

@dniskav/neuron 0.2.3 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (6) hide show

package/dist/index.mjs CHANGED Viewed

@@ -1,3 +1,71 @@
+// src/Validation.ts
+// Throws unless `arr` is an array of exactly `expectedLength` finite numbers.
+// `methodName` prefixes every error message so the failing caller is identifiable.
+function validateArray(arr, expectedLength, methodName) {
+  if (!Array.isArray(arr)) {
+    throw new Error(`${methodName}: expected array, got ${typeof arr}`);
+  }
+  const actualLength = arr.length;
+  if (actualLength !== expectedLength) {
+    throw new Error(`${methodName}: expected array of length ${expectedLength}, got ${actualLength}`);
+  }
+  // Number.isFinite never coerces, so it rejects non-numbers as well as NaN/±Infinity.
+  const badIndex = arr.findIndex((value) => !Number.isFinite(value));
+  if (badIndex !== -1) {
+    throw new Error(`${methodName}: invalid value at index ${badIndex}: ${arr[badIndex]}`);
+  }
+}
+// Throws unless `arr` is an array holding at least `minLength` elements, all of
+// them finite numbers. `methodName` prefixes every error message.
+function validateArrayMinLength(arr, minLength, methodName) {
+  if (!Array.isArray(arr)) {
+    throw new Error(`${methodName}: expected array, got ${typeof arr}`);
+  }
+  const { length } = arr;
+  if (length < minLength) {
+    throw new Error(`${methodName}: expected array of at least length ${minLength}, got ${length}`);
+  }
+  // entries() also yields holes (as undefined), matching an index-based scan.
+  for (const [index, value] of arr.entries()) {
+    if (!Number.isFinite(value)) {
+      throw new Error(`${methodName}: invalid value at index ${index}: ${value}`);
+    }
+  }
+}
+// Throws unless `arr` is an `expectedRows` x `expectedCols` array of arrays whose
+// entries are all finite numbers. Rows are checked in order, so the error
+// names the first offending row or cell.
+function validate2DArray(arr, expectedRows, expectedCols, methodName) {
+  if (!Array.isArray(arr)) {
+    throw new Error(`${methodName}: expected 2D array, got ${typeof arr}`);
+  }
+  if (arr.length !== expectedRows) {
+    throw new Error(`${methodName}: expected ${expectedRows} rows, got ${arr.length}`);
+  }
+  for (const [r, row] of arr.entries()) {
+    if (!Array.isArray(row)) {
+      throw new Error(`${methodName}: row ${r} is not an array`);
+    }
+    if (row.length !== expectedCols) {
+      throw new Error(`${methodName}: row ${r} expected ${expectedCols} cols, got ${row.length}`);
+    }
+    // Number.isFinite rejects non-numbers as well as NaN/±Infinity.
+    const c = row.findIndex((cell) => !Number.isFinite(cell));
+    if (c !== -1) {
+      throw new Error(`${methodName}: invalid value at [${r}][${c}]: ${row[c]}`);
+    }
+  }
+}
+// Throws unless `value` is a finite number (rejects NaN, ±Infinity, and every
+// non-number). `methodName` prefixes the error message.
+function validateNumber(value, methodName) {
+  if (Number.isFinite(value)) return;
+  throw new Error(`${methodName}: expected finite number, got ${value}`);
+}
 // src/Neuron.ts
 function sigmoid(x) {
   return 1 / (1 + Math.exp(-x));
@@ -8,13 +76,18 @@ var Neuron = class {
     this.bias = Math.random() * 0.1;
   }
   predict(input) {
+    validateNumber(input, "Neuron.predict");
     return sigmoid(input * this.weight + this.bias);
   }
   train(input, target, lr) {
+    validateNumber(input, "Neuron.train");
+    validateNumber(target, "Neuron.train");
+    validateNumber(lr, "Neuron.train");
     const prediction = this.predict(input);
     const error = target - prediction;
-    this.weight += lr * error * input;
-    this.bias += lr * error;
+    const grad = error * prediction * (1 - prediction);
+    this.weight += lr * grad * input;
+    this.bias += lr * grad;
   }
 };
@@ -54,6 +127,7 @@ function makeElu(alpha = 1) {
 var elu = makeElu(1);
 // src/optimizers.ts
+var defaultOptimizer = () => new SGD();
 var SGD = class {
   step(weight, gradient, lr) {
     return weight + lr * gradient;
@@ -69,6 +143,19 @@ var Momentum = class {
     return weight + this.v;
   }
 };
+// Optimizer wrapper that clamps each gradient to [-clipValue, clipValue]
+// before handing it to the wrapped optimizer's step().
+var ClipOptimizer = class {
+  // inner:     optimizer exposing step(weight, gradient, lr)
+  // clipValue: non-negative bound on the gradient magnitude
+  // Throws if `inner` has no step() or `clipValue` is NaN, negative, or not a number.
+  constructor(inner, clipValue) {
+    if (typeof inner?.step !== "function") {
+      throw new Error("ClipOptimizer: inner optimizer must implement step(weight, gradient, lr)");
+    }
+    // A negative bound inverts the clamp (every gradient becomes -clipValue) and
+    // NaN turns every update into NaN, so reject both up front.
+    // `!(x >= 0)` is used because it is also true for NaN.
+    if (typeof clipValue !== "number" || !(clipValue >= 0)) {
+      throw new Error(`ClipOptimizer: clipValue must be a non-negative number, got ${clipValue}`);
+    }
+    this.inner = inner;
+    this.clipValue = clipValue;
+  }
+  // Returns the inner optimizer's updated weight for the clamped gradient.
+  step(weight, gradient, lr) {
+    const clipped = Math.max(-this.clipValue, Math.min(this.clipValue, gradient));
+    return this.inner.step(weight, clipped, lr);
+  }
+};
+// Wraps an optimizer factory: each call of the returned factory builds a fresh
+// inner optimizer and wraps it in a ClipOptimizer sharing the same clipValue.
+function ClippedOptimizerFactory(innerFactory, clipValue) {
+  return () => {
+    const inner = innerFactory();
+    return new ClipOptimizer(inner, clipValue);
+  };
+}
 var Adam = class {
   constructor(beta1 = 0.9, beta2 = 0.999, epsilon = 1e-8) {
     this.beta1 = beta1;
@@ -89,7 +176,6 @@ var Adam = class {
 };
 // src/NeuronN.ts
-var defaultOptimizer = () => new SGD();
 var NeuronN = class {
   constructor(nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
     const limit = Math.sqrt(1 / nInputs);
@@ -99,6 +185,7 @@ var NeuronN = class {
     this._opts = Array.from({ length: nInputs + 1 }, optimizerFactory);
   }
   predict(inputs) {
+    validateArray(inputs, this.weights.length, "NeuronN.predict");
     const sum = inputs.reduce((acc, e, i) => acc + e * this.weights[i], this.bias);
     return this.activation.fn(sum);
   }
@@ -111,14 +198,14 @@ var NeuronN = class {
   train(inputs, target, lr) {
     const prediction = this.predict(inputs);
     const error = target - prediction;
-    this._update(inputs.map((inp) => error * inp), error, lr);
+    const grad = error * this.activation.dfn(prediction);
+    this._update(inputs.map((inp) => grad * inp), grad, lr);
   }
 };
 // src/Layer.ts
-var defaultOptimizer2 = () => new SGD();
 var Layer = class {
-  constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer2) {
+  constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
     this.neurons = Array.from(
       { length: nNeurons },
       () => new NeuronN(nInputs, activation, optimizerFactory)
@@ -136,84 +223,233 @@ var Network = class {
     this.outputLayer = new Layer(nOutputs, nHidden);
   }
   predict(inputs) {
+    validateArray(inputs, this.hiddenLayer.neurons[0].weights.length, "Network.predict");
     const hiddenOut = this.hiddenLayer.predict(inputs);
-    return this.outputLayer.predict(hiddenOut)[0];
+    return this.outputLayer.predict(hiddenOut);
   }
   // Trains on a single example. Returns the squared error.
   train(inputs, target, lr) {
+    validateArray(inputs, this.hiddenLayer.neurons[0].weights.length, "Network.train");
+    validateNumber(target, "Network.train");
+    validateNumber(lr, "Network.train");
     const hiddenOut = this.hiddenLayer.predict(inputs);
     const prediction = this.outputLayer.predict(hiddenOut)[0];
-    const outputError = target - prediction;
-    const outputDelta = outputError * prediction * (1 - prediction);
     const outputNeuron = this.outputLayer.neurons[0];
-    outputNeuron.weights = outputNeuron.weights.map(
-      (w, i) => w + lr * outputDelta * hiddenOut[i]
-    );
-    outputNeuron.bias += lr * outputDelta;
-    this.hiddenLayer.neurons.forEach((neuron, i) => {
-      const hiddenOut_i = hiddenOut[i];
+    const outputError = target - prediction;
+    const outputDelta = outputError * outputNeuron.activation.dfn(prediction);
+    const hiddenDeltas = this.hiddenLayer.neurons.map((neuron, i) => {
       const hiddenError = outputDelta * outputNeuron.weights[i];
-      const hiddenDelta = hiddenError * hiddenOut_i * (1 - hiddenOut_i);
-      neuron.weights = neuron.weights.map((w, j) => w + lr * hiddenDelta * inputs[j]);
-      neuron.bias += lr * hiddenDelta;
+      return hiddenError * neuron.activation.dfn(hiddenOut[i]);
+    });
+    this.hiddenLayer.neurons.forEach((neuron, i) => {
+      neuron._update(inputs.map((inp) => hiddenDeltas[i] * inp), hiddenDeltas[i], lr);
     });
+    outputNeuron._update(hiddenOut.map((h) => outputDelta * h), outputDelta, lr);
     return outputError * outputError;
   }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: hidden layer (all neurons: weights then bias), then output layer.
+  // Returns a new flat array: for every hidden-layer neuron, then every
+  // output-layer neuron, its weights followed by its bias.
+  getWeights() {
+    const neuronsInOrder = [...this.hiddenLayer.neurons, ...this.outputLayer.neurons];
+    return neuronsInOrder.flatMap((neuron) => [...neuron.weights, neuron.bias]);
+  }
+  // Loads parameters from a flat array laid out as getWeights() produces it:
+  // hidden-layer neurons first, then output-layer neurons, each as weights
+  // followed by bias. Neuron weight arrays are overwritten in place.
+  setWeights(weights) {
+    let cursor = 0;
+    const load = (neuron) => {
+      for (let j = 0; j < neuron.weights.length; j += 1) {
+        neuron.weights[j] = weights[cursor];
+        cursor += 1;
+      }
+      neuron.bias = weights[cursor];
+      cursor += 1;
+    };
+    for (const neuron of this.hiddenLayer.neurons) load(neuron);
+    for (const neuron of this.outputLayer.neurons) load(neuron);
+  }
+};
+// src/Dropout.ts
+// Dropout over plain number arrays. forward() in training mode zeroes each
+// element with probability `rate` and scales the survivors by 1 / (1 - rate);
+// backward() applies the mask recorded by the most recent forward() call.
+var Dropout = class {
+  // rate: drop probability, must lie in [0, 1).
+  // Throws if `rate` is outside that range, including NaN and undefined.
+  constructor(rate) {
+    this._mask = null;
+    // Written as a negated range test because NaN fails every comparison:
+    // `rate < 0 || rate >= 1` would accept it, and forward() would then
+    // silently zero every activation.
+    if (!(rate >= 0 && rate < 1)) {
+      throw new Error(`Dropout rate must be in [0, 1), got ${rate}`);
+    }
+    this.rate = rate;
+  }
+  // ── Forward ───────────────────────────────────────────────────────────────
+  // x: number[]  →  number[]
+  // If training, applies inverted dropout mask.
+  // If not training (or rate is 0), returns a copy of the input and clears the mask.
+  forward(x, training = true) {
+    if (!training || this.rate === 0) {
+      this._mask = null;
+      return [...x];
+    }
+    // Kept elements are scaled up so the expected activation is unchanged.
+    const scale = 1 / (1 - this.rate);
+    this._mask = x.map(() => Math.random() > this.rate ? scale : 0);
+    return x.map((v, i) => v * this._mask[i]);
+  }
+  // ── Backward ──────────────────────────────────────────────────────────────
+  // dOut: number[]  →  number[]
+  // Applies the same mask (gradient is zeroed where activation was zeroed).
+  // With no stored mask, returns a copy of dOut.
+  backward(dOut) {
+    if (!this._mask) return [...dOut];
+    return dOut.map((d, i) => d * this._mask[i]);
+  }
+  // ── Reset mask between forward passes ─────────────────────────────────────
+  resetMask() {
+    this._mask = null;
+  }
+  // ── No trainable params ───────────────────────────────────────────────────
+  getWeights() {
+    return [];
+  }
+  setWeights(_weights) {
+  }
 };
 // src/NetworkN.ts
-var defaultOptimizer3 = () => new SGD();
 var NetworkN = class {
   constructor(structure, options = {}) {
     this.structure = structure;
     const nLayers = structure.length - 1;
     const activations = options.activations ?? Array.from({ length: nLayers }, () => sigmoid2);
-    const optimizer = options.optimizer ?? defaultOptimizer3;
+    const optimizer = options.optimizer ?? defaultOptimizer;
+    const dropoutRate = options.dropoutRate ?? 0;
+    if (activations.length !== nLayers) {
+      throw new Error(`Expected ${nLayers} activations, got ${activations.length}`);
+    }
+    if (dropoutRate < 0 || dropoutRate >= 1) {
+      throw new Error(`Dropout rate must be in [0, 1), got ${dropoutRate}`);
+    }
+    this._residual = options.residual ?? false;
     this.layers = [];
     for (let i = 1; i < structure.length; i++) {
       this.layers.push(new Layer(structure[i], structure[i - 1], activations[i - 1], optimizer));
     }
+    this._dropouts = [];
+    if (dropoutRate > 0) {
+      for (let i = 0; i < nLayers - 1; i++) {
+        this._dropouts.push(new Dropout(dropoutRate));
+      }
+    }
+    const outputLayer = this.layers[this.layers.length - 1];
+    const outputActivation = outputLayer.neurons[0].activation;
+    for (let i = 1; i < outputLayer.neurons.length; i++) {
+      if (outputLayer.neurons[i].activation !== outputActivation) {
+        throw new Error("All output neurons must share the same activation function");
+      }
+    }
   }
-  predict(inputs) {
-    return this.layers.reduce((acc, layer) => layer.predict(acc), inputs);
+  predict(inputs, training = false) {
+    validateArray(inputs, this.structure[0], "NetworkN.predict");
+    let current = [...inputs];
+    for (let i = 0; i < this.layers.length; i++) {
+      const layerInput = [...current];
+      const layerOutput = this.layers[i].predict(current);
+      if (this._shouldResidual(i)) {
+        if (this.structure[i] === this.structure[i + 1]) {
+          current = layerOutput.map((v, j) => v + layerInput[j]);
+        } else {
+          current = [...layerOutput];
+        }
+      } else {
+        current = [...layerOutput];
+      }
+      if (i < this._dropouts.length) {
+        current = this._dropouts[i].forward(current, training);
+      }
+    }
+    return current;
   }
   // Generalized backpropagation across L layers.
   // Returns the mean squared error for the example.
   train(inputs, targets, lr) {
-    const act = [inputs];
-    for (const layer of this.layers) act.push(layer.predict(act[act.length - 1]));
+    validateArray(inputs, this.structure[0], "NetworkN.train");
+    validateArray(targets, this.structure[this.structure.length - 1], "NetworkN.train");
+    const act = this._forwardAll(inputs, true);
     const pred = act[act.length - 1];
     const outAct = this.layers[this.layers.length - 1].neurons[0].activation;
-    let deltas = pred.map((p, i) => (targets[i] - p) * outAct.dfn(p));
-    for (let l = this.layers.length - 1; l >= 0; l--) {
-      const layer = this.layers[l];
-      const layerIn = act[l];
-      const prevAct = l > 0 ? this.layers[l - 1].neurons[0].activation : null;
-      const prevDeltas = layerIn.map((out, j) => {
-        const errProp = layer.neurons.reduce((s, n, k) => s + deltas[k] * n.weights[j], 0);
-        return prevAct ? errProp * prevAct.dfn(out) : errProp;
-      });
-      layer.neurons.forEach((n, k) => {
-        n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
-      });
-      deltas = prevDeltas;
-    }
+    const deltas = pred.map((p, i) => (targets[i] - p) * outAct.dfn(p));
+    this._backpropLayers(act, deltas, lr);
     return pred.reduce((s, p, i) => s + (targets[i] - p) ** 2, 0) / pred.length;
   }
   // Backprop with externally provided output-layer deltas.
   // Useful for custom loss functions (e.g. physics-based gradients).
   trainWithDeltas(inputs, outputDeltas, lr) {
+    const act = this._forwardAll(inputs, true);
+    this._backpropLayers(act, outputDeltas, lr);
+  }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: layer 0 (all neurons), layer 1, ..., layer N.
+  // Clears every dropout mask, then returns a new flat array holding, layer by
+  // layer and neuron by neuron, each neuron's weights followed by its bias.
+  getWeights() {
+    this._dropouts.forEach((dropout) => dropout.resetMask());
+    return this.layers.flatMap(
+      (layer) => layer.neurons.flatMap((neuron) => [...neuron.weights, neuron.bias])
+    );
+  }
+  // Clears every dropout mask, then loads parameters from a flat array laid
+  // out as getWeights() produces it. Neuron weight arrays are overwritten in place.
+  setWeights(weights) {
+    this._dropouts.forEach((dropout) => dropout.resetMask());
+    let cursor = 0;
+    for (const { neurons } of this.layers) {
+      for (const neuron of neurons) {
+        const count = neuron.weights.length;
+        for (let j = 0; j < count; j += 1) {
+          neuron.weights[j] = weights[cursor];
+          cursor += 1;
+        }
+        neuron.bias = weights[cursor];
+        cursor += 1;
+      }
+    }
+  }
+  // ── Private helpers ──────────────────────────────────────────────────────
+  // Resolves the `residual` option for one layer: when the option is a
+  // function it is called with the layer index, otherwise its value is returned as is.
+  _shouldResidual(layerIndex) {
+    const isCallback = typeof this._residual === "function";
+    return isCallback ? this._residual(layerIndex) : this._residual;
+  }
+  // Forward pass storing activations at every layer boundary.
+  // Used by train() and trainWithDeltas(); predict() runs the same forward logic without storing activations.
+  _forwardAll(inputs, training) {
     const act = [inputs];
-    for (const layer of this.layers) act.push(layer.predict(act[act.length - 1]));
+    for (let i = 0; i < this.layers.length; i++) {
+      const layerInput = act[act.length - 1];
+      const layerOutput = this.layers[i].predict(layerInput);
+      let current;
+      if (this._shouldResidual(i) && this.structure[i] === this.structure[i + 1]) {
+        current = layerOutput.map((v, j) => v + layerInput[j]);
+      } else {
+        current = layerOutput;
+      }
+      if (i < this._dropouts.length) {
+        current = this._dropouts[i].forward(current, training);
+      }
+      act.push(current);
+    }
+    return act;
+  }
+  // Backward pass: updates all layer weights given the pre-computed activations
+  // and the initial output-layer deltas.
+  _backpropLayers(act, outputDeltas, lr) {
     let deltas = outputDeltas;
     for (let l = this.layers.length - 1; l >= 0; l--) {
       const layer = this.layers[l];
+      if (l < this._dropouts.length) {
+        deltas = this._dropouts[l].backward(deltas);
+      }
       const layerIn = act[l];
       const prevAct = l > 0 ? this.layers[l - 1].neurons[0].activation : null;
       const prevDeltas = layerIn.map((out, j) => {
         const errProp = layer.neurons.reduce((s, n, k) => s + deltas[k] * n.weights[j], 0);
         return prevAct ? errProp * prevAct.dfn(out) : errProp;
       });
+      if (this._shouldResidual(l) && this.structure[l] === this.structure[l + 1]) {
+        for (let j = 0; j < prevDeltas.length; j++) prevDeltas[j] += deltas[j];
+      }
       layer.neurons.forEach((n, k) => {
         n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
       });
@@ -234,7 +470,7 @@ var Gate = class {
   // shape: [hSize]
   constructor(inputSize, hSize, initBias = 0) {
     const n = inputSize + hSize;
-    const limit = Math.sqrt(2 / n);
+    const limit = Math.sqrt(2 / (n + hSize));
     this.W = Array.from(
       { length: hSize },
       () => Array.from({ length: n }, () => (Math.random() * 2 - 1) * limit)
@@ -248,8 +484,11 @@ var Gate = class {
   }
 };
 var LSTMLayer = class {
-  constructor(inputSize, hiddenSize) {
+  constructor(inputSize, hiddenSize, optimizerFactory = () => new SGD()) {
     this._traj = [];
+    if (inputSize <= 0 || hiddenSize <= 0) {
+      throw new Error(`LSTMLayer: inputSize and hiddenSize must be positive, got ${inputSize} and ${hiddenSize}`);
+    }
     this.inputSize = inputSize;
     this.hSize = hiddenSize;
     this.h = new Array(hiddenSize).fill(0);
@@ -258,6 +497,29 @@ var LSTMLayer = class {
     this.inputGate = new Gate(inputSize, hiddenSize);
     this.cellGate = new Gate(inputSize, hiddenSize);
     this.outputGate = new Gate(inputSize, hiddenSize);
+    const combSize = inputSize + hiddenSize;
+    this._optimizers = {
+      forgetW: Array.from(
+        { length: hiddenSize },
+        () => Array.from({ length: combSize }, () => optimizerFactory())
+      ),
+      forgetB: Array.from({ length: hiddenSize }, () => optimizerFactory()),
+      inputW: Array.from(
+        { length: hiddenSize },
+        () => Array.from({ length: combSize }, () => optimizerFactory())
+      ),
+      inputB: Array.from({ length: hiddenSize }, () => optimizerFactory()),
+      cellW: Array.from(
+        { length: hiddenSize },
+        () => Array.from({ length: combSize }, () => optimizerFactory())
+      ),
+      cellB: Array.from({ length: hiddenSize }, () => optimizerFactory()),
+      outputW: Array.from(
+        { length: hiddenSize },
+        () => Array.from({ length: combSize }, () => optimizerFactory())
+      ),
+      outputB: Array.from({ length: hiddenSize }, () => optimizerFactory())
+    };
   }
   // ── Reset state and trajectory (call at episode start) ────────────────────
   reset() {
@@ -267,6 +529,9 @@ var LSTMLayer = class {
   }
   // ── Forward pass ──────────────────────────────────────────────────────────
   predict(inputs) {
+    if (!Array.isArray(inputs) || inputs.length !== this.inputSize) {
+      throw new Error(`LSTMLayer.predict: expected array of length ${this.inputSize}, got ${inputs?.length}`);
+    }
     const combined = [...inputs, ...this.h];
     const c_prev = [...this.c];
     const zf = this.forgetGate.linear(combined);
@@ -341,15 +606,15 @@ var LSTMLayer = class {
     const scale = lr / T;
     for (let k = 0; k < hSize; k++) {
       for (let j = 0; j < combSize; j++) {
-        this.forgetGate.W[k][j] += scale * dWf[k][j];
-        this.inputGate.W[k][j] += scale * dWi[k][j];
-        this.cellGate.W[k][j] += scale * dWg[k][j];
-        this.outputGate.W[k][j] += scale * dWo[k][j];
+        this.forgetGate.W[k][j] = this._optimizers.forgetW[k][j].step(this.forgetGate.W[k][j], dWf[k][j], scale);
+        this.inputGate.W[k][j] = this._optimizers.inputW[k][j].step(this.inputGate.W[k][j], dWi[k][j], scale);
+        this.cellGate.W[k][j] = this._optimizers.cellW[k][j].step(this.cellGate.W[k][j], dWg[k][j], scale);
+        this.outputGate.W[k][j] = this._optimizers.outputW[k][j].step(this.outputGate.W[k][j], dWo[k][j], scale);
       }
-      this.forgetGate.b[k] += scale * dbf[k];
-      this.inputGate.b[k] += scale * dbi[k];
-      this.cellGate.b[k] += scale * dbg[k];
-      this.outputGate.b[k] += scale * dbo[k];
+      this.forgetGate.b[k] = this._optimizers.forgetB[k].step(this.forgetGate.b[k], dbf[k], scale);
+      this.inputGate.b[k] = this._optimizers.inputB[k].step(this.inputGate.b[k], dbi[k], scale);
+      this.cellGate.b[k] = this._optimizers.cellB[k].step(this.cellGate.b[k], dbg[k], scale);
+      this.outputGate.b[k] = this._optimizers.outputB[k].step(this.outputGate.b[k], dbo[k], scale);
     }
     this._traj = [];
   }
@@ -372,10 +637,38 @@ var LSTMLayer = class {
     this.outputGate.W = data.outputGate.W;
     this.outputGate.b = data.outputGate.b;
   }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: forgetGate (W, b), inputGate (W, b), cellGate (W, b), outputGate (W, b).
+  // Returns a new flat array: for each gate in the order forget, input, cell,
+  // output, the rows of W one after another followed by the bias vector b.
+  getWeightsFlat() {
+    const gates = [this.forgetGate, this.inputGate, this.cellGate, this.outputGate];
+    const flat = [];
+    for (const gate of gates) {
+      for (const row of gate.W) flat.push(...row);
+      flat.push(...gate.b);
+    }
+    return flat;
+  }
+  // Loads gate parameters from a flat array in the layout getWeightsFlat()
+  // produces (forget, input, cell, output; W rows then b). Existing W rows and
+  // b arrays are overwritten in place.
+  setWeightsFlat(weights) {
+    const gates = [this.forgetGate, this.inputGate, this.cellGate, this.outputGate];
+    let cursor = 0;
+    for (const gate of gates) {
+      for (const row of gate.W) {
+        for (let j = 0; j < row.length; j += 1) {
+          row[j] = weights[cursor];
+          cursor += 1;
+        }
+      }
+      for (let k = 0; k < gate.b.length; k += 1) {
+        gate.b[k] = weights[cursor];
+        cursor += 1;
+      }
+    }
+  }
 };
 // src/NetworkLSTM.ts
-var defaultOptimizer4 = () => new SGD();
 var NetworkLSTM = class {
   // [T][layer+1][neuron]
   constructor(inputSize, hiddenSize, denseStructure, options = {}) {
@@ -383,7 +676,7 @@ var NetworkLSTM = class {
     this.hiddenSize = hiddenSize;
     this.lstm = new LSTMLayer(inputSize, hiddenSize);
     const activation = options.denseActivation ?? sigmoid2;
-    const optimizer = options.optimizer ?? defaultOptimizer4;
+    const optimizer = options.optimizer ?? defaultOptimizer;
     this.denseLayers = [];
     const sizes = [hiddenSize, ...denseStructure];
     for (let i = 1; i < sizes.length; i++) {
@@ -398,6 +691,7 @@ var NetworkLSTM = class {
   }
   // ── Forward pass ──────────────────────────────────────────────────────────
   predict(inputs) {
+    validateArray(inputs, this.inputSize, "NetworkLSTM.predict");
     const h = this.lstm.predict(inputs);
     const acts = [h];
     for (const layer of this.denseLayers) {
@@ -473,6 +767,30 @@ var NetworkLSTM = class {
       });
     });
   }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: LSTM (flat), then dense layer 0, dense layer 1, ..., dense layer N.
+  // Returns every trainable parameter as one new flat array: the LSTM's flat
+  // weights first, then each dense layer's neurons (weights followed by bias).
+  getWeightsFlat() {
+    const w = [];
+    // Append element by element: spreading the whole LSTM vector into push()
+    // passes one call argument per weight, which throws a RangeError on
+    // engines that cap the argument count once the LSTM is large.
+    for (const value of this.lstm.getWeightsFlat()) w.push(value);
+    for (const layer of this.denseLayers) {
+      for (const n of layer.neurons) {
+        w.push(...n.weights, n.bias);
+      }
+    }
+    return w;
+  }
+  // Loads parameters from a flat array in the layout getWeightsFlat() produces:
+  // the leading slice goes to the LSTM, the rest fills the dense layers
+  // neuron by neuron (weights, then bias).
+  setWeightsFlat(weights) {
+    // The LSTM's parameter count is taken from its current flat weights.
+    const lstmLen = this.lstm.getWeightsFlat().length;
+    this.lstm.setWeightsFlat(weights.slice(0, lstmLen));
+    let cursor = lstmLen;
+    for (const { neurons } of this.denseLayers) {
+      for (const neuron of neurons) {
+        for (let j = 0; j < neuron.weights.length; j += 1) {
+          neuron.weights[j] = weights[cursor];
+          cursor += 1;
+        }
+        neuron.bias = weights[cursor];
+        cursor += 1;
+      }
+    }
+  }
 };
 // src/MatMul.ts
@@ -480,6 +798,9 @@ function matMul(A, B) {
   const rows = A.length;
   const inner = B.length;
   const cols = B[0].length;
+  if (A[0].length !== B.length) {
+    throw new Error(`Incompatible dimensions for matrix multiplication: A cols (${A[0].length}) !== B rows (${B.length})`);
+  }
   const C = Array.from({ length: rows }, () => new Array(cols).fill(0));
   for (let i = 0; i < rows; i++)
     for (let k = 0; k < inner; k++) {
@@ -530,6 +851,33 @@ var WeightMatrix = class {
         this.W[i][j] = this.opts[i][j].step(this.W[i][j], g, lr);
       }
   }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Returns a new flat array with the rows of W laid end to end.
+  getWeights() {
+    return this.W.flatMap((row) => [...row]);
+  }
+  // Overwrites W in place, row by row, reading consecutive values from `weights`.
+  setWeights(weights) {
+    let cursor = 0;
+    for (const row of this.W) {
+      for (let j = 0; j < row.length; j += 1) {
+        row[j] = weights[cursor];
+        cursor += 1;
+      }
+    }
+  }
+};
+// Bias vector with one optimizer per entry: `values` starts as `size` zeros and
+// every entry gets its own Adam instance.
+var BiasVector = class {
+  constructor(size) {
+    this.values = new Array(size).fill(0);
+    this.opts = Array.from({ length: size }, () => new Adam());
+  }
+  // Steps every entry with its own optimizer, using the matching component of `grad`.
+  update(grad, lr) {
+    this.values.forEach((current, i) => {
+      this.values[i] = this.opts[i].step(current, grad[i], lr);
+    });
+  }
+  // Returns a copy of the current values.
+  getWeights() {
+    return this.values.slice();
+  }
+  // Overwrites each entry with the value at the same index of `weights`.
+  setWeights(weights) {
+    this.values.forEach((_, i) => {
+      this.values[i] = weights[i];
+    });
+  }
 };
 var EmbeddingMatrix = class {
   constructor(vocabSize, d_model) {
@@ -546,15 +894,29 @@ var EmbeddingMatrix = class {
     for (let m = 0; m < this.W[idx].length; m++)
       this.W[idx][m] += lr * grad[m];
   }
+  // ── Serializable interface ─────────────────────────────────────────────────
+  // Flattened order: row 0, row 1, ... row (vocabSize-1)
+  // Returns a new flat array with the rows of W laid end to end.
+  getWeights() {
+    return this.W.flatMap((row) => [...row]);
+  }
+  // Overwrites W in place, row by row, reading consecutive values from `weights`.
+  setWeights(weights) {
+    let cursor = 0;
+    for (const row of this.W) {
+      for (let j = 0; j < row.length; j += 1) {
+        row[j] = weights[cursor];
+        cursor += 1;
+      }
+    }
+  }
 };
 // src/AttentionHead.ts
 var AttentionHead = class {
-  constructor(d_model, d_k, d_v) {
+  constructor(d_model, d_k, d_v, causal = false) {
     // d_v × d_model
     this.cache = null;
     this.d_k = d_k;
     this.d_v = d_v;
+    this.causal = causal;
     this.Wq = new WeightMatrix(d_k, d_model);
     this.Wk = new WeightMatrix(d_k, d_model);
     this.Wv = new WeightMatrix(d_v, d_model);
@@ -575,10 +937,10 @@ var AttentionHead = class {
     );
     const scores = Array.from(
       { length: seqLen },
-      (_, i) => Array.from(
-        { length: seqLen },
-        (_2, j) => Q[i].reduce((s, q, k) => s + q * K[j][k], 0) * scale
-      )
+      (_, i) => Array.from({ length: seqLen }, (_2, j) => {
+        if (this.causal && j > i) return -Infinity;
+        return Q[i].reduce((s, q, k) => s + q * K[j][k], 0) * scale;
+      })
     );
     const attn = scores.map((row) => softmax(row));
     const out = Array.from(
@@ -602,6 +964,7 @@ var AttentionHead = class {
   //   5. dWq  = dQ^T @ X,      dWk = dK^T @ X,  dWv = dV^T @ X
   //   6. dX   = dQ @ Wq  +  dK @ Wk  +  dV @ Wv
   backward(dOut, lr) {
+    if (!this.cache) throw new Error("AttentionHead.backward() called before predict()");
     const { X, Q, K, V, attn } = this.cache;
     const seqLen = X.length;
     const d_model = X[0].length;
@@ -674,21 +1037,40 @@ var AttentionHead = class {
   getAttentionWeights() {
     return this.cache ? this.cache.attn : null;
   }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: Wq, Wk, Wv.
+  // Returns a new flat array: the rows of Wq, then Wk, then Wv.
+  getWeights() {
+    const flat = [];
+    for (const matrix of [this.Wq, this.Wk, this.Wv]) {
+      for (const row of matrix.W) flat.push(...row);
+    }
+    return flat;
+  }
+  // Overwrites Wq, Wk and Wv in place (in that order, row by row), reading
+  // consecutive values from `weights`.
+  setWeights(weights) {
+    let cursor = 0;
+    for (const matrix of [this.Wq, this.Wk, this.Wv]) {
+      for (const row of matrix.W) {
+        for (let j = 0; j < row.length; j += 1) {
+          row[j] = weights[cursor];
+          cursor += 1;
+        }
+      }
+    }
+  }
 };
 // src/MultiHeadAttention.ts
 var MultiHeadAttention = class {
   // seqLen × (nHeads * d_k)
-  constructor(d_model, nHeads) {
+  constructor(d_model, nHeads, causal = false) {
     // d_model × (nHeads * d_k)
     // Cached for backward
     this._concat = null;
     this.nHeads = nHeads;
     this.d_model = d_model;
     this.d_k = Math.floor(d_model / nHeads);
+    this.causal = causal;
     this.heads = Array.from(
       { length: nHeads },
-      () => new AttentionHead(d_model, this.d_k, this.d_k)
+      () => new AttentionHead(d_model, this.d_k, this.d_k, causal)
     );
     this.Wo = new WeightMatrix(d_model, nHeads * this.d_k);
   }
@@ -710,6 +1092,7 @@ var MultiHeadAttention = class {
   // ── Backward ──────────────────────────────────────────────────────────────
   // dOut: seqLen × d_model  →  dX: seqLen × d_model
   backward(dOut, lr) {
+    if (!this._concat) throw new Error("MultiHeadAttention.backward() called before predict()");
     const seqLen = dOut.length;
     const concatD = this.nHeads * this.d_k;
     const d_model = this.d_model;
@@ -747,6 +1130,31 @@ var MultiHeadAttention = class {
   getAttentionWeights() {
     return this.heads.map((h) => h.getAttentionWeights());
   }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: head0 (Wq, Wk, Wv), head1, ..., headN, then Wo.
+  // Returns a new flat array: for each head in order the rows of Wq, Wk, Wv,
+  // and finally the rows of the output projection Wo.
+  getWeights() {
+    const matrices = this.heads.flatMap((head) => [head.Wq, head.Wk, head.Wv]);
+    matrices.push(this.Wo);
+    const flat = [];
+    for (const matrix of matrices) {
+      for (const row of matrix.W) flat.push(...row);
+    }
+    return flat;
+  }
+  // Overwrites every head's Wq, Wk, Wv and then Wo in place, reading
+  // consecutive values from `weights` in the layout getWeights() produces.
+  setWeights(weights) {
+    const matrices = this.heads.flatMap((head) => [head.Wq, head.Wk, head.Wv]);
+    matrices.push(this.Wo);
+    let cursor = 0;
+    for (const matrix of matrices) {
+      for (const row of matrix.W) {
+        for (let j = 0; j < row.length; j += 1) {
+          row[j] = weights[cursor];
+          cursor += 1;
+        }
+      }
+    }
+  }
 };
 // src/LayerNorm.ts
@@ -789,20 +1197,32 @@ var LayerNorm = class {
   backwardOne(dOut, pos, lr) {
     const { x_norm, std } = this._cache[pos];
     const N = dOut.length;
+    const gammaOld = this.gamma.slice();
     for (let i = 0; i < N; i++) {
       this.gamma[i] += lr * dOut[i] * x_norm[i];
       this.beta[i] += lr * dOut[i];
     }
-    const D = dOut.map((d, i) => d * this.gamma[i]);
+    const D = dOut.map((d, i) => d * gammaOld[i]);
     const mD = D.reduce((s, v) => s + v, 0) / N;
     const mDxn = D.reduce((s, d, i) => s + d * x_norm[i], 0) / N;
     return D.map((d, i) => (d - mD - x_norm[i] * mDxn) / std);
   }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: gamma, beta.
+  // Returns a new array holding every gamma value followed by every beta value.
+  getWeights() {
+    const params = [];
+    for (const g of this.gamma) params.push(g);
+    for (const b of this.beta) params.push(b);
+    return params;
+  }
+  // Overwrites gamma and beta in place from `weights`, laid out as
+  // [gamma..., beta...], where each part has gamma.length entries.
+  setWeights(weights) {
+    const dim = this.gamma.length;
+    for (let i = 0; i < dim; i += 1) {
+      this.gamma[i] = weights[i];
+      this.beta[i] = weights[dim + i];
+    }
+  }
 };
 // src/TransformerBlock.ts
 var TransformerBlock = class {
-  constructor({ d_model, nHeads, d_ff }) {
+  constructor({ d_model, nHeads, d_ff, causal = false }) {
+    // d_model
     // Forward caches (needed for backprop)
     this._X = null;
     this._attnOut = null;
@@ -814,15 +1234,13 @@ var TransformerBlock = class {
     this._ff2Out = null;
     this.d_model = d_model;
     this.d_ff = d_ff;
-    this.attn = new MultiHeadAttention(d_model, nHeads);
+    this.attn = new MultiHeadAttention(d_model, nHeads, causal);
     this.norm1 = new LayerNorm(d_model);
     this.norm2 = new LayerNorm(d_model);
     this.ff1 = new WeightMatrix(d_ff, d_model);
     this.ff2 = new WeightMatrix(d_model, d_ff);
-    this.b1 = new Array(d_ff).fill(0);
-    this.b2 = new Array(d_model).fill(0);
-    this.b1Opts = Array.from({ length: d_ff }, () => new Adam());
-    this.b2Opts = Array.from({ length: d_model }, () => new Adam());
+    this.b1 = new BiasVector(d_ff);
+    this.b2 = new BiasVector(d_model);
   }
   // ── Forward ───────────────────────────────────────────────────────────────
   // X: seqLen × d_model  →  out: seqLen × d_model
@@ -835,11 +1253,11 @@ var TransformerBlock = class {
       return this.norm1.predictOne(added, i);
     });
     const ff1Pre = h1.map(
-      (h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1[k]))
+      (h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1.values[k]))
     );
     const ff1Out = ff1Pre.map((pre) => pre.map((v) => Math.max(0, v)));
     const ff2Out = ff1Out.map(
-      (h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2[k]))
+      (h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2.values[k]))
     );
     this.norm2.resetCache(seqLen);
     const out = h1.map((h, i) => {
@@ -857,6 +1275,9 @@ var TransformerBlock = class {
   // ── Backward ──────────────────────────────────────────────────────────────
   // dOut: seqLen × d_model  →  dX: seqLen × d_model
   backward(dOut, lr) {
+    if (!this._h1 || !this._ff1Out || !this._ff1Pre) {
+      throw new Error("TransformerBlock.backward() called before predict()");
+    }
     const seqLen = dOut.length;
     const d_model = this.d_model;
     const h1 = this._h1;
@@ -881,8 +1302,7 @@ var TransformerBlock = class {
       (_, m) => dAdded2.reduce((s, da) => s + da[m], 0)
     );
     this.ff2.update(dW2, lr);
-    for (let m = 0; m < d_model; m++)
-      this.b2[m] = this.b2Opts[m].step(this.b2[m], db2[m], lr);
+    this.b2.update(db2, lr);
     const dFf1Pre = dFf1Out.map(
       (d, i) => d.map((v, k) => ff1Pre[i][k] > 0 ? v : 0)
     );
@@ -904,8 +1324,7 @@ var TransformerBlock = class {
       (_, k) => dFf1Pre.reduce((s, dp) => s + dp[k], 0)
     );
     this.ff1.update(dW1, lr);
-    for (let k = 0; k < this.d_ff; k++)
-      this.b1[k] = this.b1Opts[k].step(this.b1[k], db1[k], lr);
+    this.b1.update(db1, lr);
     const dH1 = Array.from(
       { length: seqLen },
       (_, i) => dH1_fromFf[i].map((v, m) => v + dAdded2[i][m])
@@ -927,6 +1346,36 @@ var TransformerBlock = class {
   getAttentionWeights() {
     return this.attn.getAttentionWeights();
   }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: attn (MHA), norm1 (gamma, beta), ff1, b1, ff2, b2, norm2 (gamma, beta).
+  /**
+   * Serializes every trainable parameter of the block into one flat array.
+   * Order: attn, norm1 (gamma, beta), ff1, b1, ff2, b2, norm2 (gamma, beta).
+   *
+   * Values are appended one at a time rather than with `push(...values)`:
+   * spreading a large array into call arguments can exceed the engine's
+   * argument limit and throw a RangeError, which the attention weights of a
+   * big model can trigger.
+   * @returns {number[]} a fresh flat array of the block's weights.
+   */
+  getWeights() {
+    const flat = [];
+    const append = (values) => {
+      for (const v of values) flat.push(v);
+    };
+    append(this.attn.getWeights());
+    append(this.norm1.gamma);
+    append(this.norm1.beta);
+    for (const row of this.ff1.W) append(row);
+    append(this.b1.values);
+    for (const row of this.ff2.W) append(row);
+    append(this.b2.values);
+    append(this.norm2.gamma);
+    append(this.norm2.beta);
+    return flat;
+  }
+  /**
+   * Restores the block from a flat array laid out as getWeights() produces it:
+   * attn, norm1, ff1, b1, ff2, b2, norm2.
+   *
+   * Each segment length is computed exactly once; sub-module getWeights()
+   * calls build a full flat copy, so they are not repeated just to advance
+   * the cursor.
+   * @param {number[]} weights - flat weights, read starting at index 0.
+   */
+  setWeights(weights) {
+    let offset = 0;
+    // Cuts the next `count` values out of `weights` and moves the cursor on.
+    const take = (count) => {
+      const segment = weights.slice(offset, offset + count);
+      offset += count;
+      return segment;
+    };
+    this.attn.setWeights(take(this.attn.getWeights().length));
+    this.norm1.setWeights(take(this.norm1.getWeights().length));
+    this.ff1.setWeights(take(this.ff1.getWeights().length));
+    this.b1.setWeights(take(this.b1.values.length));
+    this.ff2.setWeights(take(this.ff2.getWeights().length));
+    this.b2.setWeights(take(this.b2.values.length));
+    this.norm2.setWeights(take(this.norm2.getWeights().length));
+  }
 };
 // src/NetworkTransformer.ts
@@ -951,8 +1400,7 @@ var NetworkTransformer = class {
       () => new TransformerBlock({ d_model, nHeads, d_ff })
     );
     this.outputProj = new WeightMatrix(nClasses, d_model);
-    this.outputBias = new Array(nClasses).fill(0);
-    this.outBiasOpts = Array.from({ length: nClasses }, () => new Adam());
+    this.outputBias = new BiasVector(nClasses);
   }
   // ── Forward pass ──────────────────────────────────────────────────────────
   // tokens: seqLen integer ids  →  seqLen * nClasses logits (flattened)
@@ -960,7 +1408,7 @@ var NetworkTransformer = class {
     const h = this._forward(tokens);
     return h.flatMap(
       (hi) => this.outputProj.W.map(
-        (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
+        (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
       )
     );
   }
@@ -974,7 +1422,7 @@ var NetworkTransformer = class {
     const h = this._forward(tokens);
     const logits = h.map(
       (hi) => this.outputProj.W.map(
-        (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
+        (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
       )
     );
     let loss = 0;
@@ -1009,8 +1457,7 @@ var NetworkTransformer = class {
       (_, c) => dLogits.reduce((s, dl) => s + dl[c], 0)
     );
     this.outputProj.update(dWout, lr);
-    for (let c = 0; c < this.nClasses; c++)
-      this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
+    this.outputBias.update(dBout, lr);
     let dX = dH;
     for (let b = this.blocks.length - 1; b >= 0; b--)
       dX = this.blocks[b].backward(dX, lr);
@@ -1025,6 +1472,35 @@ var NetworkTransformer = class {
   getAttentionWeights() {
     return this.blocks.map((b) => b.getAttentionWeights());
   }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: tokenEmb, posEmb, block0, block1, ..., blockN, outputProj, outputBias.
+  /**
+   * Serializes the whole network into one flat array.
+   * Order: tokenEmb, posEmb, each block in order, outputProj, outputBias.
+   *
+   * Values are appended element by element instead of `push(...values)`:
+   * a whole block's weights spread into call arguments can exceed the
+   * engine's argument limit and throw a RangeError.
+   * @returns {number[]} a fresh flat array of all weights.
+   */
+  getWeights() {
+    const flat = [];
+    const append = (values) => {
+      for (const v of values) flat.push(v);
+    };
+    append(this.tokenEmb.getWeights());
+    append(this.posEmb.getWeights());
+    for (const block of this.blocks) append(block.getWeights());
+    append(this.outputProj.getWeights());
+    append(this.outputBias.getWeights());
+    return flat;
+  }
+  /**
+   * Restores the network from a flat array laid out as getWeights() produces
+   * it: tokenEmb, posEmb, each block in order, outputProj, outputBias.
+   * @param {number[]} weights - flat weights, read starting at index 0.
+   */
+  setWeights(weights) {
+    let offset = 0;
+    // Cuts the next `count` values out of `weights` and moves the cursor on.
+    const take = (count) => {
+      const segment = weights.slice(offset, offset + count);
+      offset += count;
+      return segment;
+    };
+    this.tokenEmb.setWeights(take(this.tokenEmb.getWeights().length));
+    this.posEmb.setWeights(take(this.posEmb.getWeights().length));
+    for (const block of this.blocks) {
+      block.setWeights(take(block.getWeights().length));
+    }
+    this.outputProj.setWeights(take(this.outputProj.getWeights().length));
+    this.outputBias.setWeights(take(this.outputBias.values.length));
+  }
   // ── Internal ──────────────────────────────────────────────────────────────
   // Shared embedding + block forward pass.
   _forward(tokens) {
@@ -1044,25 +1520,28 @@ var NetworkTransformerRL = class {
   constructor(seqLen, inputDim, options = {}) {
     // Forward caches para backprop
     this._projected = null;
+    // For max pooling backward: argmax per dimension across all positions
+    this._argmax = null;
     const {
       d_model = 32,
       nHeads = 2,
       d_ff = 64,
       nBlocks = 2,
-      nActions = 2
+      nActions = 2,
+      pooling = "weighted"
     } = options;
     this.seqLen = seqLen;
     this.inputDim = inputDim;
     this.d_model = d_model;
     this.nActions = nActions;
+    this._pooling = pooling;
     this.inputProj = new WeightMatrix(d_model, inputDim);
     this.blocks = Array.from(
       { length: nBlocks },
-      () => new TransformerBlock({ d_model, nHeads, d_ff })
+      () => new TransformerBlock({ d_model, nHeads, d_ff, causal: true })
     );
     this.outputProj = new WeightMatrix(nActions, d_model);
-    this.outputBias = new Array(nActions).fill(0);
-    this.outBiasOpts = Array.from({ length: nActions }, () => new Adam());
+    this.outputBias = new BiasVector(nActions);
   }
   // ── Forward ────────────────────────────────────────────────────────────────
   // sequence: seqLen × inputDim → nActions Q-values
@@ -1070,7 +1549,7 @@ var NetworkTransformerRL = class {
     const h = this._forward(sequence);
     const pooled = this._pool(h);
     return this.outputProj.W.map(
-      (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
+      (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
     );
   }
   // ── Training ────────────────────────────────────────────────────────────────
@@ -1082,7 +1561,7 @@ var NetworkTransformerRL = class {
     const h = this._forward(sequence);
     const pooled = this._pool(h);
     const pred = this.outputProj.W.map(
-      (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
+      (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
     );
     const n = this.nActions;
     let loss = 0;
@@ -1105,13 +1584,8 @@ var NetworkTransformerRL = class {
     );
     const dBout = dPred.slice();
     this.outputProj.update(dWout, lr);
-    for (let c = 0; c < this.nActions; c++)
-      this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
-    let dH = Array.from(
-      { length: this.seqLen },
-      (_, i) => dPooled.map((v) => v / this.seqLen)
-      // Gradiente dividido entre posiciones
-    );
+    this.outputBias.update(dBout, lr);
+    let dH = this._distributePoolGradient(dPooled);
     for (let b = this.blocks.length - 1; b >= 0; b--)
       dH = this.blocks[b].backward(dH, lr);
     for (let i = 0; i < this.seqLen; i++) {
@@ -1130,8 +1604,32 @@ var NetworkTransformerRL = class {
   getAttentionWeights() {
     return this.blocks.map((b) => b.getAttentionWeights());
   }
-  // ── Serialization ──────────────────────────────────────────────────────────
-  getWeights() {
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: inputProj, block0, block1, ..., blockN, outputProj, outputBias.
+  /**
+   * Serializes the whole network into one flat array.
+   * Order: inputProj, each block in order, outputProj, outputBias.
+   *
+   * Values are appended element by element instead of `push(...values)`:
+   * a whole block's weights spread into call arguments can exceed the
+   * engine's argument limit and throw a RangeError.
+   * @returns {number[]} a fresh flat array of all weights.
+   */
+  getWeightsFlat() {
+    const flat = [];
+    const append = (values) => {
+      for (const v of values) flat.push(v);
+    };
+    append(this.inputProj.getWeights());
+    for (const block of this.blocks) append(block.getWeights());
+    append(this.outputProj.getWeights());
+    append(this.outputBias.getWeights());
+    return flat;
+  }
+  /**
+   * Restores the network from a flat array laid out as getWeightsFlat()
+   * produces it: inputProj, each block in order, outputProj, outputBias.
+   * @param {number[]} weights - flat weights, read starting at index 0.
+   */
+  setWeightsFlat(weights) {
+    let offset = 0;
+    // Cuts the next `count` values out of `weights` and moves the cursor on.
+    const take = (count) => {
+      const segment = weights.slice(offset, offset + count);
+      offset += count;
+      return segment;
+    };
+    this.inputProj.setWeights(take(this.inputProj.getWeights().length));
+    for (const block of this.blocks) {
+      block.setWeights(take(block.getWeights().length));
+    }
+    this.outputProj.setWeights(take(this.outputProj.getWeights().length));
+    this.outputBias.setWeights(take(this.outputBias.values.length));
+  }
+  getWeightsStructured() {
     return {
       inputProj: this.inputProj.W.map((r) => [...r]),
       blocks: this.blocks.map((b) => ({
@@ -1147,17 +1645,15 @@ var NetworkTransformerRL = class {
         norm2: { gamma: [...b.norm2.gamma], beta: [...b.norm2.beta] },
         ff1: b.ff1.W.map((r) => [...r]),
         ff2: b.ff2.W.map((r) => [...r]),
-        b1: [...b.b1],
-        b2: [...b.b2]
+        b1: [...b.b1.values],
+        b2: [...b.b2.values]
       })),
       outputProj: this.outputProj.W.map((r) => [...r]),
-      outputBias: [...this.outputBias]
+      outputBias: [...this.outputBias.values]
     };
   }
-  setWeights(data) {
-    data.inputProj.forEach((row, i) => {
-      this.inputProj.W[i] = [...row];
-    });
+  setWeightsStructured(data) {
+    this.inputProj.setWeights(data.inputProj.flat());
     data.blocks.forEach((bd, b) => {
       const blk = this.blocks[b];
       bd.attn.heads.forEach((hd, h) => {
@@ -1172,11 +1668,20 @@ var NetworkTransformerRL = class {
       blk.norm2.beta = [...bd.norm2.beta];
       blk.ff1.W = bd.ff1.map((r) => [...r]);
       blk.ff2.W = bd.ff2.map((r) => [...r]);
-      blk.b1 = [...bd.b1];
-      blk.b2 = [...bd.b2];
+      blk.b1.setWeights(bd.b1);
+      blk.b2.setWeights(bd.b2);
     });
     this.outputProj.W = data.outputProj.map((r) => [...r]);
-    this.outputBias = [...data.outputBias];
+    this.outputBias.setWeights(data.outputBias);
+  }
+  // ── Serializable interface (flat array) ────────────────────────────────────
+  // These satisfy the Serializable interface from ModelSaver, which requires
+  // getWeights(): number[] and setWeights(weights: number[]): void.
+  getWeights() {
+    // Flat-array alias: returns exactly what getWeightsFlat() returns.
+    const flat = this.getWeightsFlat();
+    return flat;
+  }
+  setWeights(weights) {
+    // Flat-array alias: hands the array to setWeightsFlat() unchanged.
+    const flat = weights;
+    this.setWeightsFlat(flat);
   }
   // ── Internal ────────────────────────────────────────────────────────────────
   _forward(sequence) {
@@ -1191,6 +1696,44 @@ var NetworkTransformerRL = class {
     return h;
   }
   _pool(h) {
+    // Dispatches to the pooling strategy stored in this._pooling. Any value
+    // other than "avg", "max" or "last" falls through to weighted pooling.
+    const mode = this._pooling;
+    if (mode === "avg") {
+      return this._poolAvg(h);
+    }
+    if (mode === "max") {
+      return this._poolMax(h);
+    }
+    if (mode === "last") {
+      return this._poolLast(h);
+    }
+    return this._poolWeighted(h);
+  }
+  _poolAvg(h) {
+    // Mean over positions, computed independently for each of the d_model
+    // feature dimensions.
+    const count = h.length;
+    return Array.from(
+      { length: this.d_model },
+      (_, dim) => h.reduce((total, row) => total + row[dim], 0) / count
+    );
+  }
+  _poolMax(h) {
+    // Per-dimension maximum over positions. The winning position of each
+    // dimension is recorded in this._argmax (first occurrence wins on ties)
+    // so the gradient can later be routed back to it.
+    const winners = new Array(this.d_model).fill(0);
+    this._argmax = winners;
+    return Array.from({ length: this.d_model }, (_, dim) => {
+      let best = -Infinity;
+      h.forEach((row, pos) => {
+        if (row[dim] > best) {
+          best = row[dim];
+          winners[dim] = pos;
+        }
+      });
+      return best;
+    });
+  }
+  _poolLast(h) {
+    // Copy of the final position's vector; the caller's row is not shared.
+    const lastRow = h[h.length - 1];
+    return [...lastRow];
+  }
+  _poolWeighted(h) {
     const weights = Array.from(
       { length: this.seqLen },
       (_, i) => i === this.seqLen - 1 ? 2 : 1
@@ -1203,6 +1746,55 @@ var NetworkTransformerRL = class {
       return sum / totalWeight;
     });
   }
+  /** Returns the current pooling type for inspection. */
+  getPoolingType() {
+    const { _pooling: mode } = this;
+    return mode;
+  }
+  // ── Helper: distribute pooled gradient back to each position ────────────────
+  // Must match the same distribution as _pool() used during forward.
+  _distributePoolGradient(dPooled) {
+    // Returns seqLen rows, each holding this position's share of dPooled,
+    // mirroring the pooling strategy stored in this._pooling.
+    const steps = this.seqLen;
+    const mode = this._pooling;
+    // Every position receives an equal 1/steps share.
+    const spreadEvenly = () => Array.from(
+      { length: steps },
+      () => dPooled.map((g) => g / steps)
+    );
+    if (mode === "avg") {
+      return spreadEvenly();
+    }
+    if (mode === "max") {
+      const winners = this._argmax;
+      // No recorded argmax (max pooling has not run): fall back to an even split.
+      if (!winners) {
+        return spreadEvenly();
+      }
+      // Each dimension's gradient goes only to the position that won the max.
+      return Array.from(
+        { length: steps },
+        (_, pos) => dPooled.map((g, dim) => pos === winners[dim] ? g : 0)
+      );
+    }
+    if (mode === "last") {
+      // Only the final position contributed; all others get zeros.
+      return Array.from(
+        { length: steps },
+        (_, pos) => pos === steps - 1 ? [...dPooled] : new Array(this.d_model).fill(0)
+      );
+    }
+    // "weighted" and any unrecognised mode: the last position counts double.
+    const posWeights = Array.from(
+      { length: steps },
+      (_, pos) => pos === steps - 1 ? 2 : 1
+    );
+    const weightSum = posWeights.reduce((a, b) => a + b, 0);
+    return Array.from(
+      { length: steps },
+      (_, pos) => dPooled.map((g) => g * posWeights[pos] / weightSum)
+    );
+  }
 };
 // src/losses.ts
@@ -1227,13 +1819,802 @@ function crossEntropyDeltaRaw(predicted, actual) {
   const p = Math.max(eps, Math.min(1 - eps, predicted));
   return actual / p - (1 - actual) / (1 - p);
 }
+// src/GRU.ts
+function sigmoid4(x) {
+  // Logistic function 1 / (1 + e^-x).
+  const decay = Math.exp(-x);
+  return 1 / (1 + decay);
+}
+/**
+ * Hyperbolic tangent.
+ * Delegates to Math.tanh: the hand-rolled (e^2x - 1) / (e^2x + 1) form
+ * overflows to Infinity / Infinity = NaN once 2x exceeds the range of
+ * Math.exp, whereas Math.tanh saturates cleanly at +/-1.
+ * @param {number} x
+ * @returns {number} tanh(x), in [-1, 1].
+ */
+function tanhFn(x) {
+  return Math.tanh(x);
+}
+var Gate2 = class {
+  // One gate's affine map: an hSize x (inputSize + hSize) weight matrix W,
+  // randomly initialised in [-bound, bound], plus a bias vector b filled
+  // with initBias.
+  constructor(inputSize, hSize, initBias = 0) {
+    const fanIn = inputSize + hSize;
+    const bound = Math.sqrt(2 / (fanIn + hSize));
+    const rows = [];
+    for (let r = 0; r < hSize; r++) {
+      const row = [];
+      for (let c = 0; c < fanIn; c++) {
+        row.push((Math.random() * 2 - 1) * bound);
+      }
+      rows.push(row);
+    }
+    this.W = rows;
+    this.b = new Array(hSize).fill(initBias);
+  }
+  // Returns W * combined + b as a new array with one entry per row of W.
+  linear(combined) {
+    return this.W.map((row, i) => {
+      let acc = this.b[i];
+      for (let j = 0; j < row.length; j++) {
+        acc += row[j] * combined[j];
+      }
+      return acc;
+    });
+  }
+};
+var GRULayer = class {
+  /**
+   * GRU layer that keeps a hidden state between predict() calls and records
+   * every step so backprop() can replay them in reverse.
+   * @param {number} inputSize - length of each input vector (must be > 0).
+   * @param {number} hiddenSize - length of the hidden state (must be > 0).
+   * @param {Function} optimizerFactory - invoked once per trainable scalar;
+   *   each returned object must expose step(value, grad, lr) returning the
+   *   updated value.
+   * @throws {Error} when inputSize or hiddenSize is not positive.
+   */
+  constructor(inputSize, hiddenSize, optimizerFactory = () => new SGD()) {
+    this._traj = [];
+    if (inputSize <= 0 || hiddenSize <= 0) {
+      throw new Error(`GRULayer: inputSize and hiddenSize must be positive, got ${inputSize} and ${hiddenSize}`);
+    }
+    this.inputSize = inputSize;
+    this.hSize = hiddenSize;
+    this.h = new Array(hiddenSize).fill(0);
+    this.resetGate = new Gate2(inputSize, hiddenSize);
+    this.updateGate = new Gate2(inputSize, hiddenSize);
+    this.newGate = new Gate2(inputSize, hiddenSize);
+    const combSize = inputSize + hiddenSize;
+    // One optimizer per weight / per bias, mirroring the gate shapes.
+    const perWeight = () => Array.from(
+      { length: hiddenSize },
+      () => Array.from({ length: combSize }, () => optimizerFactory())
+    );
+    const perBias = () => Array.from({ length: hiddenSize }, () => optimizerFactory());
+    this._optimizers = {
+      resetW: perWeight(),
+      resetB: perBias(),
+      updateW: perWeight(),
+      updateB: perBias(),
+      newW: perWeight(),
+      newB: perBias()
+    };
+  }
+  /** Zeroes the hidden state and discards the recorded trajectory. */
+  reset() {
+    this.h = new Array(this.hSize).fill(0);
+    this._traj = [];
+  }
+  /**
+   * Advances the layer by one time step and records the step for backprop().
+   * @param {number[]} inputs - vector of length inputSize.
+   * @returns {number[]} the new hidden state (also stored on this.h).
+   * @throws {Error} when inputs is not an array of length inputSize.
+   */
+  predict(inputs) {
+    if (!Array.isArray(inputs) || inputs.length !== this.inputSize) {
+      throw new Error(`GRULayer.predict: expected array of length ${this.inputSize}, got ${inputs?.length}`);
+    }
+    const combined = [...inputs, ...this.h];
+    const h_prev = [...this.h];
+    const r_pre = this.resetGate.linear(combined);
+    const z_pre = this.updateGate.linear(combined);
+    const r_a = r_pre.map(sigmoid4);
+    const z_a = z_pre.map(sigmoid4);
+    // The candidate state sees the previous hidden state scaled by the reset gate.
+    const combined_r = [...inputs, ...r_a.map((r, i) => r * h_prev[i])];
+    const n_pre = this.newGate.linear(combined_r);
+    const n_a = n_pre.map(tanhFn);
+    // h = (1 - z) * n + z * h_prev
+    const h = n_a.map((n, i) => (1 - z_a[i]) * n + z_a[i] * h_prev[i]);
+    this._traj.push({ combined, h_prev, r: r_pre, r_a, z: z_pre, z_a, combined_r, n_pre, n_a, h });
+    this.h = h;
+    return h;
+  }
+  /**
+   * Walks the recorded trajectory backwards, accumulates gradients for all
+   * three gates, applies one optimizer step per parameter with learning rate
+   * lr / T, and clears the trajectory.
+   * Does nothing (and keeps the trajectory) when no steps are recorded or
+   * dh_seq does not have exactly one entry per recorded step.
+   * @param {number[][]} dh_seq - per-step signal w.r.t. the hidden state.
+   * @param {number} lr - learning rate before the 1/T scaling.
+   */
+  backprop(dh_seq, lr) {
+    const T = this._traj.length;
+    if (T === 0 || dh_seq.length !== T) return;
+    const hSize = this.hSize;
+    const combSize = this.inputSize + hSize;
+    const dWr = Array.from({ length: hSize }, () => new Array(combSize).fill(0));
+    const dWz = Array.from({ length: hSize }, () => new Array(combSize).fill(0));
+    const dWn = Array.from({ length: hSize }, () => new Array(combSize).fill(0));
+    const dbr = new Array(hSize).fill(0);
+    const dbz = new Array(hSize).fill(0);
+    const dbn = new Array(hSize).fill(0);
+    let dh_next = new Array(hSize).fill(0);
+    for (let t = T - 1; t >= 0; t--) {
+      const s = this._traj[t];
+      // Signal at this step = external signal + what flows back from step t + 1.
+      const dh = dh_seq[t].map((d, i) => d + dh_next[i]);
+      const dz_a = dh.map((d, i) => (s.h_prev[i] - s.n_a[i]) * d);
+      const dn_a = dh.map((d, i) => (1 - s.z_a[i]) * d);
+      const dn_pre = dn_a.map((d, i) => d * (1 - s.n_a[i] ** 2));
+      const dz_pre = dz_a.map((d, i) => d * s.z_a[i] * (1 - s.z_a[i]));
+      // Signal w.r.t. the reset-scaled hidden part (r * h_prev) of combined_r.
+      const dr_hprev = Array.from(
+        { length: hSize },
+        (_, i) => this.newGate.W.reduce((sum, row, k) => sum + dn_pre[k] * row[this.inputSize + i], 0)
+      );
+      const dr_a = dr_hprev.map((d, i) => d * s.h_prev[i]);
+      const dr_pre = dr_a.map((d, i) => d * s.r_a[i] * (1 - s.r_a[i]));
+      for (let k = 0; k < hSize; k++) {
+        for (let j = 0; j < combSize; j++) {
+          dWr[k][j] += dr_pre[k] * s.combined[j];
+          dWz[k][j] += dz_pre[k] * s.combined[j];
+          dWn[k][j] += dn_pre[k] * s.combined_r[j];
+        }
+        dbr[k] += dr_pre[k];
+        dbz[k] += dz_pre[k];
+        dbn[k] += dn_pre[k];
+      }
+      // Signal handed to step t - 1: through the reset/update gate inputs,
+      // through r * h_prev, and through the z * h_prev carry term.
+      dh_next = new Array(hSize).fill(0);
+      for (let k = 0; k < hSize; k++) {
+        for (let j = this.inputSize; j < combSize; j++) {
+          dh_next[j - this.inputSize] += dr_pre[k] * this.resetGate.W[k][j] + dz_pre[k] * this.updateGate.W[k][j];
+        }
+        dh_next[k] += dr_hprev[k] * s.r_a[k];
+        dh_next[k] += dh[k] * s.z_a[k];
+      }
+    }
+    const scale = lr / T;
+    for (let k = 0; k < hSize; k++) {
+      for (let j = 0; j < combSize; j++) {
+        this.resetGate.W[k][j] = this._optimizers.resetW[k][j].step(this.resetGate.W[k][j], dWr[k][j], scale);
+        this.updateGate.W[k][j] = this._optimizers.updateW[k][j].step(this.updateGate.W[k][j], dWz[k][j], scale);
+        this.newGate.W[k][j] = this._optimizers.newW[k][j].step(this.newGate.W[k][j], dWn[k][j], scale);
+      }
+      this.resetGate.b[k] = this._optimizers.resetB[k].step(this.resetGate.b[k], dbr[k], scale);
+      this.updateGate.b[k] = this._optimizers.updateB[k].step(this.updateGate.b[k], dbz[k], scale);
+      this.newGate.b[k] = this._optimizers.newB[k].step(this.newGate.b[k], dbn[k], scale);
+    }
+    this._traj = [];
+  }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: resetGate (W, b), updateGate (W, b), newGate (W, b).
+  getWeightsFlat() {
+    const w = [];
+    for (const gate of [this.resetGate, this.updateGate, this.newGate]) {
+      for (const row of gate.W) w.push(...row);
+      w.push(...gate.b);
+    }
+    return w;
+  }
+  setWeightsFlat(weights) {
+    let idx = 0;
+    for (const gate of [this.resetGate, this.updateGate, this.newGate]) {
+      for (let i = 0; i < gate.W.length; i++)
+        for (let j = 0; j < gate.W[i].length; j++) gate.W[i][j] = weights[idx++];
+      for (let i = 0; i < gate.b.length; i++) gate.b[i] = weights[idx++];
+    }
+  }
+  /**
+   * Structured snapshot of the three gates.
+   * The arrays are deep copies, so editing the snapshot or training the layer
+   * afterwards never changes the other.
+   * @returns {{resetGate: {W: number[][], b: number[]}, updateGate: {W: number[][], b: number[]}, newGate: {W: number[][], b: number[]}}}
+   */
+  getWeights() {
+    const snapshot = (gate) => ({
+      W: gate.W.map((row) => [...row]),
+      b: [...gate.b]
+    });
+    return {
+      resetGate: snapshot(this.resetGate),
+      updateGate: snapshot(this.updateGate),
+      newGate: snapshot(this.newGate)
+    };
+  }
+  /**
+   * Restores the gates from a structure shaped like getWeights() output.
+   * The data is copied, so the layer never shares arrays with the caller
+   * (or with another layer the data came from).
+   * @param {object} data - { resetGate, updateGate, newGate }, each { W, b }.
+   */
+  setWeights(data) {
+    const load = (gate, source) => {
+      gate.W = source.W.map((row) => [...row]);
+      gate.b = [...source.b];
+    };
+    load(this.resetGate, data.resetGate);
+    load(this.updateGate, data.updateGate);
+    load(this.newGate, data.newGate);
+  }
+};
+// src/BatchNorm.ts
+var BatchNorm = class {
+  /**
+   * Per-feature normalisation driven by running statistics that are updated
+   * on every forward() call, followed by a learned scale (gamma) and shift
+   * (beta).
+   * @param {number} dim - number of features.
+   * @param {number} momentum - weight kept on the previous running statistic;
+   *   the incoming sample gets (1 - momentum).
+   *   NOTE(review): with the default 0.1 the running statistics follow the
+   *   newest sample at 90%. Confirm this convention is intended (some
+   *   libraries define momentum the other way round).
+   */
+  constructor(dim, momentum = 0.1) {
+    this._xNorm = null;
+    this._std = null;
+    this.dim = dim;
+    this.momentum = momentum;
+    this.gamma = new Array(dim).fill(1);
+    this.beta = new Array(dim).fill(0);
+    this.runningMean = new Array(dim).fill(0);
+    this.runningVar = new Array(dim).fill(1);
+  }
+  // ── Forward ───────────────────────────────────────────────────────────────
+  /**
+   * Updates the running mean/variance with x, normalises x against them and
+   * applies gamma/beta. Caches the normalised values and the standard
+   * deviations for backward() and trainParams().
+   * @param {number[]} x - vector of length dim.
+   * @returns {number[]} gamma * x_norm + beta.
+   * @throws {Error} when x does not have length dim.
+   */
+  forward(x) {
+    if (x.length !== this.dim) {
+      throw new Error(`BatchNorm.forward: expected array of length ${this.dim}, got ${x.length}`);
+    }
+    const eps = 1e-5;
+    for (let i = 0; i < this.dim; i++) {
+      this.runningMean[i] = this.momentum * this.runningMean[i] + (1 - this.momentum) * x[i];
+      // The variance update uses the deviation from the already-updated mean.
+      const diff = x[i] - this.runningMean[i];
+      this.runningVar[i] = this.momentum * this.runningVar[i] + (1 - this.momentum) * diff * diff;
+    }
+    this._std = this.runningVar.map((v) => Math.sqrt(v + eps));
+    this._xNorm = x.map((v, i) => (v - this.runningMean[i]) / this._std[i]);
+    return this._xNorm.map((xn, i) => this.gamma[i] * xn + this.beta[i]);
+  }
+  // ── Backward ──────────────────────────────────────────────────────────────
+  /**
+   * Maps the output signal back to the input, treating the running
+   * statistics as constants: dIn[i] = dOut[i] * gamma[i] / std[i].
+   * @param {number[]} dOut - signal w.r.t. the output of forward().
+   * @returns {number[]} signal w.r.t. the input of forward().
+   * @throws {Error} when forward() has not been called yet.
+   */
+  backward(dOut) {
+    if (!this._xNorm || !this._std) {
+      throw new Error("BatchNorm.backward: call forward() first");
+    }
+    return dOut.map((d, i) => d * this.gamma[i] / this._std[i]);
+  }
+  // ── Train gamma and beta (call after backward) ────────────────────────────
+  /**
+   * Moves gamma by lr * dOut * x_norm and beta by lr * dOut, using the values
+   * cached by the last forward(). Silently does nothing before the first
+   * forward().
+   * @param {number[]} dOut - signal w.r.t. the output of forward().
+   * @param {number} lr - step size.
+   */
+  trainParams(dOut, lr) {
+    if (!this._xNorm) return;
+    for (let i = 0; i < this.dim; i++) {
+      this.gamma[i] += lr * dOut[i] * this._xNorm[i];
+      this.beta[i] += lr * dOut[i];
+    }
+  }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: gamma, beta. The running statistics are not included.
+  getWeights() {
+    return [...this.gamma, ...this.beta];
+  }
+  setWeights(weights) {
+    for (let i = 0; i < this.dim; i++) this.gamma[i] = weights[i];
+    for (let i = 0; i < this.dim; i++) this.beta[i] = weights[this.dim + i];
+  }
+};
+// src/Conv1D.ts
+var Conv1D = class {
+  /**
+   * 1D convolution over a sequence of inputLength positions with
+   * inputChannels values per position, producing `filters` output rows.
+   * @param {number} inputLength - number of input positions (> 0).
+   * @param {number} kernelSize - kernel width in positions (> 0).
+   * @param {number} filters - number of output filters (> 0).
+   * @param {number} stride - step between kernel placements (positive integer).
+   * @param {string} padding - "same" zero-pads the input; any other value
+   *   applies no padding ("valid").
+   * @param {Function} optimizerFactory - invoked once per kernel weight and
+   *   once per bias; each returned object must expose step(value, grad, lr)
+   *   returning the updated value.
+   * @param {number} inputChannels - values per position (>= 1).
+   * @throws {Error} on non-positive sizes, a kernel wider than the input with
+   *   valid padding, inputChannels < 1, or a stride that is not a positive
+   *   integer.
+   */
+  constructor(inputLength, kernelSize, filters, stride = 1, padding = "valid", optimizerFactory = () => new SGD(), inputChannels = 1) {
+    this._input = null;
+    this._paddedInput = null;
+    if (inputLength <= 0 || kernelSize <= 0 || filters <= 0) {
+      throw new Error("Conv1D: inputLength, kernelSize, and filters must be positive");
+    }
+    if (kernelSize > inputLength && padding === "valid") {
+      throw new Error("Conv1D: kernelSize cannot exceed inputLength with valid padding");
+    }
+    if (inputChannels < 1) {
+      throw new Error("Conv1D: inputChannels must be >= 1");
+    }
+    // A zero, negative or fractional stride would yield an invalid output
+    // length or non-integer indices in forward().
+    if (!Number.isInteger(stride) || stride < 1) {
+      throw new Error("Conv1D: stride must be a positive integer");
+    }
+    this.inputLength = inputLength;
+    this.kernelSize = kernelSize;
+    this.filters = filters;
+    this.stride = stride;
+    this.padding = padding;
+    this.inputChannels = inputChannels;
+    const limit = Math.sqrt(2 / (kernelSize * inputChannels));
+    // kernels[f][k][c]: weight of filter f at kernel offset k for channel c.
+    this.kernels = Array.from(
+      { length: filters },
+      () => Array.from(
+        { length: kernelSize },
+        () => Array.from({ length: inputChannels }, () => (Math.random() * 2 - 1) * limit)
+      )
+    );
+    this.biases = new Array(filters).fill(0);
+    this._kOpts = Array.from(
+      { length: filters },
+      () => Array.from(
+        { length: kernelSize },
+        () => Array.from({ length: inputChannels }, () => optimizerFactory())
+      )
+    );
+    this._bOpts = Array.from({ length: filters }, () => optimizerFactory());
+  }
+  // ── Forward ───────────────────────────────────────────────────────────────
+  // Accepts either number[] (when inputChannels=1) or number[][] (multi-channel).
+  /**
+   * @param {number[]|number[][]} input - inputLength values, or inputLength
+   *   rows of inputChannels values.
+   * @returns {number[][]} output[f][pos], one row per filter.
+   * @throws {Error} when the input shape does not match the configuration.
+   */
+  forward(input) {
+    const input2D = this._normalizeInput(input);
+    // Private copy: later caller-side mutation cannot corrupt backward().
+    this._input = input2D.map((row) => [...row]);
+    let padded = this._input;
+    if (this.padding === "same") {
+      const { left, right } = this._samePadding();
+      const zeroRows = (count) => Array.from(
+        { length: count },
+        () => new Array(this.inputChannels).fill(0)
+      );
+      padded = [...zeroRows(left), ...this._input, ...zeroRows(right)];
+    }
+    this._paddedInput = padded;
+    const outputLength = Math.floor((padded.length - this.kernelSize) / this.stride) + 1;
+    const output = Array.from(
+      { length: this.filters },
+      () => new Array(outputLength).fill(0)
+    );
+    for (let f = 0; f < this.filters; f++) {
+      for (let pos = 0; pos < outputLength; pos++) {
+        const start = pos * this.stride;
+        let sum = this.biases[f];
+        for (let k = 0; k < this.kernelSize; k++) {
+          for (let c = 0; c < this.inputChannels; c++) {
+            sum += this.kernels[f][k][c] * padded[start + k][c];
+          }
+        }
+        output[f][pos] = sum;
+      }
+    }
+    return output;
+  }
+  // ── Backward ──────────────────────────────────────────────────────────────
+  /**
+   * Accumulates kernel/bias gradients from dOut, applies one optimizer step
+   * per parameter, and returns the signal w.r.t. the (unpadded) input.
+   * The input signal is computed with the kernel values from before the step.
+   * @param {number[][]} dOut - dOut[f][pos], shaped like forward()'s output.
+   * @param {number} lr - learning rate handed to every optimizer step.
+   * @returns {number[][]} inputLength rows of inputChannels values.
+   * @throws {Error} when forward() has not been called yet.
+   */
+  backward(dOut, lr = 1e-3) {
+    if (!this._paddedInput || !this._input) {
+      throw new Error("Conv1D.backward: call forward() first");
+    }
+    const padded = this._paddedInput;
+    const outputLength = dOut[0].length;
+    const dKernels = Array.from(
+      { length: this.filters },
+      () => Array.from(
+        { length: this.kernelSize },
+        () => new Array(this.inputChannels).fill(0)
+      )
+    );
+    const dBiases = new Array(this.filters).fill(0);
+    const dPadded = padded.map(() => new Array(this.inputChannels).fill(0));
+    for (let f = 0; f < this.filters; f++) {
+      for (let pos = 0; pos < outputLength; pos++) {
+        const start = pos * this.stride;
+        dBiases[f] += dOut[f][pos];
+        for (let k = 0; k < this.kernelSize; k++) {
+          for (let c = 0; c < this.inputChannels; c++) {
+            dKernels[f][k][c] += dOut[f][pos] * padded[start + k][c];
+            dPadded[start + k][c] += dOut[f][pos] * this.kernels[f][k][c];
+          }
+        }
+      }
+    }
+    for (let f = 0; f < this.filters; f++) {
+      for (let k = 0; k < this.kernelSize; k++) {
+        for (let c = 0; c < this.inputChannels; c++) {
+          this.kernels[f][k][c] = this._kOpts[f][k][c].step(this.kernels[f][k][c], dKernels[f][k][c], lr);
+        }
+      }
+      this.biases[f] = this._bOpts[f].step(this.biases[f], dBiases[f], lr);
+    }
+    // Drop the rows that correspond to zero padding.
+    const offset = this.padding === "same" ? this._samePadding().left : 0;
+    return dPadded.slice(offset, offset + this.inputLength);
+  }
+  // ── Output length ─────────────────────────────────────────────────────────
+  /** @returns {number} number of positions in each output row of forward(). */
+  getOutputLength() {
+    if (this.padding === "same") {
+      return Math.ceil(this.inputLength / this.stride);
+    }
+    return Math.floor((this.inputLength - this.kernelSize) / this.stride) + 1;
+  }
+  // ── Flat weight serialization ─────────────────────────────────────────────
+  // Order: kernels (flattened), biases.
+  getWeights() {
+    const w = [];
+    for (const kernel of this.kernels)
+      for (const k of kernel)
+        for (const c of k)
+          w.push(c);
+    w.push(...this.biases);
+    return w;
+  }
+  setWeights(weights) {
+    let idx = 0;
+    for (let f = 0; f < this.filters; f++)
+      for (let k = 0; k < this.kernelSize; k++)
+        for (let c = 0; c < this.inputChannels; c++)
+          this.kernels[f][k][c] = weights[idx++];
+    for (let f = 0; f < this.filters; f++)
+      this.biases[f] = weights[idx++];
+  }
+  // ── "same" padding amounts ───────────────────────────────────────────────
+  // Total padding is kernelSize - 1 so the output length equals
+  // ceil(inputLength / stride), matching getOutputLength(). For an even
+  // kernelSize the extra zero row goes on the right.
+  _samePadding() {
+    const total = this.kernelSize - 1;
+    const left = Math.floor(total / 2);
+    return { left, right: total - left };
+  }
+  // ── Normalize input to 2D format ─────────────────────────────────────────
+  // Returns inputLength rows of inputChannels values; a flat number[] is only
+  // accepted when inputChannels is 1 and is wrapped as single-value rows.
+  _normalizeInput(input) {
+    if (input.length === 0) {
+      throw new Error("Conv1D.forward: input cannot be empty");
+    }
+    if (typeof input[0] === "number") {
+      if (this.inputChannels !== 1) {
+        throw new Error(`Conv1D.forward: expected 2D input with ${this.inputChannels} channels, got 1D`);
+      }
+      const input1D = input;
+      if (input1D.length !== this.inputLength) {
+        throw new Error(`Conv1D.forward: expected input of length ${this.inputLength}, got ${input1D.length}`);
+      }
+      return input1D.map((v) => [v]);
+    }
+    const input2D = input;
+    if (input2D.length !== this.inputLength) {
+      throw new Error(`Conv1D.forward: expected input of length ${this.inputLength}, got ${input2D.length}`);
+    }
+    for (let i = 0; i < input2D.length; i++) {
+      if (input2D[i].length !== this.inputChannels) {
+        throw new Error(`Conv1D.forward: expected ${this.inputChannels} channels at position ${i}, got ${input2D[i].length}`);
+      }
+    }
+    return input2D;
+  }
+};
+// src/Trainer.ts
+var Trainer = class {
+  constructor(network, options = {}) {
+    this._history = [];
+    this._bestLoss = Infinity;
+    this._patienceCounter = 0;
+    this._stopReason = "maxEpochs";
+    this._metrics = [];
+    this.network = network;
+    this.epochs = options.epochs ?? 1e3;
+    this.lrInitial = options.lr ?? 0.1;
+    this.lrDecay = options.lrDecay ?? 1;
+    this.verbose = options.verbose ?? false;
+    this.weightDecay = options.weightDecay ?? 0;
+    this._earlyStopping = options.earlyStopping;
+    this._computeMetrics = options.computeMetrics ?? false;
+    this.clipValue = options.clipValue ?? 0;
+  }
+  // ── Set external validation data (for early stopping) ────────────────────
+  setValidationData(dataset) {
+    if (dataset.inputs.length !== dataset.targets.length) {
+      throw new Error(
+        "Trainer.setValidationData: inputs and targets must have the same length"
+      );
+    }
+    this._validationData = dataset;
+  }
+  // ── Get best validation loss during training ─────────────────────────────
+  getBestLoss() {
+    return this._bestLoss === Infinity ? -1 : this._bestLoss;
+  }
+  // ── Why did training stop? ───────────────────────────────────────────────
+  getStopReason() {
+    return this._stopReason;
+  }
+  // ── Get per-epoch classification metrics ─────────────────────────────────
+  getMetrics() {
+    return [...this._metrics];
+  }
+  // ── Train on dataset ──────────────────────────────────────────────────────
+  train(dataset) {
+    const { inputs, targets } = dataset;
+    if (inputs.length !== targets.length) {
+      throw new Error(
+        "Trainer.train: inputs and targets must have the same length"
+      );
+    }
+    const n = inputs.length;
+    let lr = this.lrInitial;
+    this._history = [];
+    this._bestLoss = Infinity;
+    this._patienceCounter = 0;
+    this._stopReason = "maxEpochs";
+    this._metrics = [];
+    const netExt = this._hasWeights(this.network);
+    if (this.weightDecay > 0 && !netExt) {
+      console.warn(
+        "Trainer: weightDecay requires a network with getWeights/setWeights/predict. Skipping weight decay."
+      );
+    }
+    if (this._earlyStopping && !netExt) {
+      console.warn(
+        "Trainer: earlyStopping requires a network with predict(). Skipping early stopping."
+      );
+    }
+    if (this._computeMetrics && !netExt) {
+      console.warn(
+        "Trainer: computeMetrics requires a network with predict(). Skipping metrics."
+      );
+    }
+    const canDecay = this.weightDecay > 0 && netExt;
+    const canValidate = !!this._earlyStopping && netExt && !!this._validationData;
+    const canMetric = this._computeMetrics && netExt;
+    const isClass = canMetric && this._isClassification(targets);
+    if (canMetric && !isClass) {
+      console.warn(
+        "Trainer: computeMetrics is set but targets do not appear to be one-hot or single-class. Metrics will be skipped."
+      );
+    }
+    for (let epoch = 0; epoch < this.epochs; epoch++) {
+      const indices = Array.from({ length: n }, (_, i) => i);
+      for (let i = n - 1; i > 0; i--) {
+        const j = Math.floor(Math.random() * (i + 1));
+        [indices[i], indices[j]] = [indices[j], indices[i]];
+      }
+      let epochLoss = 0;
+      for (const i of indices) {
+        if (canDecay) {
+          const w = netExt.getWeights();
+          for (let j = 0; j < w.length; j++) {
+            w[j] *= 1 - lr * this.weightDecay;
+          }
+          netExt.setWeights(w);
+        }
+        epochLoss += this.network.train(inputs[i], targets[i], lr);
+      }
+      epochLoss /= n;
+      this._history.push(epochLoss);
+      if (canMetric && isClass) {
+        this._metrics.push(this._computeMetricsArray(netExt, inputs, targets));
+      }
+      if (canValidate && this._validationData) {
+        const valLoss = this._computeLoss(netExt, this._validationData);
+        const minDelta = this._earlyStopping.minDelta;
+        if (valLoss < this._bestLoss - minDelta) {
+          this._bestLoss = valLoss;
+          this._patienceCounter = 0;
+        } else {
+          this._patienceCounter++;
+        }
+        if (this._patienceCounter >= this._earlyStopping.patience) {
+          this._stopReason = "earlyStopping";
+          break;
+        }
+      }
+      lr *= this.lrDecay;
+      if (this.verbose && (epoch + 1) % 100 === 0) {
+        console.log(
+          `Epoch ${epoch + 1}/${this.epochs}, loss: ${epochLoss.toFixed(6)}, lr: ${lr.toFixed(6)}`
+        );
+      }
+    }
+    return this._history;
+  }
+  // ── Get loss history ──────────────────────────────────────────────────────
+  getHistory() {
+    return [...this._history];
+  }
+  // ── Private helpers ───────────────────────────────────────────────────────
+  /** Type guard: does this network support getWeights/setWeights/predict? */
+  _hasWeights(network) {
+    if ("getWeights" in network && "setWeights" in network && "predict" in network && typeof network.getWeights === "function" && typeof network.setWeights === "function" && typeof network.predict === "function") {
+      return network;
+    }
+    return null;
+  }
+  /** Mean squared error on a dataset (used for validation loss). */
+  _computeLoss(net, data) {
+    let totalLoss = 0;
+    for (let i = 0; i < data.inputs.length; i++) {
+      const pred = net.predict(data.inputs[i]);
+      const target = data.targets[i];
+      let sampleLoss = 0;
+      for (let j = 0; j < pred.length; j++) {
+        sampleLoss += (target[j] - pred[j]) ** 2;
+      }
+      totalLoss += sampleLoss / pred.length;
+    }
+    return totalLoss / data.inputs.length;
+  }
+  /** Heuristic: are targets classification-style (one-hot or single-class)? */
+  _isClassification(targets) {
+    if (targets.length === 0) return false;
+    const first = targets[0];
+    if (first.length === 1) return true;
+    for (const t of targets) {
+      let sum = 0;
+      for (const v of t) {
+        sum += v;
+        if (v < -0.01 || v > 0.01 && v < 0.99 && Math.abs(v - 1) > 0.01)
+          return false;
+      }
+      if (Math.abs(sum - 1) > 0.01) return false;
+    }
+    return true;
+  }
+  /** Compute classification metrics from predictions vs targets. */
+  _computeMetricsArray(net, inputs, targets) {
+    const targetLen = targets[0].length;
+    const nClasses = targetLen === 1 ? 2 : targetLen;
+    const confusion = Array.from(
+      { length: nClasses },
+      () => Array(nClasses).fill(0)
+    );
+    for (let i = 0; i < inputs.length; i++) {
+      const pred = net.predict(inputs[i]);
+      const target = targets[i];
+      let predClass;
+      let trueClass;
+      if (targetLen === 1) {
+        trueClass = target[0] >= 0.5 ? 1 : 0;
+        if (pred.length === 1) {
+          predClass = pred[0] >= 0.5 ? 1 : 0;
+        } else {
+          predClass = pred.indexOf(Math.max(...pred));
+        }
+      } else {
+        predClass = pred.indexOf(Math.max(...pred));
+        trueClass = target.indexOf(Math.max(...target));
+      }
+      predClass = Math.max(0, Math.min(nClasses - 1, predClass));
+      trueClass = Math.max(0, Math.min(nClasses - 1, trueClass));
+      confusion[trueClass][predClass]++;
+    }
+    let totalCorrect = 0;
+    let totalSamples = 0;
+    const precisions = [];
+    const recalls = [];
+    for (let c = 0; c < nClasses; c++) {
+      const tp = confusion[c][c];
+      totalCorrect += tp;
+      let colSum = 0;
+      let rowSum = 0;
+      for (let r = 0; r < nClasses; r++) {
+        colSum += confusion[r][c];
+        rowSum += confusion[c][r];
+      }
+      totalSamples += rowSum;
+      precisions.push(colSum > 0 ? tp / colSum : 0);
+      recalls.push(rowSum > 0 ? tp / rowSum : 0);
+    }
+    const accuracy = totalSamples > 0 ? totalCorrect / totalSamples : 0;
+    const macroPrecision = precisions.reduce((a, b) => a + b, 0) / nClasses;
+    const macroRecall = recalls.reduce((a, b) => a + b, 0) / nClasses;
+    const f1 = macroPrecision + macroRecall > 0 ? 2 * macroPrecision * macroRecall / (macroPrecision + macroRecall) : 0;
+    return {
+      accuracy,
+      precision: macroPrecision,
+      recall: macroRecall,
+      f1
+    };
+  }
+};
+// src/DataLoader.ts
+var DataLoader = class _DataLoader {
+  constructor(data, batchSize = 1, validationSplit = 0) {
+    if (data.inputs.length !== data.targets.length) {
+      throw new Error("DataLoader: inputs and targets must have the same length");
+    }
+    if (validationSplit < 0 || validationSplit >= 1) {
+      throw new Error(`DataLoader: validationSplit must be in [0, 1), got ${validationSplit}`);
+    }
+    this.data = data;
+    this.batchSize = batchSize;
+    this._validationSplit = validationSplit;
+    const fullIndices = Array.from({ length: data.inputs.length }, (_, i) => i);
+    for (let i = fullIndices.length - 1; i > 0; i--) {
+      const j = Math.floor(Math.random() * (i + 1));
+      [fullIndices[i], fullIndices[j]] = [fullIndices[j], fullIndices[i]];
+    }
+    if (validationSplit > 0) {
+      const valSize = Math.round(data.inputs.length * validationSplit);
+      const trainSize = data.inputs.length - valSize;
+      this._trainIndices = fullIndices.slice(0, trainSize);
+      this._valIndices = fullIndices.slice(trainSize);
+    } else {
+      this._trainIndices = [...fullIndices];
+      this._valIndices = [];
+    }
+    this._indices = [...this._trainIndices];
+    this._pos = 0;
+  }
+  // ── Shuffle the training data ──────────────────────────────────────────────
+  shuffle() {
+    for (let i = this._trainIndices.length - 1; i > 0; i--) {
+      const j = Math.floor(Math.random() * (i + 1));
+      [this._trainIndices[i], this._trainIndices[j]] = [this._trainIndices[j], this._trainIndices[i]];
+    }
+    this._indices = [...this._trainIndices];
+    this._pos = 0;
+  }
+  // ── Check if more batches are available ───────────────────────────────────
+  hasNext() {
+    return this._pos < this._indices.length;
+  }
+  // ── Get next batch ────────────────────────────────────────────────────────
+  next() {
+    const end = Math.min(this._pos + this.batchSize, this._indices.length);
+    const batchIndices = this._indices.slice(this._pos, end);
+    this._pos = end;
+    return {
+      inputs: batchIndices.map((i) => this.data.inputs[i]),
+      targets: batchIndices.map((i) => this.data.targets[i])
+    };
+  }
+  // ── Reset iteration ───────────────────────────────────────────────────────
+  reset() {
+    this._pos = 0;
+  }
+  // ── Get total number of training samples ───────────────────────────────────
+  get length() {
+    return this._trainIndices.length;
+  }
+  // ── Get validation data as a DataPair ──────────────────────────────────────
+  // Returns the validation samples (inputs + targets) in their shuffled order.
+  // Returns empty arrays if no validation split was configured.
+  getValidationData() {
+    return {
+      inputs: this._valIndices.map((i) => this.data.inputs[i]),
+      targets: this._valIndices.map((i) => this.data.targets[i])
+    };
+  }
+  // ── Get number of validation samples ───────────────────────────────────────
+  get validationLength() {
+    return this._valIndices.length;
+  }
+  // ── Create sequence windows from a time series ────────────────────────────
+  static sequences(data, seqLen, validationSplit = 0) {
+    if (data.length < seqLen + 1) {
+      throw new Error("DataLoader.sequences: data length must be >= seqLen + 1");
+    }
+    const inputs = [];
+    const targets = [];
+    for (let i = 0; i <= data.length - seqLen - 1; i++) {
+      inputs.push(data.slice(i, i + seqLen).flat());
+      targets.push(data[i + seqLen]);
+    }
+    return new _DataLoader({ inputs, targets }, 1, validationSplit);
+  }
+};
+// src/LRScheduler.ts
+var LRScheduler = class {
+  // ── Step Decay ────────────────────────────────────────────────────────────
+  // lr = initialLr * dropRate^floor(epoch / epochsDrop)
+  stepDecay(lr, epoch, dropRate, epochsDrop) {
+    return lr * Math.pow(dropRate, Math.floor(epoch / epochsDrop));
+  }
+  // ── Exponential Decay ─────────────────────────────────────────────────────
+  // lr = initialLr * decayRate^epoch
+  exponentialDecay(lr, epoch, decayRate) {
+    return lr * Math.pow(decayRate, epoch);
+  }
+  // ── Plateau Decay ─────────────────────────────────────────────────────────
+  // If loss hasn't improved for `patience` epochs, multiply lr by `factor`.
+  // Returns the new lr. Call this after each epoch with the current loss.
+  //
+  // Usage:
+  //   let patience_counter = 0
+  //   let best_loss = Infinity
+  //   for (let epoch = 0; epoch < 1000; epoch++) {
+  //     const loss = train(...)
+  //     lr = scheduler.plateauDecay(lr, loss, history, 10, 0.5)
+  //   }
+  plateauDecay(lr, currentLoss, history, patience, factor) {
+    if (history.length < patience) return lr;
+    const recentLosses = history.slice(-patience);
+    const minRecentLoss = Math.min(...recentLosses);
+    if (currentLoss >= minRecentLoss) {
+      return lr * factor;
+    }
+    return lr;
+  }
+  // ── Cosine Annealing ──────────────────────────────────────────────────────
+  // lr = minLr + 0.5 * (maxLr - minLr) * (1 + cos(π * epoch / maxEpochs))
+  cosineAnnealing(lr, epoch, maxEpochs, minLr = 0) {
+    return minLr + 0.5 * (lr - minLr) * (1 + Math.cos(Math.PI * epoch / maxEpochs));
+  }
+};
+// src/ModelSaver.ts
+var ModelSaver = class _ModelSaver {
+  // ── Serialize to JSON string ──────────────────────────────────────────────
+  static toJSON(model) {
+    return JSON.stringify({
+      weights: model.getWeights(),
+      timestamp: Date.now()
+    });
+  }
+  // ── Deserialize from JSON string ──────────────────────────────────────────
+  static fromJSON(model, json) {
+    const data = JSON.parse(json);
+    if (!data.weights || !Array.isArray(data.weights)) {
+      throw new Error("ModelSaver.fromJSON: invalid model data");
+    }
+    model.setWeights(data.weights);
+  }
+  // ── Save to file (requires write function) ────────────────────────────────
+  static saveToFile(model, path, writeFn) {
+    const json = _ModelSaver.toJSON(model);
+    writeFn(path, json);
+  }
+  // ── Load from file (requires read function) ───────────────────────────────
+  static loadFromFile(model, path, readFn) {
+    const json = readFn(path);
+    _ModelSaver.fromJSON(model, json);
+  }
+};
 export {
   Adam,
   AttentionHead,
+  BatchNorm,
+  BiasVector,
+  ClipOptimizer,
+  ClippedOptimizerFactory,
+  Conv1D,
+  DataLoader,
+  Dropout,
   EmbeddingMatrix,
+  GRULayer,
+  LRScheduler,
   LSTMLayer,
   Layer,
   LayerNorm,
+  ModelSaver,
   Momentum,
   MultiHeadAttention,
   Network,
@@ -1244,11 +2625,13 @@ export {
   Neuron,
   NeuronN,
   SGD,
+  Trainer,
   TransformerBlock,
   WeightMatrix,
   crossEntropy,
   crossEntropyDelta,
   crossEntropyDeltaRaw,
+  defaultOptimizer,
   elu,
   leakyRelu,
   linear,
@@ -1262,5 +2645,9 @@ export {
   softmax,
   softmaxBackward,
   tanh,
-  transpose
+  transpose,
+  validate2DArray,
+  validateArray,
+  validateArrayMinLength,
+  validateNumber
 };