@dniskav/neuron 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.js CHANGED
@@ -23,6 +23,7 @@ __export(index_exports, {
23
23
  Adam: () => Adam,
24
24
  AttentionHead: () => AttentionHead,
25
25
  BatchNorm: () => BatchNorm,
26
+ BiasVector: () => BiasVector,
26
27
  ClipOptimizer: () => ClipOptimizer,
27
28
  ClippedOptimizerFactory: () => ClippedOptimizerFactory,
28
29
  Conv1D: () => Conv1D,
@@ -51,6 +52,7 @@ __export(index_exports, {
51
52
  crossEntropy: () => crossEntropy,
52
53
  crossEntropyDelta: () => crossEntropyDelta,
53
54
  crossEntropyDeltaRaw: () => crossEntropyDeltaRaw,
55
+ defaultOptimizer: () => defaultOptimizer,
54
56
  elu: () => elu,
55
57
  leakyRelu: () => leakyRelu,
56
58
  linear: () => linear,
@@ -201,6 +203,7 @@ function makeElu(alpha = 1) {
201
203
  var elu = makeElu(1);
202
204
 
203
205
  // src/optimizers.ts
206
+ var defaultOptimizer = () => new SGD();
204
207
  var SGD = class {
205
208
  step(weight, gradient, lr) {
206
209
  return weight + lr * gradient;
@@ -249,7 +252,6 @@ var Adam = class {
249
252
  };
250
253
 
251
254
  // src/NeuronN.ts
252
- var defaultOptimizer = () => new SGD();
253
255
  var NeuronN = class {
254
256
  constructor(nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
255
257
  const limit = Math.sqrt(1 / nInputs);
@@ -278,9 +280,8 @@ var NeuronN = class {
278
280
  };
279
281
 
280
282
  // src/Layer.ts
281
- var defaultOptimizer2 = () => new SGD();
282
283
  var Layer = class {
283
- constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer2) {
284
+ constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
284
285
  this.neurons = Array.from(
285
286
  { length: nNeurons },
286
287
  () => new NeuronN(nInputs, activation, optimizerFactory)
@@ -300,7 +301,7 @@ var Network = class {
300
301
  predict(inputs) {
301
302
  validateArray(inputs, this.hiddenLayer.neurons[0].weights.length, "Network.predict");
302
303
  const hiddenOut = this.hiddenLayer.predict(inputs);
303
- return this.outputLayer.predict(hiddenOut)[0];
304
+ return this.outputLayer.predict(hiddenOut);
304
305
  }
305
306
  // Trains on a single example. Returns the squared error.
306
307
  train(inputs, target, lr) {
@@ -309,22 +310,17 @@ var Network = class {
309
310
  validateNumber(lr, "Network.train");
310
311
  const hiddenOut = this.hiddenLayer.predict(inputs);
311
312
  const prediction = this.outputLayer.predict(hiddenOut)[0];
312
- const outputError = target - prediction;
313
- const outputDelta = outputError * prediction * (1 - prediction);
314
313
  const outputNeuron = this.outputLayer.neurons[0];
314
+ const outputError = target - prediction;
315
+ const outputDelta = outputError * outputNeuron.activation.dfn(prediction);
315
316
  const hiddenDeltas = this.hiddenLayer.neurons.map((neuron, i) => {
316
- const hiddenOut_i = hiddenOut[i];
317
317
  const hiddenError = outputDelta * outputNeuron.weights[i];
318
- return hiddenError * hiddenOut_i * (1 - hiddenOut_i);
318
+ return hiddenError * neuron.activation.dfn(hiddenOut[i]);
319
319
  });
320
320
  this.hiddenLayer.neurons.forEach((neuron, i) => {
321
- neuron.weights = neuron.weights.map((w, j) => w + lr * hiddenDeltas[i] * inputs[j]);
322
- neuron.bias += lr * hiddenDeltas[i];
321
+ neuron._update(inputs.map((inp) => hiddenDeltas[i] * inp), hiddenDeltas[i], lr);
323
322
  });
324
- outputNeuron.weights = outputNeuron.weights.map(
325
- (w, i) => w + lr * outputDelta * hiddenOut[i]
326
- );
327
- outputNeuron.bias += lr * outputDelta;
323
+ outputNeuron._update(hiddenOut.map((h) => outputDelta * h), outputDelta, lr);
328
324
  return outputError * outputError;
329
325
  }
330
326
  // ── Flat weight serialization ─────────────────────────────────────────────
@@ -394,13 +390,12 @@ var Dropout = class {
394
390
  };
395
391
 
396
392
  // src/NetworkN.ts
397
- var defaultOptimizer3 = () => new SGD();
398
393
  var NetworkN = class {
399
394
  constructor(structure, options = {}) {
400
395
  this.structure = structure;
401
396
  const nLayers = structure.length - 1;
402
397
  const activations = options.activations ?? Array.from({ length: nLayers }, () => sigmoid2);
403
- const optimizer = options.optimizer ?? defaultOptimizer3;
398
+ const optimizer = options.optimizer ?? defaultOptimizer;
404
399
  const dropoutRate = options.dropoutRate ?? 0;
405
400
  if (activations.length !== nLayers) {
406
401
  throw new Error(`Expected ${nLayers} activations, got ${activations.length}`);
@@ -453,73 +448,69 @@ var NetworkN = class {
453
448
  train(inputs, targets, lr) {
454
449
  validateArray(inputs, this.structure[0], "NetworkN.train");
455
450
  validateArray(targets, this.structure[this.structure.length - 1], "NetworkN.train");
456
- const act = [inputs];
457
- for (let i = 0; i < this.layers.length; i++) {
458
- const layerInput = act[act.length - 1];
459
- const layerOutput = this.layers[i].predict(layerInput);
460
- let current;
461
- if (this._shouldResidual(i)) {
462
- if (this.structure[i] === this.structure[i + 1]) {
463
- current = layerOutput.map((v, j) => v + layerInput[j]);
464
- } else {
465
- current = [...layerOutput];
466
- }
467
- } else {
468
- current = [...layerOutput];
469
- }
470
- if (i < this._dropouts.length) {
471
- current = this._dropouts[i].forward(current, true);
472
- }
473
- act.push(current);
474
- }
451
+ const act = this._forwardAll(inputs, true);
475
452
  const pred = act[act.length - 1];
476
453
  const outAct = this.layers[this.layers.length - 1].neurons[0].activation;
477
- let deltas = pred.map((p, i) => (targets[i] - p) * outAct.dfn(p));
478
- for (let l = this.layers.length - 1; l >= 0; l--) {
479
- const layer = this.layers[l];
480
- if (l < this._dropouts.length) {
481
- deltas = this._dropouts[l].backward(deltas);
482
- }
483
- const layerIn = act[l];
484
- const prevAct = l > 0 ? this.layers[l - 1].neurons[0].activation : null;
485
- const prevDeltas = layerIn.map((out, j) => {
486
- const errProp = layer.neurons.reduce((s, n, k) => s + deltas[k] * n.weights[j], 0);
487
- return prevAct ? errProp * prevAct.dfn(out) : errProp;
488
- });
489
- if (this._shouldResidual(l) && this.structure[l] === this.structure[l + 1]) {
490
- for (let j = 0; j < prevDeltas.length; j++) {
491
- prevDeltas[j] += deltas[j];
492
- }
493
- }
494
- layer.neurons.forEach((n, k) => {
495
- n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
496
- });
497
- deltas = prevDeltas;
498
- }
454
+ const deltas = pred.map((p, i) => (targets[i] - p) * outAct.dfn(p));
455
+ this._backpropLayers(act, deltas, lr);
499
456
  return pred.reduce((s, p, i) => s + (targets[i] - p) ** 2, 0) / pred.length;
500
457
  }
501
458
  // Backprop with externally provided output-layer deltas.
502
459
  // Useful for custom loss functions (e.g. physics-based gradients).
503
460
  trainWithDeltas(inputs, outputDeltas, lr) {
461
+ const act = this._forwardAll(inputs, true);
462
+ this._backpropLayers(act, outputDeltas, lr);
463
+ }
464
+ // ── Flat weight serialization ─────────────────────────────────────────────
465
+ // Order: layer 0 (all neurons), layer 1, ..., layer N.
466
+ getWeights() {
467
+ for (const d of this._dropouts) d.resetMask();
468
+ const w = [];
469
+ for (const layer of this.layers) {
470
+ for (const n of layer.neurons) {
471
+ w.push(...n.weights, n.bias);
472
+ }
473
+ }
474
+ return w;
475
+ }
476
+ setWeights(weights) {
477
+ for (const d of this._dropouts) d.resetMask();
478
+ let idx = 0;
479
+ for (const layer of this.layers) {
480
+ for (const n of layer.neurons) {
481
+ for (let j = 0; j < n.weights.length; j++) n.weights[j] = weights[idx++];
482
+ n.bias = weights[idx++];
483
+ }
484
+ }
485
+ }
486
+ // ── Private helpers ──────────────────────────────────────────────────────
487
+ _shouldResidual(layerIndex) {
488
+ if (typeof this._residual === "function") return this._residual(layerIndex);
489
+ return this._residual;
490
+ }
491
+ // Forward pass storing activations at every layer boundary.
492
+ // Used by train() and trainWithDeltas(); predict() shares the same logic.
493
+ _forwardAll(inputs, training) {
504
494
  const act = [inputs];
505
495
  for (let i = 0; i < this.layers.length; i++) {
506
496
  const layerInput = act[act.length - 1];
507
497
  const layerOutput = this.layers[i].predict(layerInput);
508
498
  let current;
509
- if (this._shouldResidual(i)) {
510
- if (this.structure[i] === this.structure[i + 1]) {
511
- current = layerOutput.map((v, j) => v + layerInput[j]);
512
- } else {
513
- current = [...layerOutput];
514
- }
499
+ if (this._shouldResidual(i) && this.structure[i] === this.structure[i + 1]) {
500
+ current = layerOutput.map((v, j) => v + layerInput[j]);
515
501
  } else {
516
- current = [...layerOutput];
502
+ current = layerOutput;
517
503
  }
518
504
  if (i < this._dropouts.length) {
519
- current = this._dropouts[i].forward(current, true);
505
+ current = this._dropouts[i].forward(current, training);
520
506
  }
521
507
  act.push(current);
522
508
  }
509
+ return act;
510
+ }
511
+ // Backward pass: updates all layer weights given the pre-computed activations
512
+ // and the initial output-layer deltas.
513
+ _backpropLayers(act, outputDeltas, lr) {
523
514
  let deltas = outputDeltas;
524
515
  for (let l = this.layers.length - 1; l >= 0; l--) {
525
516
  const layer = this.layers[l];
@@ -533,9 +524,7 @@ var NetworkN = class {
533
524
  return prevAct ? errProp * prevAct.dfn(out) : errProp;
534
525
  });
535
526
  if (this._shouldResidual(l) && this.structure[l] === this.structure[l + 1]) {
536
- for (let j = 0; j < prevDeltas.length; j++) {
537
- prevDeltas[j] += deltas[j];
538
- }
527
+ for (let j = 0; j < prevDeltas.length; j++) prevDeltas[j] += deltas[j];
539
528
  }
540
529
  layer.neurons.forEach((n, k) => {
541
530
  n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
@@ -543,33 +532,6 @@ var NetworkN = class {
543
532
  deltas = prevDeltas;
544
533
  }
545
534
  }
546
- // ── Flat weight serialization ─────────────────────────────────────────────
547
- // Order: layer 0 (all neurons), layer 1, ..., layer N.
548
- getWeights() {
549
- for (const d of this._dropouts) d.resetMask();
550
- const w = [];
551
- for (const layer of this.layers) {
552
- for (const n of layer.neurons) {
553
- w.push(...n.weights, n.bias);
554
- }
555
- }
556
- return w;
557
- }
558
- setWeights(weights) {
559
- for (const d of this._dropouts) d.resetMask();
560
- let idx = 0;
561
- for (const layer of this.layers) {
562
- for (const n of layer.neurons) {
563
- for (let j = 0; j < n.weights.length; j++) n.weights[j] = weights[idx++];
564
- n.bias = weights[idx++];
565
- }
566
- }
567
- }
568
- // ── Helper ───────────────────────────────────────────────────────────────
569
- _shouldResidual(layerIndex) {
570
- if (typeof this._residual === "function") return this._residual(layerIndex);
571
- return this._residual;
572
- }
573
535
  };
574
536
 
575
537
  // src/LSTMLayer.ts
@@ -584,7 +546,7 @@ var Gate = class {
584
546
  // shape: [hSize]
585
547
  constructor(inputSize, hSize, initBias = 0) {
586
548
  const n = inputSize + hSize;
587
- const limit = Math.sqrt(2 / n);
549
+ const limit = Math.sqrt(2 / (n + hSize));
588
550
  this.W = Array.from(
589
551
  { length: hSize },
590
552
  () => Array.from({ length: n }, () => (Math.random() * 2 - 1) * limit)
@@ -783,7 +745,6 @@ var LSTMLayer = class {
783
745
  };
784
746
 
785
747
  // src/NetworkLSTM.ts
786
- var defaultOptimizer4 = () => new SGD();
787
748
  var NetworkLSTM = class {
788
749
  // [T][layer+1][neuron]
789
750
  constructor(inputSize, hiddenSize, denseStructure, options = {}) {
@@ -791,7 +752,7 @@ var NetworkLSTM = class {
791
752
  this.hiddenSize = hiddenSize;
792
753
  this.lstm = new LSTMLayer(inputSize, hiddenSize);
793
754
  const activation = options.denseActivation ?? sigmoid2;
794
- const optimizer = options.optimizer ?? defaultOptimizer4;
755
+ const optimizer = options.optimizer ?? defaultOptimizer;
795
756
  this.denseLayers = [];
796
757
  const sizes = [hiddenSize, ...denseStructure];
797
758
  for (let i = 1; i < sizes.length; i++) {
@@ -978,6 +939,22 @@ var WeightMatrix = class {
978
939
  for (let j = 0; j < this.W[i].length; j++) this.W[i][j] = weights[idx++];
979
940
  }
980
941
  };
942
+ var BiasVector = class {
943
+ constructor(size) {
944
+ this.values = new Array(size).fill(0);
945
+ this.opts = Array.from({ length: size }, () => new Adam());
946
+ }
947
+ update(grad, lr) {
948
+ for (let i = 0; i < this.values.length; i++)
949
+ this.values[i] = this.opts[i].step(this.values[i], grad[i], lr);
950
+ }
951
+ getWeights() {
952
+ return [...this.values];
953
+ }
954
+ setWeights(weights) {
955
+ for (let i = 0; i < this.values.length; i++) this.values[i] = weights[i];
956
+ }
957
+ };
981
958
  var EmbeddingMatrix = class {
982
959
  constructor(vocabSize, d_model) {
983
960
  const limit = Math.sqrt(1 / d_model);
@@ -1063,6 +1040,7 @@ var AttentionHead = class {
1063
1040
  // 5. dWq = dQ^T @ X, dWk = dK^T @ X, dWv = dV^T @ X
1064
1041
  // 6. dX = dQ @ Wq + dK @ Wk + dV @ Wv
1065
1042
  backward(dOut, lr) {
1043
+ if (!this.cache) throw new Error("AttentionHead.backward() called before predict()");
1066
1044
  const { X, Q, K, V, attn } = this.cache;
1067
1045
  const seqLen = X.length;
1068
1046
  const d_model = X[0].length;
@@ -1190,6 +1168,7 @@ var MultiHeadAttention = class {
1190
1168
  // ── Backward ──────────────────────────────────────────────────────────────
1191
1169
  // dOut: seqLen × d_model → dX: seqLen × d_model
1192
1170
  backward(dOut, lr) {
1171
+ if (!this._concat) throw new Error("MultiHeadAttention.backward() called before predict()");
1193
1172
  const seqLen = dOut.length;
1194
1173
  const concatD = this.nHeads * this.d_k;
1195
1174
  const d_model = this.d_model;
@@ -1294,11 +1273,12 @@ var LayerNorm = class {
1294
1273
  backwardOne(dOut, pos, lr) {
1295
1274
  const { x_norm, std } = this._cache[pos];
1296
1275
  const N = dOut.length;
1276
+ const gammaOld = this.gamma.slice();
1297
1277
  for (let i = 0; i < N; i++) {
1298
1278
  this.gamma[i] += lr * dOut[i] * x_norm[i];
1299
1279
  this.beta[i] += lr * dOut[i];
1300
1280
  }
1301
- const D = dOut.map((d, i) => d * this.gamma[i]);
1281
+ const D = dOut.map((d, i) => d * gammaOld[i]);
1302
1282
  const mD = D.reduce((s, v) => s + v, 0) / N;
1303
1283
  const mDxn = D.reduce((s, d, i) => s + d * x_norm[i], 0) / N;
1304
1284
  return D.map((d, i) => (d - mD - x_norm[i] * mDxn) / std);
@@ -1318,6 +1298,7 @@ var LayerNorm = class {
1318
1298
  // src/TransformerBlock.ts
1319
1299
  var TransformerBlock = class {
1320
1300
  constructor({ d_model, nHeads, d_ff, causal = false }) {
1301
+ // d_model
1321
1302
  // Forward caches (needed for backprop)
1322
1303
  this._X = null;
1323
1304
  this._attnOut = null;
@@ -1334,10 +1315,8 @@ var TransformerBlock = class {
1334
1315
  this.norm2 = new LayerNorm(d_model);
1335
1316
  this.ff1 = new WeightMatrix(d_ff, d_model);
1336
1317
  this.ff2 = new WeightMatrix(d_model, d_ff);
1337
- this.b1 = new Array(d_ff).fill(0);
1338
- this.b2 = new Array(d_model).fill(0);
1339
- this.b1Opts = Array.from({ length: d_ff }, () => new Adam());
1340
- this.b2Opts = Array.from({ length: d_model }, () => new Adam());
1318
+ this.b1 = new BiasVector(d_ff);
1319
+ this.b2 = new BiasVector(d_model);
1341
1320
  }
1342
1321
  // ── Forward ───────────────────────────────────────────────────────────────
1343
1322
  // X: seqLen × d_model → out: seqLen × d_model
@@ -1350,11 +1329,11 @@ var TransformerBlock = class {
1350
1329
  return this.norm1.predictOne(added, i);
1351
1330
  });
1352
1331
  const ff1Pre = h1.map(
1353
- (h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1[k]))
1332
+ (h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1.values[k]))
1354
1333
  );
1355
1334
  const ff1Out = ff1Pre.map((pre) => pre.map((v) => Math.max(0, v)));
1356
1335
  const ff2Out = ff1Out.map(
1357
- (h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2[k]))
1336
+ (h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2.values[k]))
1358
1337
  );
1359
1338
  this.norm2.resetCache(seqLen);
1360
1339
  const out = h1.map((h, i) => {
@@ -1372,6 +1351,9 @@ var TransformerBlock = class {
1372
1351
  // ── Backward ──────────────────────────────────────────────────────────────
1373
1352
  // dOut: seqLen × d_model → dX: seqLen × d_model
1374
1353
  backward(dOut, lr) {
1354
+ if (!this._h1 || !this._ff1Out || !this._ff1Pre) {
1355
+ throw new Error("TransformerBlock.backward() called before predict()");
1356
+ }
1375
1357
  const seqLen = dOut.length;
1376
1358
  const d_model = this.d_model;
1377
1359
  const h1 = this._h1;
@@ -1396,8 +1378,7 @@ var TransformerBlock = class {
1396
1378
  (_, m) => dAdded2.reduce((s, da) => s + da[m], 0)
1397
1379
  );
1398
1380
  this.ff2.update(dW2, lr);
1399
- for (let m = 0; m < d_model; m++)
1400
- this.b2[m] = this.b2Opts[m].step(this.b2[m], db2[m], lr);
1381
+ this.b2.update(db2, lr);
1401
1382
  const dFf1Pre = dFf1Out.map(
1402
1383
  (d, i) => d.map((v, k) => ff1Pre[i][k] > 0 ? v : 0)
1403
1384
  );
@@ -1419,8 +1400,7 @@ var TransformerBlock = class {
1419
1400
  (_, k) => dFf1Pre.reduce((s, dp) => s + dp[k], 0)
1420
1401
  );
1421
1402
  this.ff1.update(dW1, lr);
1422
- for (let k = 0; k < this.d_ff; k++)
1423
- this.b1[k] = this.b1Opts[k].step(this.b1[k], db1[k], lr);
1403
+ this.b1.update(db1, lr);
1424
1404
  const dH1 = Array.from(
1425
1405
  { length: seqLen },
1426
1406
  (_, i) => dH1_fromFf[i].map((v, m) => v + dAdded2[i][m])
@@ -1449,9 +1429,9 @@ var TransformerBlock = class {
1449
1429
  w.push(...this.attn.getWeights());
1450
1430
  w.push(...this.norm1.gamma, ...this.norm1.beta);
1451
1431
  for (const row of this.ff1.W) w.push(...row);
1452
- w.push(...this.b1);
1432
+ w.push(...this.b1.values);
1453
1433
  for (const row of this.ff2.W) w.push(...row);
1454
- w.push(...this.b2);
1434
+ w.push(...this.b2.values);
1455
1435
  w.push(...this.norm2.gamma, ...this.norm2.beta);
1456
1436
  return w;
1457
1437
  }
@@ -1460,16 +1440,17 @@ var TransformerBlock = class {
1460
1440
  const attnLen = this.attn.getWeights().length;
1461
1441
  this.attn.setWeights(weights.slice(idx, idx + attnLen));
1462
1442
  idx += attnLen;
1463
- for (let i = 0; i < this.norm1.gamma.length; i++) this.norm1.gamma[i] = weights[idx++];
1464
- for (let i = 0; i < this.norm1.beta.length; i++) this.norm1.beta[i] = weights[idx++];
1465
- for (let i = 0; i < this.ff1.W.length; i++)
1466
- for (let j = 0; j < this.ff1.W[i].length; j++) this.ff1.W[i][j] = weights[idx++];
1467
- for (let i = 0; i < this.b1.length; i++) this.b1[i] = weights[idx++];
1468
- for (let i = 0; i < this.ff2.W.length; i++)
1469
- for (let j = 0; j < this.ff2.W[i].length; j++) this.ff2.W[i][j] = weights[idx++];
1470
- for (let i = 0; i < this.b2.length; i++) this.b2[i] = weights[idx++];
1471
- for (let i = 0; i < this.norm2.gamma.length; i++) this.norm2.gamma[i] = weights[idx++];
1472
- for (let i = 0; i < this.norm2.beta.length; i++) this.norm2.beta[i] = weights[idx++];
1443
+ this.norm1.setWeights(weights.slice(idx, idx + this.norm1.getWeights().length));
1444
+ idx += this.norm1.getWeights().length;
1445
+ this.ff1.setWeights(weights.slice(idx, idx + this.ff1.getWeights().length));
1446
+ idx += this.ff1.getWeights().length;
1447
+ this.b1.setWeights(weights.slice(idx, idx + this.b1.values.length));
1448
+ idx += this.b1.values.length;
1449
+ this.ff2.setWeights(weights.slice(idx, idx + this.ff2.getWeights().length));
1450
+ idx += this.ff2.getWeights().length;
1451
+ this.b2.setWeights(weights.slice(idx, idx + this.b2.values.length));
1452
+ idx += this.b2.values.length;
1453
+ this.norm2.setWeights(weights.slice(idx, idx + this.norm2.getWeights().length));
1473
1454
  }
1474
1455
  };
1475
1456
 
@@ -1495,8 +1476,7 @@ var NetworkTransformer = class {
1495
1476
  () => new TransformerBlock({ d_model, nHeads, d_ff })
1496
1477
  );
1497
1478
  this.outputProj = new WeightMatrix(nClasses, d_model);
1498
- this.outputBias = new Array(nClasses).fill(0);
1499
- this.outBiasOpts = Array.from({ length: nClasses }, () => new Adam());
1479
+ this.outputBias = new BiasVector(nClasses);
1500
1480
  }
1501
1481
  // ── Forward pass ──────────────────────────────────────────────────────────
1502
1482
  // tokens: seqLen integer ids → seqLen * nClasses logits (flattened)
@@ -1504,7 +1484,7 @@ var NetworkTransformer = class {
1504
1484
  const h = this._forward(tokens);
1505
1485
  return h.flatMap(
1506
1486
  (hi) => this.outputProj.W.map(
1507
- (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
1487
+ (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
1508
1488
  )
1509
1489
  );
1510
1490
  }
@@ -1518,7 +1498,7 @@ var NetworkTransformer = class {
1518
1498
  const h = this._forward(tokens);
1519
1499
  const logits = h.map(
1520
1500
  (hi) => this.outputProj.W.map(
1521
- (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
1501
+ (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
1522
1502
  )
1523
1503
  );
1524
1504
  let loss = 0;
@@ -1553,8 +1533,7 @@ var NetworkTransformer = class {
1553
1533
  (_, c) => dLogits.reduce((s, dl) => s + dl[c], 0)
1554
1534
  );
1555
1535
  this.outputProj.update(dWout, lr);
1556
- for (let c = 0; c < this.nClasses; c++)
1557
- this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
1536
+ this.outputBias.update(dBout, lr);
1558
1537
  let dX = dH;
1559
1538
  for (let b = this.blocks.length - 1; b >= 0; b--)
1560
1539
  dX = this.blocks[b].backward(dX, lr);
@@ -1573,27 +1552,30 @@ var NetworkTransformer = class {
1573
1552
  // Order: tokenEmb, posEmb, block0, block1, ..., blockN, outputProj, outputBias.
1574
1553
  getWeights() {
1575
1554
  const w = [];
1576
- for (const row of this.tokenEmb.W) w.push(...row);
1577
- for (const row of this.posEmb.W) w.push(...row);
1555
+ w.push(...this.tokenEmb.getWeights());
1556
+ w.push(...this.posEmb.getWeights());
1578
1557
  for (const block of this.blocks) w.push(...block.getWeights());
1579
- for (const row of this.outputProj.W) w.push(...row);
1580
- w.push(...this.outputBias);
1558
+ w.push(...this.outputProj.getWeights());
1559
+ w.push(...this.outputBias.getWeights());
1581
1560
  return w;
1582
1561
  }
1583
1562
  setWeights(weights) {
1584
1563
  let idx = 0;
1585
- for (let i = 0; i < this.tokenEmb.W.length; i++)
1586
- for (let j = 0; j < this.tokenEmb.W[i].length; j++) this.tokenEmb.W[i][j] = weights[idx++];
1587
- for (let i = 0; i < this.posEmb.W.length; i++)
1588
- for (let j = 0; j < this.posEmb.W[i].length; j++) this.posEmb.W[i][j] = weights[idx++];
1564
+ const tokenEmbLen = this.tokenEmb.getWeights().length;
1565
+ this.tokenEmb.setWeights(weights.slice(idx, idx + tokenEmbLen));
1566
+ idx += tokenEmbLen;
1567
+ const posEmbLen = this.posEmb.getWeights().length;
1568
+ this.posEmb.setWeights(weights.slice(idx, idx + posEmbLen));
1569
+ idx += posEmbLen;
1589
1570
  for (const block of this.blocks) {
1590
1571
  const blockLen = block.getWeights().length;
1591
1572
  block.setWeights(weights.slice(idx, idx + blockLen));
1592
1573
  idx += blockLen;
1593
1574
  }
1594
- for (let i = 0; i < this.outputProj.W.length; i++)
1595
- for (let j = 0; j < this.outputProj.W[i].length; j++) this.outputProj.W[i][j] = weights[idx++];
1596
- for (let i = 0; i < this.outputBias.length; i++) this.outputBias[i] = weights[idx++];
1575
+ const outProjLen = this.outputProj.getWeights().length;
1576
+ this.outputProj.setWeights(weights.slice(idx, idx + outProjLen));
1577
+ idx += outProjLen;
1578
+ this.outputBias.setWeights(weights.slice(idx, idx + this.outputBias.values.length));
1597
1579
  }
1598
1580
  // ── Internal ──────────────────────────────────────────────────────────────
1599
1581
  // Shared embedding + block forward pass.
@@ -1635,8 +1617,7 @@ var NetworkTransformerRL = class {
1635
1617
  () => new TransformerBlock({ d_model, nHeads, d_ff, causal: true })
1636
1618
  );
1637
1619
  this.outputProj = new WeightMatrix(nActions, d_model);
1638
- this.outputBias = new Array(nActions).fill(0);
1639
- this.outBiasOpts = Array.from({ length: nActions }, () => new Adam());
1620
+ this.outputBias = new BiasVector(nActions);
1640
1621
  }
1641
1622
  // ── Forward ────────────────────────────────────────────────────────────────
1642
1623
  // sequence: seqLen × inputDim → nActions Q-values
@@ -1644,7 +1625,7 @@ var NetworkTransformerRL = class {
1644
1625
  const h = this._forward(sequence);
1645
1626
  const pooled = this._pool(h);
1646
1627
  return this.outputProj.W.map(
1647
- (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
1628
+ (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
1648
1629
  );
1649
1630
  }
1650
1631
  // ── Training ────────────────────────────────────────────────────────────────
@@ -1656,7 +1637,7 @@ var NetworkTransformerRL = class {
1656
1637
  const h = this._forward(sequence);
1657
1638
  const pooled = this._pool(h);
1658
1639
  const pred = this.outputProj.W.map(
1659
- (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
1640
+ (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
1660
1641
  );
1661
1642
  const n = this.nActions;
1662
1643
  let loss = 0;
@@ -1679,8 +1660,7 @@ var NetworkTransformerRL = class {
1679
1660
  );
1680
1661
  const dBout = dPred.slice();
1681
1662
  this.outputProj.update(dWout, lr);
1682
- for (let c = 0; c < this.nActions; c++)
1683
- this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
1663
+ this.outputBias.update(dBout, lr);
1684
1664
  let dH = this._distributePoolGradient(dPooled);
1685
1665
  for (let b = this.blocks.length - 1; b >= 0; b--)
1686
1666
  dH = this.blocks[b].backward(dH, lr);
@@ -1704,24 +1684,26 @@ var NetworkTransformerRL = class {
1704
1684
  // Order: inputProj, block0, block1, ..., blockN, outputProj, outputBias.
1705
1685
  getWeightsFlat() {
1706
1686
  const w = [];
1707
- for (const row of this.inputProj.W) w.push(...row);
1687
+ w.push(...this.inputProj.getWeights());
1708
1688
  for (const block of this.blocks) w.push(...block.getWeights());
1709
- for (const row of this.outputProj.W) w.push(...row);
1710
- w.push(...this.outputBias);
1689
+ w.push(...this.outputProj.getWeights());
1690
+ w.push(...this.outputBias.getWeights());
1711
1691
  return w;
1712
1692
  }
1713
1693
  setWeightsFlat(weights) {
1714
1694
  let idx = 0;
1715
- for (let i = 0; i < this.inputProj.W.length; i++)
1716
- for (let j = 0; j < this.inputProj.W[i].length; j++) this.inputProj.W[i][j] = weights[idx++];
1695
+ const inputProjLen = this.inputProj.getWeights().length;
1696
+ this.inputProj.setWeights(weights.slice(idx, idx + inputProjLen));
1697
+ idx += inputProjLen;
1717
1698
  for (const block of this.blocks) {
1718
1699
  const blockLen = block.getWeights().length;
1719
1700
  block.setWeights(weights.slice(idx, idx + blockLen));
1720
1701
  idx += blockLen;
1721
1702
  }
1722
- for (let i = 0; i < this.outputProj.W.length; i++)
1723
- for (let j = 0; j < this.outputProj.W[i].length; j++) this.outputProj.W[i][j] = weights[idx++];
1724
- for (let i = 0; i < this.outputBias.length; i++) this.outputBias[i] = weights[idx++];
1703
+ const outProjLen = this.outputProj.getWeights().length;
1704
+ this.outputProj.setWeights(weights.slice(idx, idx + outProjLen));
1705
+ idx += outProjLen;
1706
+ this.outputBias.setWeights(weights.slice(idx, idx + this.outputBias.values.length));
1725
1707
  }
1726
1708
  getWeightsStructured() {
1727
1709
  return {
@@ -1739,17 +1721,15 @@ var NetworkTransformerRL = class {
1739
1721
  norm2: { gamma: [...b.norm2.gamma], beta: [...b.norm2.beta] },
1740
1722
  ff1: b.ff1.W.map((r) => [...r]),
1741
1723
  ff2: b.ff2.W.map((r) => [...r]),
1742
- b1: [...b.b1],
1743
- b2: [...b.b2]
1724
+ b1: [...b.b1.values],
1725
+ b2: [...b.b2.values]
1744
1726
  })),
1745
1727
  outputProj: this.outputProj.W.map((r) => [...r]),
1746
- outputBias: [...this.outputBias]
1728
+ outputBias: [...this.outputBias.values]
1747
1729
  };
1748
1730
  }
1749
1731
  setWeightsStructured(data) {
1750
- data.inputProj.forEach((row, i) => {
1751
- this.inputProj.W[i] = [...row];
1752
- });
1732
+ this.inputProj.setWeights(data.inputProj.flat());
1753
1733
  data.blocks.forEach((bd, b) => {
1754
1734
  const blk = this.blocks[b];
1755
1735
  bd.attn.heads.forEach((hd, h) => {
@@ -1764,11 +1744,11 @@ var NetworkTransformerRL = class {
1764
1744
  blk.norm2.beta = [...bd.norm2.beta];
1765
1745
  blk.ff1.W = bd.ff1.map((r) => [...r]);
1766
1746
  blk.ff2.W = bd.ff2.map((r) => [...r]);
1767
- blk.b1 = [...bd.b1];
1768
- blk.b2 = [...bd.b2];
1747
+ blk.b1.setWeights(bd.b1);
1748
+ blk.b2.setWeights(bd.b2);
1769
1749
  });
1770
1750
  this.outputProj.W = data.outputProj.map((r) => [...r]);
1771
- this.outputBias = [...data.outputBias];
1751
+ this.outputBias.setWeights(data.outputBias);
1772
1752
  }
1773
1753
  // ── Serializable interface (flat array) ────────────────────────────────────
1774
1754
  // These satisfy the Serializable interface from ModelSaver, which requires
@@ -1927,7 +1907,7 @@ function tanhFn(x) {
1927
1907
  var Gate2 = class {
1928
1908
  constructor(inputSize, hSize, initBias = 0) {
1929
1909
  const n = inputSize + hSize;
1930
- const limit = Math.sqrt(2 / n);
1910
+ const limit = Math.sqrt(2 / (n + hSize));
1931
1911
  this.W = Array.from(
1932
1912
  { length: hSize },
1933
1913
  () => Array.from({ length: n }, () => (Math.random() * 2 - 1) * limit)
@@ -2699,6 +2679,7 @@ var ModelSaver = class _ModelSaver {
2699
2679
  Adam,
2700
2680
  AttentionHead,
2701
2681
  BatchNorm,
2682
+ BiasVector,
2702
2683
  ClipOptimizer,
2703
2684
  ClippedOptimizerFactory,
2704
2685
  Conv1D,
@@ -2727,6 +2708,7 @@ var ModelSaver = class _ModelSaver {
2727
2708
  crossEntropy,
2728
2709
  crossEntropyDelta,
2729
2710
  crossEntropyDeltaRaw,
2711
+ defaultOptimizer,
2730
2712
  elu,
2731
2713
  leakyRelu,
2732
2714
  linear,