@dniskav/neuron 0.2.5 → 0.2.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/dist/index.mjs CHANGED
@@ -127,6 +127,7 @@ function makeElu(alpha = 1) {
127
127
  var elu = makeElu(1);
128
128
 
129
129
  // src/optimizers.ts
130
+ var defaultOptimizer = () => new SGD();
130
131
  var SGD = class {
131
132
  step(weight, gradient, lr) {
132
133
  return weight + lr * gradient;
@@ -175,7 +176,6 @@ var Adam = class {
175
176
  };
176
177
 
177
178
  // src/NeuronN.ts
178
- var defaultOptimizer = () => new SGD();
179
179
  var NeuronN = class {
180
180
  constructor(nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
181
181
  const limit = Math.sqrt(1 / nInputs);
@@ -204,9 +204,8 @@ var NeuronN = class {
204
204
  };
205
205
 
206
206
  // src/Layer.ts
207
- var defaultOptimizer2 = () => new SGD();
208
207
  var Layer = class {
209
- constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer2) {
208
+ constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
210
209
  this.neurons = Array.from(
211
210
  { length: nNeurons },
212
211
  () => new NeuronN(nInputs, activation, optimizerFactory)
@@ -226,7 +225,7 @@ var Network = class {
226
225
  predict(inputs) {
227
226
  validateArray(inputs, this.hiddenLayer.neurons[0].weights.length, "Network.predict");
228
227
  const hiddenOut = this.hiddenLayer.predict(inputs);
229
- return this.outputLayer.predict(hiddenOut)[0];
228
+ return this.outputLayer.predict(hiddenOut);
230
229
  }
231
230
  // Trains on a single example. Returns the squared error.
232
231
  train(inputs, target, lr) {
@@ -235,22 +234,17 @@ var Network = class {
235
234
  validateNumber(lr, "Network.train");
236
235
  const hiddenOut = this.hiddenLayer.predict(inputs);
237
236
  const prediction = this.outputLayer.predict(hiddenOut)[0];
238
- const outputError = target - prediction;
239
- const outputDelta = outputError * prediction * (1 - prediction);
240
237
  const outputNeuron = this.outputLayer.neurons[0];
238
+ const outputError = target - prediction;
239
+ const outputDelta = outputError * outputNeuron.activation.dfn(prediction);
241
240
  const hiddenDeltas = this.hiddenLayer.neurons.map((neuron, i) => {
242
- const hiddenOut_i = hiddenOut[i];
243
241
  const hiddenError = outputDelta * outputNeuron.weights[i];
244
- return hiddenError * hiddenOut_i * (1 - hiddenOut_i);
242
+ return hiddenError * neuron.activation.dfn(hiddenOut[i]);
245
243
  });
246
244
  this.hiddenLayer.neurons.forEach((neuron, i) => {
247
- neuron.weights = neuron.weights.map((w, j) => w + lr * hiddenDeltas[i] * inputs[j]);
248
- neuron.bias += lr * hiddenDeltas[i];
245
+ neuron._update(inputs.map((inp) => hiddenDeltas[i] * inp), hiddenDeltas[i], lr);
249
246
  });
250
- outputNeuron.weights = outputNeuron.weights.map(
251
- (w, i) => w + lr * outputDelta * hiddenOut[i]
252
- );
253
- outputNeuron.bias += lr * outputDelta;
247
+ outputNeuron._update(hiddenOut.map((h) => outputDelta * h), outputDelta, lr);
254
248
  return outputError * outputError;
255
249
  }
256
250
  // ── Flat weight serialization ─────────────────────────────────────────────
@@ -320,13 +314,12 @@ var Dropout = class {
320
314
  };
321
315
 
322
316
  // src/NetworkN.ts
323
- var defaultOptimizer3 = () => new SGD();
324
317
  var NetworkN = class {
325
318
  constructor(structure, options = {}) {
326
319
  this.structure = structure;
327
320
  const nLayers = structure.length - 1;
328
321
  const activations = options.activations ?? Array.from({ length: nLayers }, () => sigmoid2);
329
- const optimizer = options.optimizer ?? defaultOptimizer3;
322
+ const optimizer = options.optimizer ?? defaultOptimizer;
330
323
  const dropoutRate = options.dropoutRate ?? 0;
331
324
  if (activations.length !== nLayers) {
332
325
  throw new Error(`Expected ${nLayers} activations, got ${activations.length}`);
@@ -379,73 +372,69 @@ var NetworkN = class {
379
372
  train(inputs, targets, lr) {
380
373
  validateArray(inputs, this.structure[0], "NetworkN.train");
381
374
  validateArray(targets, this.structure[this.structure.length - 1], "NetworkN.train");
382
- const act = [inputs];
383
- for (let i = 0; i < this.layers.length; i++) {
384
- const layerInput = act[act.length - 1];
385
- const layerOutput = this.layers[i].predict(layerInput);
386
- let current;
387
- if (this._shouldResidual(i)) {
388
- if (this.structure[i] === this.structure[i + 1]) {
389
- current = layerOutput.map((v, j) => v + layerInput[j]);
390
- } else {
391
- current = [...layerOutput];
392
- }
393
- } else {
394
- current = [...layerOutput];
395
- }
396
- if (i < this._dropouts.length) {
397
- current = this._dropouts[i].forward(current, true);
398
- }
399
- act.push(current);
400
- }
375
+ const act = this._forwardAll(inputs, true);
401
376
  const pred = act[act.length - 1];
402
377
  const outAct = this.layers[this.layers.length - 1].neurons[0].activation;
403
- let deltas = pred.map((p, i) => (targets[i] - p) * outAct.dfn(p));
404
- for (let l = this.layers.length - 1; l >= 0; l--) {
405
- const layer = this.layers[l];
406
- if (l < this._dropouts.length) {
407
- deltas = this._dropouts[l].backward(deltas);
408
- }
409
- const layerIn = act[l];
410
- const prevAct = l > 0 ? this.layers[l - 1].neurons[0].activation : null;
411
- const prevDeltas = layerIn.map((out, j) => {
412
- const errProp = layer.neurons.reduce((s, n, k) => s + deltas[k] * n.weights[j], 0);
413
- return prevAct ? errProp * prevAct.dfn(out) : errProp;
414
- });
415
- if (this._shouldResidual(l) && this.structure[l] === this.structure[l + 1]) {
416
- for (let j = 0; j < prevDeltas.length; j++) {
417
- prevDeltas[j] += deltas[j];
418
- }
419
- }
420
- layer.neurons.forEach((n, k) => {
421
- n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
422
- });
423
- deltas = prevDeltas;
424
- }
378
+ const deltas = pred.map((p, i) => (targets[i] - p) * outAct.dfn(p));
379
+ this._backpropLayers(act, deltas, lr);
425
380
  return pred.reduce((s, p, i) => s + (targets[i] - p) ** 2, 0) / pred.length;
426
381
  }
427
382
  // Backprop with externally provided output-layer deltas.
428
383
  // Useful for custom loss functions (e.g. physics-based gradients).
429
384
  trainWithDeltas(inputs, outputDeltas, lr) {
385
+ const act = this._forwardAll(inputs, true);
386
+ this._backpropLayers(act, outputDeltas, lr);
387
+ }
388
+ // ── Flat weight serialization ─────────────────────────────────────────────
389
+ // Order: layer 0 (all neurons), layer 1, ..., layer N.
390
+ getWeights() {
391
+ for (const d of this._dropouts) d.resetMask();
392
+ const w = [];
393
+ for (const layer of this.layers) {
394
+ for (const n of layer.neurons) {
395
+ w.push(...n.weights, n.bias);
396
+ }
397
+ }
398
+ return w;
399
+ }
400
+ setWeights(weights) {
401
+ for (const d of this._dropouts) d.resetMask();
402
+ let idx = 0;
403
+ for (const layer of this.layers) {
404
+ for (const n of layer.neurons) {
405
+ for (let j = 0; j < n.weights.length; j++) n.weights[j] = weights[idx++];
406
+ n.bias = weights[idx++];
407
+ }
408
+ }
409
+ }
410
+ // ── Private helpers ──────────────────────────────────────────────────────
411
+ _shouldResidual(layerIndex) {
412
+ if (typeof this._residual === "function") return this._residual(layerIndex);
413
+ return this._residual;
414
+ }
415
+ // Forward pass storing activations at every layer boundary.
416
+ // Used by train() and trainWithDeltas(); predict() shares the same logic.
417
+ _forwardAll(inputs, training) {
430
418
  const act = [inputs];
431
419
  for (let i = 0; i < this.layers.length; i++) {
432
420
  const layerInput = act[act.length - 1];
433
421
  const layerOutput = this.layers[i].predict(layerInput);
434
422
  let current;
435
- if (this._shouldResidual(i)) {
436
- if (this.structure[i] === this.structure[i + 1]) {
437
- current = layerOutput.map((v, j) => v + layerInput[j]);
438
- } else {
439
- current = [...layerOutput];
440
- }
423
+ if (this._shouldResidual(i) && this.structure[i] === this.structure[i + 1]) {
424
+ current = layerOutput.map((v, j) => v + layerInput[j]);
441
425
  } else {
442
- current = [...layerOutput];
426
+ current = layerOutput;
443
427
  }
444
428
  if (i < this._dropouts.length) {
445
- current = this._dropouts[i].forward(current, true);
429
+ current = this._dropouts[i].forward(current, training);
446
430
  }
447
431
  act.push(current);
448
432
  }
433
+ return act;
434
+ }
435
+ // Backward pass: updates all layer weights given the pre-computed activations
436
+ // and the initial output-layer deltas.
437
+ _backpropLayers(act, outputDeltas, lr) {
449
438
  let deltas = outputDeltas;
450
439
  for (let l = this.layers.length - 1; l >= 0; l--) {
451
440
  const layer = this.layers[l];
@@ -459,9 +448,7 @@ var NetworkN = class {
459
448
  return prevAct ? errProp * prevAct.dfn(out) : errProp;
460
449
  });
461
450
  if (this._shouldResidual(l) && this.structure[l] === this.structure[l + 1]) {
462
- for (let j = 0; j < prevDeltas.length; j++) {
463
- prevDeltas[j] += deltas[j];
464
- }
451
+ for (let j = 0; j < prevDeltas.length; j++) prevDeltas[j] += deltas[j];
465
452
  }
466
453
  layer.neurons.forEach((n, k) => {
467
454
  n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
@@ -469,33 +456,6 @@ var NetworkN = class {
469
456
  deltas = prevDeltas;
470
457
  }
471
458
  }
472
- // ── Flat weight serialization ─────────────────────────────────────────────
473
- // Order: layer 0 (all neurons), layer 1, ..., layer N.
474
- getWeights() {
475
- for (const d of this._dropouts) d.resetMask();
476
- const w = [];
477
- for (const layer of this.layers) {
478
- for (const n of layer.neurons) {
479
- w.push(...n.weights, n.bias);
480
- }
481
- }
482
- return w;
483
- }
484
- setWeights(weights) {
485
- for (const d of this._dropouts) d.resetMask();
486
- let idx = 0;
487
- for (const layer of this.layers) {
488
- for (const n of layer.neurons) {
489
- for (let j = 0; j < n.weights.length; j++) n.weights[j] = weights[idx++];
490
- n.bias = weights[idx++];
491
- }
492
- }
493
- }
494
- // ── Helper ───────────────────────────────────────────────────────────────
495
- _shouldResidual(layerIndex) {
496
- if (typeof this._residual === "function") return this._residual(layerIndex);
497
- return this._residual;
498
- }
499
459
  };
500
460
 
501
461
  // src/LSTMLayer.ts
@@ -510,7 +470,7 @@ var Gate = class {
510
470
  // shape: [hSize]
511
471
  constructor(inputSize, hSize, initBias = 0) {
512
472
  const n = inputSize + hSize;
513
- const limit = Math.sqrt(2 / n);
473
+ const limit = Math.sqrt(2 / (n + hSize));
514
474
  this.W = Array.from(
515
475
  { length: hSize },
516
476
  () => Array.from({ length: n }, () => (Math.random() * 2 - 1) * limit)
@@ -709,7 +669,6 @@ var LSTMLayer = class {
709
669
  };
710
670
 
711
671
  // src/NetworkLSTM.ts
712
- var defaultOptimizer4 = () => new SGD();
713
672
  var NetworkLSTM = class {
714
673
  // [T][layer+1][neuron]
715
674
  constructor(inputSize, hiddenSize, denseStructure, options = {}) {
@@ -717,7 +676,7 @@ var NetworkLSTM = class {
717
676
  this.hiddenSize = hiddenSize;
718
677
  this.lstm = new LSTMLayer(inputSize, hiddenSize);
719
678
  const activation = options.denseActivation ?? sigmoid2;
720
- const optimizer = options.optimizer ?? defaultOptimizer4;
679
+ const optimizer = options.optimizer ?? defaultOptimizer;
721
680
  this.denseLayers = [];
722
681
  const sizes = [hiddenSize, ...denseStructure];
723
682
  for (let i = 1; i < sizes.length; i++) {
@@ -904,6 +863,22 @@ var WeightMatrix = class {
904
863
  for (let j = 0; j < this.W[i].length; j++) this.W[i][j] = weights[idx++];
905
864
  }
906
865
  };
866
+ var BiasVector = class {
867
+ constructor(size) {
868
+ this.values = new Array(size).fill(0);
869
+ this.opts = Array.from({ length: size }, () => new Adam());
870
+ }
871
+ update(grad, lr) {
872
+ for (let i = 0; i < this.values.length; i++)
873
+ this.values[i] = this.opts[i].step(this.values[i], grad[i], lr);
874
+ }
875
+ getWeights() {
876
+ return [...this.values];
877
+ }
878
+ setWeights(weights) {
879
+ for (let i = 0; i < this.values.length; i++) this.values[i] = weights[i];
880
+ }
881
+ };
907
882
  var EmbeddingMatrix = class {
908
883
  constructor(vocabSize, d_model) {
909
884
  const limit = Math.sqrt(1 / d_model);
@@ -989,6 +964,7 @@ var AttentionHead = class {
989
964
  // 5. dWq = dQ^T @ X, dWk = dK^T @ X, dWv = dV^T @ X
990
965
  // 6. dX = dQ @ Wq + dK @ Wk + dV @ Wv
991
966
  backward(dOut, lr) {
967
+ if (!this.cache) throw new Error("AttentionHead.backward() called before predict()");
992
968
  const { X, Q, K, V, attn } = this.cache;
993
969
  const seqLen = X.length;
994
970
  const d_model = X[0].length;
@@ -1116,6 +1092,7 @@ var MultiHeadAttention = class {
1116
1092
  // ── Backward ──────────────────────────────────────────────────────────────
1117
1093
  // dOut: seqLen × d_model → dX: seqLen × d_model
1118
1094
  backward(dOut, lr) {
1095
+ if (!this._concat) throw new Error("MultiHeadAttention.backward() called before predict()");
1119
1096
  const seqLen = dOut.length;
1120
1097
  const concatD = this.nHeads * this.d_k;
1121
1098
  const d_model = this.d_model;
@@ -1220,11 +1197,12 @@ var LayerNorm = class {
1220
1197
  backwardOne(dOut, pos, lr) {
1221
1198
  const { x_norm, std } = this._cache[pos];
1222
1199
  const N = dOut.length;
1200
+ const gammaOld = this.gamma.slice();
1223
1201
  for (let i = 0; i < N; i++) {
1224
1202
  this.gamma[i] += lr * dOut[i] * x_norm[i];
1225
1203
  this.beta[i] += lr * dOut[i];
1226
1204
  }
1227
- const D = dOut.map((d, i) => d * this.gamma[i]);
1205
+ const D = dOut.map((d, i) => d * gammaOld[i]);
1228
1206
  const mD = D.reduce((s, v) => s + v, 0) / N;
1229
1207
  const mDxn = D.reduce((s, d, i) => s + d * x_norm[i], 0) / N;
1230
1208
  return D.map((d, i) => (d - mD - x_norm[i] * mDxn) / std);
@@ -1244,6 +1222,7 @@ var LayerNorm = class {
1244
1222
  // src/TransformerBlock.ts
1245
1223
  var TransformerBlock = class {
1246
1224
  constructor({ d_model, nHeads, d_ff, causal = false }) {
1225
+ // d_model
1247
1226
  // Forward caches (needed for backprop)
1248
1227
  this._X = null;
1249
1228
  this._attnOut = null;
@@ -1260,10 +1239,8 @@ var TransformerBlock = class {
1260
1239
  this.norm2 = new LayerNorm(d_model);
1261
1240
  this.ff1 = new WeightMatrix(d_ff, d_model);
1262
1241
  this.ff2 = new WeightMatrix(d_model, d_ff);
1263
- this.b1 = new Array(d_ff).fill(0);
1264
- this.b2 = new Array(d_model).fill(0);
1265
- this.b1Opts = Array.from({ length: d_ff }, () => new Adam());
1266
- this.b2Opts = Array.from({ length: d_model }, () => new Adam());
1242
+ this.b1 = new BiasVector(d_ff);
1243
+ this.b2 = new BiasVector(d_model);
1267
1244
  }
1268
1245
  // ── Forward ───────────────────────────────────────────────────────────────
1269
1246
  // X: seqLen × d_model → out: seqLen × d_model
@@ -1276,11 +1253,11 @@ var TransformerBlock = class {
1276
1253
  return this.norm1.predictOne(added, i);
1277
1254
  });
1278
1255
  const ff1Pre = h1.map(
1279
- (h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1[k]))
1256
+ (h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1.values[k]))
1280
1257
  );
1281
1258
  const ff1Out = ff1Pre.map((pre) => pre.map((v) => Math.max(0, v)));
1282
1259
  const ff2Out = ff1Out.map(
1283
- (h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2[k]))
1260
+ (h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2.values[k]))
1284
1261
  );
1285
1262
  this.norm2.resetCache(seqLen);
1286
1263
  const out = h1.map((h, i) => {
@@ -1298,6 +1275,9 @@ var TransformerBlock = class {
1298
1275
  // ── Backward ──────────────────────────────────────────────────────────────
1299
1276
  // dOut: seqLen × d_model → dX: seqLen × d_model
1300
1277
  backward(dOut, lr) {
1278
+ if (!this._h1 || !this._ff1Out || !this._ff1Pre) {
1279
+ throw new Error("TransformerBlock.backward() called before predict()");
1280
+ }
1301
1281
  const seqLen = dOut.length;
1302
1282
  const d_model = this.d_model;
1303
1283
  const h1 = this._h1;
@@ -1322,8 +1302,7 @@ var TransformerBlock = class {
1322
1302
  (_, m) => dAdded2.reduce((s, da) => s + da[m], 0)
1323
1303
  );
1324
1304
  this.ff2.update(dW2, lr);
1325
- for (let m = 0; m < d_model; m++)
1326
- this.b2[m] = this.b2Opts[m].step(this.b2[m], db2[m], lr);
1305
+ this.b2.update(db2, lr);
1327
1306
  const dFf1Pre = dFf1Out.map(
1328
1307
  (d, i) => d.map((v, k) => ff1Pre[i][k] > 0 ? v : 0)
1329
1308
  );
@@ -1345,8 +1324,7 @@ var TransformerBlock = class {
1345
1324
  (_, k) => dFf1Pre.reduce((s, dp) => s + dp[k], 0)
1346
1325
  );
1347
1326
  this.ff1.update(dW1, lr);
1348
- for (let k = 0; k < this.d_ff; k++)
1349
- this.b1[k] = this.b1Opts[k].step(this.b1[k], db1[k], lr);
1327
+ this.b1.update(db1, lr);
1350
1328
  const dH1 = Array.from(
1351
1329
  { length: seqLen },
1352
1330
  (_, i) => dH1_fromFf[i].map((v, m) => v + dAdded2[i][m])
@@ -1375,9 +1353,9 @@ var TransformerBlock = class {
1375
1353
  w.push(...this.attn.getWeights());
1376
1354
  w.push(...this.norm1.gamma, ...this.norm1.beta);
1377
1355
  for (const row of this.ff1.W) w.push(...row);
1378
- w.push(...this.b1);
1356
+ w.push(...this.b1.values);
1379
1357
  for (const row of this.ff2.W) w.push(...row);
1380
- w.push(...this.b2);
1358
+ w.push(...this.b2.values);
1381
1359
  w.push(...this.norm2.gamma, ...this.norm2.beta);
1382
1360
  return w;
1383
1361
  }
@@ -1386,16 +1364,17 @@ var TransformerBlock = class {
1386
1364
  const attnLen = this.attn.getWeights().length;
1387
1365
  this.attn.setWeights(weights.slice(idx, idx + attnLen));
1388
1366
  idx += attnLen;
1389
- for (let i = 0; i < this.norm1.gamma.length; i++) this.norm1.gamma[i] = weights[idx++];
1390
- for (let i = 0; i < this.norm1.beta.length; i++) this.norm1.beta[i] = weights[idx++];
1391
- for (let i = 0; i < this.ff1.W.length; i++)
1392
- for (let j = 0; j < this.ff1.W[i].length; j++) this.ff1.W[i][j] = weights[idx++];
1393
- for (let i = 0; i < this.b1.length; i++) this.b1[i] = weights[idx++];
1394
- for (let i = 0; i < this.ff2.W.length; i++)
1395
- for (let j = 0; j < this.ff2.W[i].length; j++) this.ff2.W[i][j] = weights[idx++];
1396
- for (let i = 0; i < this.b2.length; i++) this.b2[i] = weights[idx++];
1397
- for (let i = 0; i < this.norm2.gamma.length; i++) this.norm2.gamma[i] = weights[idx++];
1398
- for (let i = 0; i < this.norm2.beta.length; i++) this.norm2.beta[i] = weights[idx++];
1367
+ this.norm1.setWeights(weights.slice(idx, idx + this.norm1.getWeights().length));
1368
+ idx += this.norm1.getWeights().length;
1369
+ this.ff1.setWeights(weights.slice(idx, idx + this.ff1.getWeights().length));
1370
+ idx += this.ff1.getWeights().length;
1371
+ this.b1.setWeights(weights.slice(idx, idx + this.b1.values.length));
1372
+ idx += this.b1.values.length;
1373
+ this.ff2.setWeights(weights.slice(idx, idx + this.ff2.getWeights().length));
1374
+ idx += this.ff2.getWeights().length;
1375
+ this.b2.setWeights(weights.slice(idx, idx + this.b2.values.length));
1376
+ idx += this.b2.values.length;
1377
+ this.norm2.setWeights(weights.slice(idx, idx + this.norm2.getWeights().length));
1399
1378
  }
1400
1379
  };
1401
1380
 
@@ -1421,8 +1400,7 @@ var NetworkTransformer = class {
1421
1400
  () => new TransformerBlock({ d_model, nHeads, d_ff })
1422
1401
  );
1423
1402
  this.outputProj = new WeightMatrix(nClasses, d_model);
1424
- this.outputBias = new Array(nClasses).fill(0);
1425
- this.outBiasOpts = Array.from({ length: nClasses }, () => new Adam());
1403
+ this.outputBias = new BiasVector(nClasses);
1426
1404
  }
1427
1405
  // ── Forward pass ──────────────────────────────────────────────────────────
1428
1406
  // tokens: seqLen integer ids → seqLen * nClasses logits (flattened)
@@ -1430,7 +1408,7 @@ var NetworkTransformer = class {
1430
1408
  const h = this._forward(tokens);
1431
1409
  return h.flatMap(
1432
1410
  (hi) => this.outputProj.W.map(
1433
- (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
1411
+ (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
1434
1412
  )
1435
1413
  );
1436
1414
  }
@@ -1444,7 +1422,7 @@ var NetworkTransformer = class {
1444
1422
  const h = this._forward(tokens);
1445
1423
  const logits = h.map(
1446
1424
  (hi) => this.outputProj.W.map(
1447
- (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
1425
+ (row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
1448
1426
  )
1449
1427
  );
1450
1428
  let loss = 0;
@@ -1479,8 +1457,7 @@ var NetworkTransformer = class {
1479
1457
  (_, c) => dLogits.reduce((s, dl) => s + dl[c], 0)
1480
1458
  );
1481
1459
  this.outputProj.update(dWout, lr);
1482
- for (let c = 0; c < this.nClasses; c++)
1483
- this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
1460
+ this.outputBias.update(dBout, lr);
1484
1461
  let dX = dH;
1485
1462
  for (let b = this.blocks.length - 1; b >= 0; b--)
1486
1463
  dX = this.blocks[b].backward(dX, lr);
@@ -1499,27 +1476,30 @@ var NetworkTransformer = class {
1499
1476
  // Order: tokenEmb, posEmb, block0, block1, ..., blockN, outputProj, outputBias.
1500
1477
  getWeights() {
1501
1478
  const w = [];
1502
- for (const row of this.tokenEmb.W) w.push(...row);
1503
- for (const row of this.posEmb.W) w.push(...row);
1479
+ w.push(...this.tokenEmb.getWeights());
1480
+ w.push(...this.posEmb.getWeights());
1504
1481
  for (const block of this.blocks) w.push(...block.getWeights());
1505
- for (const row of this.outputProj.W) w.push(...row);
1506
- w.push(...this.outputBias);
1482
+ w.push(...this.outputProj.getWeights());
1483
+ w.push(...this.outputBias.getWeights());
1507
1484
  return w;
1508
1485
  }
1509
1486
  setWeights(weights) {
1510
1487
  let idx = 0;
1511
- for (let i = 0; i < this.tokenEmb.W.length; i++)
1512
- for (let j = 0; j < this.tokenEmb.W[i].length; j++) this.tokenEmb.W[i][j] = weights[idx++];
1513
- for (let i = 0; i < this.posEmb.W.length; i++)
1514
- for (let j = 0; j < this.posEmb.W[i].length; j++) this.posEmb.W[i][j] = weights[idx++];
1488
+ const tokenEmbLen = this.tokenEmb.getWeights().length;
1489
+ this.tokenEmb.setWeights(weights.slice(idx, idx + tokenEmbLen));
1490
+ idx += tokenEmbLen;
1491
+ const posEmbLen = this.posEmb.getWeights().length;
1492
+ this.posEmb.setWeights(weights.slice(idx, idx + posEmbLen));
1493
+ idx += posEmbLen;
1515
1494
  for (const block of this.blocks) {
1516
1495
  const blockLen = block.getWeights().length;
1517
1496
  block.setWeights(weights.slice(idx, idx + blockLen));
1518
1497
  idx += blockLen;
1519
1498
  }
1520
- for (let i = 0; i < this.outputProj.W.length; i++)
1521
- for (let j = 0; j < this.outputProj.W[i].length; j++) this.outputProj.W[i][j] = weights[idx++];
1522
- for (let i = 0; i < this.outputBias.length; i++) this.outputBias[i] = weights[idx++];
1499
+ const outProjLen = this.outputProj.getWeights().length;
1500
+ this.outputProj.setWeights(weights.slice(idx, idx + outProjLen));
1501
+ idx += outProjLen;
1502
+ this.outputBias.setWeights(weights.slice(idx, idx + this.outputBias.values.length));
1523
1503
  }
1524
1504
  // ── Internal ──────────────────────────────────────────────────────────────
1525
1505
  // Shared embedding + block forward pass.
@@ -1561,8 +1541,7 @@ var NetworkTransformerRL = class {
1561
1541
  () => new TransformerBlock({ d_model, nHeads, d_ff, causal: true })
1562
1542
  );
1563
1543
  this.outputProj = new WeightMatrix(nActions, d_model);
1564
- this.outputBias = new Array(nActions).fill(0);
1565
- this.outBiasOpts = Array.from({ length: nActions }, () => new Adam());
1544
+ this.outputBias = new BiasVector(nActions);
1566
1545
  }
1567
1546
  // ── Forward ────────────────────────────────────────────────────────────────
1568
1547
  // sequence: seqLen × inputDim → nActions Q-values
@@ -1570,7 +1549,7 @@ var NetworkTransformerRL = class {
1570
1549
  const h = this._forward(sequence);
1571
1550
  const pooled = this._pool(h);
1572
1551
  return this.outputProj.W.map(
1573
- (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
1552
+ (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
1574
1553
  );
1575
1554
  }
1576
1555
  // ── Training ────────────────────────────────────────────────────────────────
@@ -1582,7 +1561,7 @@ var NetworkTransformerRL = class {
1582
1561
  const h = this._forward(sequence);
1583
1562
  const pooled = this._pool(h);
1584
1563
  const pred = this.outputProj.W.map(
1585
- (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
1564
+ (row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
1586
1565
  );
1587
1566
  const n = this.nActions;
1588
1567
  let loss = 0;
@@ -1605,8 +1584,7 @@ var NetworkTransformerRL = class {
1605
1584
  );
1606
1585
  const dBout = dPred.slice();
1607
1586
  this.outputProj.update(dWout, lr);
1608
- for (let c = 0; c < this.nActions; c++)
1609
- this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
1587
+ this.outputBias.update(dBout, lr);
1610
1588
  let dH = this._distributePoolGradient(dPooled);
1611
1589
  for (let b = this.blocks.length - 1; b >= 0; b--)
1612
1590
  dH = this.blocks[b].backward(dH, lr);
@@ -1630,24 +1608,26 @@ var NetworkTransformerRL = class {
1630
1608
  // Order: inputProj, block0, block1, ..., blockN, outputProj, outputBias.
1631
1609
  getWeightsFlat() {
1632
1610
  const w = [];
1633
- for (const row of this.inputProj.W) w.push(...row);
1611
+ w.push(...this.inputProj.getWeights());
1634
1612
  for (const block of this.blocks) w.push(...block.getWeights());
1635
- for (const row of this.outputProj.W) w.push(...row);
1636
- w.push(...this.outputBias);
1613
+ w.push(...this.outputProj.getWeights());
1614
+ w.push(...this.outputBias.getWeights());
1637
1615
  return w;
1638
1616
  }
1639
1617
  setWeightsFlat(weights) {
1640
1618
  let idx = 0;
1641
- for (let i = 0; i < this.inputProj.W.length; i++)
1642
- for (let j = 0; j < this.inputProj.W[i].length; j++) this.inputProj.W[i][j] = weights[idx++];
1619
+ const inputProjLen = this.inputProj.getWeights().length;
1620
+ this.inputProj.setWeights(weights.slice(idx, idx + inputProjLen));
1621
+ idx += inputProjLen;
1643
1622
  for (const block of this.blocks) {
1644
1623
  const blockLen = block.getWeights().length;
1645
1624
  block.setWeights(weights.slice(idx, idx + blockLen));
1646
1625
  idx += blockLen;
1647
1626
  }
1648
- for (let i = 0; i < this.outputProj.W.length; i++)
1649
- for (let j = 0; j < this.outputProj.W[i].length; j++) this.outputProj.W[i][j] = weights[idx++];
1650
- for (let i = 0; i < this.outputBias.length; i++) this.outputBias[i] = weights[idx++];
1627
+ const outProjLen = this.outputProj.getWeights().length;
1628
+ this.outputProj.setWeights(weights.slice(idx, idx + outProjLen));
1629
+ idx += outProjLen;
1630
+ this.outputBias.setWeights(weights.slice(idx, idx + this.outputBias.values.length));
1651
1631
  }
1652
1632
  getWeightsStructured() {
1653
1633
  return {
@@ -1665,17 +1645,15 @@ var NetworkTransformerRL = class {
1665
1645
  norm2: { gamma: [...b.norm2.gamma], beta: [...b.norm2.beta] },
1666
1646
  ff1: b.ff1.W.map((r) => [...r]),
1667
1647
  ff2: b.ff2.W.map((r) => [...r]),
1668
- b1: [...b.b1],
1669
- b2: [...b.b2]
1648
+ b1: [...b.b1.values],
1649
+ b2: [...b.b2.values]
1670
1650
  })),
1671
1651
  outputProj: this.outputProj.W.map((r) => [...r]),
1672
- outputBias: [...this.outputBias]
1652
+ outputBias: [...this.outputBias.values]
1673
1653
  };
1674
1654
  }
1675
1655
  setWeightsStructured(data) {
1676
- data.inputProj.forEach((row, i) => {
1677
- this.inputProj.W[i] = [...row];
1678
- });
1656
+ this.inputProj.setWeights(data.inputProj.flat());
1679
1657
  data.blocks.forEach((bd, b) => {
1680
1658
  const blk = this.blocks[b];
1681
1659
  bd.attn.heads.forEach((hd, h) => {
@@ -1690,11 +1668,11 @@ var NetworkTransformerRL = class {
1690
1668
  blk.norm2.beta = [...bd.norm2.beta];
1691
1669
  blk.ff1.W = bd.ff1.map((r) => [...r]);
1692
1670
  blk.ff2.W = bd.ff2.map((r) => [...r]);
1693
- blk.b1 = [...bd.b1];
1694
- blk.b2 = [...bd.b2];
1671
+ blk.b1.setWeights(bd.b1);
1672
+ blk.b2.setWeights(bd.b2);
1695
1673
  });
1696
1674
  this.outputProj.W = data.outputProj.map((r) => [...r]);
1697
- this.outputBias = [...data.outputBias];
1675
+ this.outputBias.setWeights(data.outputBias);
1698
1676
  }
1699
1677
  // ── Serializable interface (flat array) ────────────────────────────────────
1700
1678
  // These satisfy the Serializable interface from ModelSaver, which requires
@@ -1853,7 +1831,7 @@ function tanhFn(x) {
1853
1831
  var Gate2 = class {
1854
1832
  constructor(inputSize, hSize, initBias = 0) {
1855
1833
  const n = inputSize + hSize;
1856
- const limit = Math.sqrt(2 / n);
1834
+ const limit = Math.sqrt(2 / (n + hSize));
1857
1835
  this.W = Array.from(
1858
1836
  { length: hSize },
1859
1837
  () => Array.from({ length: n }, () => (Math.random() * 2 - 1) * limit)
@@ -2624,6 +2602,7 @@ export {
2624
2602
  Adam,
2625
2603
  AttentionHead,
2626
2604
  BatchNorm,
2605
+ BiasVector,
2627
2606
  ClipOptimizer,
2628
2607
  ClippedOptimizerFactory,
2629
2608
  Conv1D,
@@ -2652,6 +2631,7 @@ export {
2652
2631
  crossEntropy,
2653
2632
  crossEntropyDelta,
2654
2633
  crossEntropyDeltaRaw,
2634
+ defaultOptimizer,
2655
2635
  elu,
2656
2636
  leakyRelu,
2657
2637
  linear,
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dniskav/neuron",
3
- "version": "0.2.5",
3
+ "version": "0.2.6",
4
4
  "description": "Minimal neural network from scratch — neuron, layer, network, backpropagation. No dependencies.",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",