@dniskav/neuron 0.2.5 → 0.2.6
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +28 -3
- package/dist/index.d.mts +17 -10
- package/dist/index.d.ts +17 -10
- package/dist/index.js +144 -162
- package/dist/index.mjs +142 -162
- package/package.json +1 -1
package/dist/index.js
CHANGED
|
@@ -23,6 +23,7 @@ __export(index_exports, {
|
|
|
23
23
|
Adam: () => Adam,
|
|
24
24
|
AttentionHead: () => AttentionHead,
|
|
25
25
|
BatchNorm: () => BatchNorm,
|
|
26
|
+
BiasVector: () => BiasVector,
|
|
26
27
|
ClipOptimizer: () => ClipOptimizer,
|
|
27
28
|
ClippedOptimizerFactory: () => ClippedOptimizerFactory,
|
|
28
29
|
Conv1D: () => Conv1D,
|
|
@@ -51,6 +52,7 @@ __export(index_exports, {
|
|
|
51
52
|
crossEntropy: () => crossEntropy,
|
|
52
53
|
crossEntropyDelta: () => crossEntropyDelta,
|
|
53
54
|
crossEntropyDeltaRaw: () => crossEntropyDeltaRaw,
|
|
55
|
+
defaultOptimizer: () => defaultOptimizer,
|
|
54
56
|
elu: () => elu,
|
|
55
57
|
leakyRelu: () => leakyRelu,
|
|
56
58
|
linear: () => linear,
|
|
@@ -201,6 +203,7 @@ function makeElu(alpha = 1) {
|
|
|
201
203
|
var elu = makeElu(1);
|
|
202
204
|
|
|
203
205
|
// src/optimizers.ts
|
|
206
|
+
var defaultOptimizer = () => new SGD();
|
|
204
207
|
var SGD = class {
|
|
205
208
|
step(weight, gradient, lr) {
|
|
206
209
|
return weight + lr * gradient;
|
|
@@ -249,7 +252,6 @@ var Adam = class {
|
|
|
249
252
|
};
|
|
250
253
|
|
|
251
254
|
// src/NeuronN.ts
|
|
252
|
-
var defaultOptimizer = () => new SGD();
|
|
253
255
|
var NeuronN = class {
|
|
254
256
|
constructor(nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
|
|
255
257
|
const limit = Math.sqrt(1 / nInputs);
|
|
@@ -278,9 +280,8 @@ var NeuronN = class {
|
|
|
278
280
|
};
|
|
279
281
|
|
|
280
282
|
// src/Layer.ts
|
|
281
|
-
var defaultOptimizer2 = () => new SGD();
|
|
282
283
|
var Layer = class {
|
|
283
|
-
constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory =
|
|
284
|
+
constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
|
|
284
285
|
this.neurons = Array.from(
|
|
285
286
|
{ length: nNeurons },
|
|
286
287
|
() => new NeuronN(nInputs, activation, optimizerFactory)
|
|
@@ -300,7 +301,7 @@ var Network = class {
|
|
|
300
301
|
predict(inputs) {
|
|
301
302
|
validateArray(inputs, this.hiddenLayer.neurons[0].weights.length, "Network.predict");
|
|
302
303
|
const hiddenOut = this.hiddenLayer.predict(inputs);
|
|
303
|
-
return this.outputLayer.predict(hiddenOut)
|
|
304
|
+
return this.outputLayer.predict(hiddenOut);
|
|
304
305
|
}
|
|
305
306
|
// Trains on a single example. Returns the squared error.
|
|
306
307
|
train(inputs, target, lr) {
|
|
@@ -309,22 +310,17 @@ var Network = class {
|
|
|
309
310
|
validateNumber(lr, "Network.train");
|
|
310
311
|
const hiddenOut = this.hiddenLayer.predict(inputs);
|
|
311
312
|
const prediction = this.outputLayer.predict(hiddenOut)[0];
|
|
312
|
-
const outputError = target - prediction;
|
|
313
|
-
const outputDelta = outputError * prediction * (1 - prediction);
|
|
314
313
|
const outputNeuron = this.outputLayer.neurons[0];
|
|
314
|
+
const outputError = target - prediction;
|
|
315
|
+
const outputDelta = outputError * outputNeuron.activation.dfn(prediction);
|
|
315
316
|
const hiddenDeltas = this.hiddenLayer.neurons.map((neuron, i) => {
|
|
316
|
-
const hiddenOut_i = hiddenOut[i];
|
|
317
317
|
const hiddenError = outputDelta * outputNeuron.weights[i];
|
|
318
|
-
return hiddenError *
|
|
318
|
+
return hiddenError * neuron.activation.dfn(hiddenOut[i]);
|
|
319
319
|
});
|
|
320
320
|
this.hiddenLayer.neurons.forEach((neuron, i) => {
|
|
321
|
-
neuron.
|
|
322
|
-
neuron.bias += lr * hiddenDeltas[i];
|
|
321
|
+
neuron._update(inputs.map((inp) => hiddenDeltas[i] * inp), hiddenDeltas[i], lr);
|
|
323
322
|
});
|
|
324
|
-
outputNeuron.
|
|
325
|
-
(w, i) => w + lr * outputDelta * hiddenOut[i]
|
|
326
|
-
);
|
|
327
|
-
outputNeuron.bias += lr * outputDelta;
|
|
323
|
+
outputNeuron._update(hiddenOut.map((h) => outputDelta * h), outputDelta, lr);
|
|
328
324
|
return outputError * outputError;
|
|
329
325
|
}
|
|
330
326
|
// ── Flat weight serialization ─────────────────────────────────────────────
|
|
@@ -394,13 +390,12 @@ var Dropout = class {
|
|
|
394
390
|
};
|
|
395
391
|
|
|
396
392
|
// src/NetworkN.ts
|
|
397
|
-
var defaultOptimizer3 = () => new SGD();
|
|
398
393
|
var NetworkN = class {
|
|
399
394
|
constructor(structure, options = {}) {
|
|
400
395
|
this.structure = structure;
|
|
401
396
|
const nLayers = structure.length - 1;
|
|
402
397
|
const activations = options.activations ?? Array.from({ length: nLayers }, () => sigmoid2);
|
|
403
|
-
const optimizer = options.optimizer ??
|
|
398
|
+
const optimizer = options.optimizer ?? defaultOptimizer;
|
|
404
399
|
const dropoutRate = options.dropoutRate ?? 0;
|
|
405
400
|
if (activations.length !== nLayers) {
|
|
406
401
|
throw new Error(`Expected ${nLayers} activations, got ${activations.length}`);
|
|
@@ -453,73 +448,69 @@ var NetworkN = class {
|
|
|
453
448
|
train(inputs, targets, lr) {
|
|
454
449
|
validateArray(inputs, this.structure[0], "NetworkN.train");
|
|
455
450
|
validateArray(targets, this.structure[this.structure.length - 1], "NetworkN.train");
|
|
456
|
-
const act =
|
|
457
|
-
for (let i = 0; i < this.layers.length; i++) {
|
|
458
|
-
const layerInput = act[act.length - 1];
|
|
459
|
-
const layerOutput = this.layers[i].predict(layerInput);
|
|
460
|
-
let current;
|
|
461
|
-
if (this._shouldResidual(i)) {
|
|
462
|
-
if (this.structure[i] === this.structure[i + 1]) {
|
|
463
|
-
current = layerOutput.map((v, j) => v + layerInput[j]);
|
|
464
|
-
} else {
|
|
465
|
-
current = [...layerOutput];
|
|
466
|
-
}
|
|
467
|
-
} else {
|
|
468
|
-
current = [...layerOutput];
|
|
469
|
-
}
|
|
470
|
-
if (i < this._dropouts.length) {
|
|
471
|
-
current = this._dropouts[i].forward(current, true);
|
|
472
|
-
}
|
|
473
|
-
act.push(current);
|
|
474
|
-
}
|
|
451
|
+
const act = this._forwardAll(inputs, true);
|
|
475
452
|
const pred = act[act.length - 1];
|
|
476
453
|
const outAct = this.layers[this.layers.length - 1].neurons[0].activation;
|
|
477
|
-
|
|
478
|
-
|
|
479
|
-
const layer = this.layers[l];
|
|
480
|
-
if (l < this._dropouts.length) {
|
|
481
|
-
deltas = this._dropouts[l].backward(deltas);
|
|
482
|
-
}
|
|
483
|
-
const layerIn = act[l];
|
|
484
|
-
const prevAct = l > 0 ? this.layers[l - 1].neurons[0].activation : null;
|
|
485
|
-
const prevDeltas = layerIn.map((out, j) => {
|
|
486
|
-
const errProp = layer.neurons.reduce((s, n, k) => s + deltas[k] * n.weights[j], 0);
|
|
487
|
-
return prevAct ? errProp * prevAct.dfn(out) : errProp;
|
|
488
|
-
});
|
|
489
|
-
if (this._shouldResidual(l) && this.structure[l] === this.structure[l + 1]) {
|
|
490
|
-
for (let j = 0; j < prevDeltas.length; j++) {
|
|
491
|
-
prevDeltas[j] += deltas[j];
|
|
492
|
-
}
|
|
493
|
-
}
|
|
494
|
-
layer.neurons.forEach((n, k) => {
|
|
495
|
-
n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
|
|
496
|
-
});
|
|
497
|
-
deltas = prevDeltas;
|
|
498
|
-
}
|
|
454
|
+
const deltas = pred.map((p, i) => (targets[i] - p) * outAct.dfn(p));
|
|
455
|
+
this._backpropLayers(act, deltas, lr);
|
|
499
456
|
return pred.reduce((s, p, i) => s + (targets[i] - p) ** 2, 0) / pred.length;
|
|
500
457
|
}
|
|
501
458
|
// Backprop with externally provided output-layer deltas.
|
|
502
459
|
// Useful for custom loss functions (e.g. physics-based gradients).
|
|
503
460
|
trainWithDeltas(inputs, outputDeltas, lr) {
|
|
461
|
+
const act = this._forwardAll(inputs, true);
|
|
462
|
+
this._backpropLayers(act, outputDeltas, lr);
|
|
463
|
+
}
|
|
464
|
+
// ── Flat weight serialization ─────────────────────────────────────────────
|
|
465
|
+
// Order: layer 0 (all neurons), layer 1, ..., layer N.
|
|
466
|
+
getWeights() {
|
|
467
|
+
for (const d of this._dropouts) d.resetMask();
|
|
468
|
+
const w = [];
|
|
469
|
+
for (const layer of this.layers) {
|
|
470
|
+
for (const n of layer.neurons) {
|
|
471
|
+
w.push(...n.weights, n.bias);
|
|
472
|
+
}
|
|
473
|
+
}
|
|
474
|
+
return w;
|
|
475
|
+
}
|
|
476
|
+
setWeights(weights) {
|
|
477
|
+
for (const d of this._dropouts) d.resetMask();
|
|
478
|
+
let idx = 0;
|
|
479
|
+
for (const layer of this.layers) {
|
|
480
|
+
for (const n of layer.neurons) {
|
|
481
|
+
for (let j = 0; j < n.weights.length; j++) n.weights[j] = weights[idx++];
|
|
482
|
+
n.bias = weights[idx++];
|
|
483
|
+
}
|
|
484
|
+
}
|
|
485
|
+
}
|
|
486
|
+
// ── Private helpers ──────────────────────────────────────────────────────
|
|
487
|
+
_shouldResidual(layerIndex) {
|
|
488
|
+
if (typeof this._residual === "function") return this._residual(layerIndex);
|
|
489
|
+
return this._residual;
|
|
490
|
+
}
|
|
491
|
+
// Forward pass storing activations at every layer boundary.
|
|
492
|
+
// Used by train(), trainWithDeltas(), and predict() shares the same logic.
|
|
493
|
+
_forwardAll(inputs, training) {
|
|
504
494
|
const act = [inputs];
|
|
505
495
|
for (let i = 0; i < this.layers.length; i++) {
|
|
506
496
|
const layerInput = act[act.length - 1];
|
|
507
497
|
const layerOutput = this.layers[i].predict(layerInput);
|
|
508
498
|
let current;
|
|
509
|
-
if (this._shouldResidual(i)) {
|
|
510
|
-
|
|
511
|
-
current = layerOutput.map((v, j) => v + layerInput[j]);
|
|
512
|
-
} else {
|
|
513
|
-
current = [...layerOutput];
|
|
514
|
-
}
|
|
499
|
+
if (this._shouldResidual(i) && this.structure[i] === this.structure[i + 1]) {
|
|
500
|
+
current = layerOutput.map((v, j) => v + layerInput[j]);
|
|
515
501
|
} else {
|
|
516
|
-
current =
|
|
502
|
+
current = layerOutput;
|
|
517
503
|
}
|
|
518
504
|
if (i < this._dropouts.length) {
|
|
519
|
-
current = this._dropouts[i].forward(current,
|
|
505
|
+
current = this._dropouts[i].forward(current, training);
|
|
520
506
|
}
|
|
521
507
|
act.push(current);
|
|
522
508
|
}
|
|
509
|
+
return act;
|
|
510
|
+
}
|
|
511
|
+
// Backward pass: updates all layer weights given the pre-computed activations
|
|
512
|
+
// and the initial output-layer deltas.
|
|
513
|
+
_backpropLayers(act, outputDeltas, lr) {
|
|
523
514
|
let deltas = outputDeltas;
|
|
524
515
|
for (let l = this.layers.length - 1; l >= 0; l--) {
|
|
525
516
|
const layer = this.layers[l];
|
|
@@ -533,9 +524,7 @@ var NetworkN = class {
|
|
|
533
524
|
return prevAct ? errProp * prevAct.dfn(out) : errProp;
|
|
534
525
|
});
|
|
535
526
|
if (this._shouldResidual(l) && this.structure[l] === this.structure[l + 1]) {
|
|
536
|
-
for (let j = 0; j < prevDeltas.length; j++)
|
|
537
|
-
prevDeltas[j] += deltas[j];
|
|
538
|
-
}
|
|
527
|
+
for (let j = 0; j < prevDeltas.length; j++) prevDeltas[j] += deltas[j];
|
|
539
528
|
}
|
|
540
529
|
layer.neurons.forEach((n, k) => {
|
|
541
530
|
n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
|
|
@@ -543,33 +532,6 @@ var NetworkN = class {
|
|
|
543
532
|
deltas = prevDeltas;
|
|
544
533
|
}
|
|
545
534
|
}
|
|
546
|
-
// ── Flat weight serialization ─────────────────────────────────────────────
|
|
547
|
-
// Order: layer 0 (all neurons), layer 1, ..., layer N.
|
|
548
|
-
getWeights() {
|
|
549
|
-
for (const d of this._dropouts) d.resetMask();
|
|
550
|
-
const w = [];
|
|
551
|
-
for (const layer of this.layers) {
|
|
552
|
-
for (const n of layer.neurons) {
|
|
553
|
-
w.push(...n.weights, n.bias);
|
|
554
|
-
}
|
|
555
|
-
}
|
|
556
|
-
return w;
|
|
557
|
-
}
|
|
558
|
-
setWeights(weights) {
|
|
559
|
-
for (const d of this._dropouts) d.resetMask();
|
|
560
|
-
let idx = 0;
|
|
561
|
-
for (const layer of this.layers) {
|
|
562
|
-
for (const n of layer.neurons) {
|
|
563
|
-
for (let j = 0; j < n.weights.length; j++) n.weights[j] = weights[idx++];
|
|
564
|
-
n.bias = weights[idx++];
|
|
565
|
-
}
|
|
566
|
-
}
|
|
567
|
-
}
|
|
568
|
-
// ── Helper ───────────────────────────────────────────────────────────────
|
|
569
|
-
_shouldResidual(layerIndex) {
|
|
570
|
-
if (typeof this._residual === "function") return this._residual(layerIndex);
|
|
571
|
-
return this._residual;
|
|
572
|
-
}
|
|
573
535
|
};
|
|
574
536
|
|
|
575
537
|
// src/LSTMLayer.ts
|
|
@@ -584,7 +546,7 @@ var Gate = class {
|
|
|
584
546
|
// shape: [hSize]
|
|
585
547
|
constructor(inputSize, hSize, initBias = 0) {
|
|
586
548
|
const n = inputSize + hSize;
|
|
587
|
-
const limit = Math.sqrt(2 / n);
|
|
549
|
+
const limit = Math.sqrt(2 / (n + hSize));
|
|
588
550
|
this.W = Array.from(
|
|
589
551
|
{ length: hSize },
|
|
590
552
|
() => Array.from({ length: n }, () => (Math.random() * 2 - 1) * limit)
|
|
@@ -783,7 +745,6 @@ var LSTMLayer = class {
|
|
|
783
745
|
};
|
|
784
746
|
|
|
785
747
|
// src/NetworkLSTM.ts
|
|
786
|
-
var defaultOptimizer4 = () => new SGD();
|
|
787
748
|
var NetworkLSTM = class {
|
|
788
749
|
// [T][layer+1][neuron]
|
|
789
750
|
constructor(inputSize, hiddenSize, denseStructure, options = {}) {
|
|
@@ -791,7 +752,7 @@ var NetworkLSTM = class {
|
|
|
791
752
|
this.hiddenSize = hiddenSize;
|
|
792
753
|
this.lstm = new LSTMLayer(inputSize, hiddenSize);
|
|
793
754
|
const activation = options.denseActivation ?? sigmoid2;
|
|
794
|
-
const optimizer = options.optimizer ??
|
|
755
|
+
const optimizer = options.optimizer ?? defaultOptimizer;
|
|
795
756
|
this.denseLayers = [];
|
|
796
757
|
const sizes = [hiddenSize, ...denseStructure];
|
|
797
758
|
for (let i = 1; i < sizes.length; i++) {
|
|
@@ -978,6 +939,22 @@ var WeightMatrix = class {
|
|
|
978
939
|
for (let j = 0; j < this.W[i].length; j++) this.W[i][j] = weights[idx++];
|
|
979
940
|
}
|
|
980
941
|
};
|
|
942
|
+
var BiasVector = class {
|
|
943
|
+
constructor(size) {
|
|
944
|
+
this.values = new Array(size).fill(0);
|
|
945
|
+
this.opts = Array.from({ length: size }, () => new Adam());
|
|
946
|
+
}
|
|
947
|
+
update(grad, lr) {
|
|
948
|
+
for (let i = 0; i < this.values.length; i++)
|
|
949
|
+
this.values[i] = this.opts[i].step(this.values[i], grad[i], lr);
|
|
950
|
+
}
|
|
951
|
+
getWeights() {
|
|
952
|
+
return [...this.values];
|
|
953
|
+
}
|
|
954
|
+
setWeights(weights) {
|
|
955
|
+
for (let i = 0; i < this.values.length; i++) this.values[i] = weights[i];
|
|
956
|
+
}
|
|
957
|
+
};
|
|
981
958
|
var EmbeddingMatrix = class {
|
|
982
959
|
constructor(vocabSize, d_model) {
|
|
983
960
|
const limit = Math.sqrt(1 / d_model);
|
|
@@ -1063,6 +1040,7 @@ var AttentionHead = class {
|
|
|
1063
1040
|
// 5. dWq = dQ^T @ X, dWk = dK^T @ X, dWv = dV^T @ X
|
|
1064
1041
|
// 6. dX = dQ @ Wq + dK @ Wk + dV @ Wv
|
|
1065
1042
|
backward(dOut, lr) {
|
|
1043
|
+
if (!this.cache) throw new Error("AttentionHead.backward() called before predict()");
|
|
1066
1044
|
const { X, Q, K, V, attn } = this.cache;
|
|
1067
1045
|
const seqLen = X.length;
|
|
1068
1046
|
const d_model = X[0].length;
|
|
@@ -1190,6 +1168,7 @@ var MultiHeadAttention = class {
|
|
|
1190
1168
|
// ── Backward ──────────────────────────────────────────────────────────────
|
|
1191
1169
|
// dOut: seqLen × d_model → dX: seqLen × d_model
|
|
1192
1170
|
backward(dOut, lr) {
|
|
1171
|
+
if (!this._concat) throw new Error("MultiHeadAttention.backward() called before predict()");
|
|
1193
1172
|
const seqLen = dOut.length;
|
|
1194
1173
|
const concatD = this.nHeads * this.d_k;
|
|
1195
1174
|
const d_model = this.d_model;
|
|
@@ -1294,11 +1273,12 @@ var LayerNorm = class {
|
|
|
1294
1273
|
backwardOne(dOut, pos, lr) {
|
|
1295
1274
|
const { x_norm, std } = this._cache[pos];
|
|
1296
1275
|
const N = dOut.length;
|
|
1276
|
+
const gammaOld = this.gamma.slice();
|
|
1297
1277
|
for (let i = 0; i < N; i++) {
|
|
1298
1278
|
this.gamma[i] += lr * dOut[i] * x_norm[i];
|
|
1299
1279
|
this.beta[i] += lr * dOut[i];
|
|
1300
1280
|
}
|
|
1301
|
-
const D = dOut.map((d, i) => d *
|
|
1281
|
+
const D = dOut.map((d, i) => d * gammaOld[i]);
|
|
1302
1282
|
const mD = D.reduce((s, v) => s + v, 0) / N;
|
|
1303
1283
|
const mDxn = D.reduce((s, d, i) => s + d * x_norm[i], 0) / N;
|
|
1304
1284
|
return D.map((d, i) => (d - mD - x_norm[i] * mDxn) / std);
|
|
@@ -1318,6 +1298,7 @@ var LayerNorm = class {
|
|
|
1318
1298
|
// src/TransformerBlock.ts
|
|
1319
1299
|
var TransformerBlock = class {
|
|
1320
1300
|
constructor({ d_model, nHeads, d_ff, causal = false }) {
|
|
1301
|
+
// d_model
|
|
1321
1302
|
// Forward caches (needed for backprop)
|
|
1322
1303
|
this._X = null;
|
|
1323
1304
|
this._attnOut = null;
|
|
@@ -1334,10 +1315,8 @@ var TransformerBlock = class {
|
|
|
1334
1315
|
this.norm2 = new LayerNorm(d_model);
|
|
1335
1316
|
this.ff1 = new WeightMatrix(d_ff, d_model);
|
|
1336
1317
|
this.ff2 = new WeightMatrix(d_model, d_ff);
|
|
1337
|
-
this.b1 = new
|
|
1338
|
-
this.b2 = new
|
|
1339
|
-
this.b1Opts = Array.from({ length: d_ff }, () => new Adam());
|
|
1340
|
-
this.b2Opts = Array.from({ length: d_model }, () => new Adam());
|
|
1318
|
+
this.b1 = new BiasVector(d_ff);
|
|
1319
|
+
this.b2 = new BiasVector(d_model);
|
|
1341
1320
|
}
|
|
1342
1321
|
// ── Forward ───────────────────────────────────────────────────────────────
|
|
1343
1322
|
// X: seqLen × d_model → out: seqLen × d_model
|
|
@@ -1350,11 +1329,11 @@ var TransformerBlock = class {
|
|
|
1350
1329
|
return this.norm1.predictOne(added, i);
|
|
1351
1330
|
});
|
|
1352
1331
|
const ff1Pre = h1.map(
|
|
1353
|
-
(h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1[k]))
|
|
1332
|
+
(h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1.values[k]))
|
|
1354
1333
|
);
|
|
1355
1334
|
const ff1Out = ff1Pre.map((pre) => pre.map((v) => Math.max(0, v)));
|
|
1356
1335
|
const ff2Out = ff1Out.map(
|
|
1357
|
-
(h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2[k]))
|
|
1336
|
+
(h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2.values[k]))
|
|
1358
1337
|
);
|
|
1359
1338
|
this.norm2.resetCache(seqLen);
|
|
1360
1339
|
const out = h1.map((h, i) => {
|
|
@@ -1372,6 +1351,9 @@ var TransformerBlock = class {
|
|
|
1372
1351
|
// ── Backward ──────────────────────────────────────────────────────────────
|
|
1373
1352
|
// dOut: seqLen × d_model → dX: seqLen × d_model
|
|
1374
1353
|
backward(dOut, lr) {
|
|
1354
|
+
if (!this._h1 || !this._ff1Out || !this._ff1Pre) {
|
|
1355
|
+
throw new Error("TransformerBlock.backward() called before predict()");
|
|
1356
|
+
}
|
|
1375
1357
|
const seqLen = dOut.length;
|
|
1376
1358
|
const d_model = this.d_model;
|
|
1377
1359
|
const h1 = this._h1;
|
|
@@ -1396,8 +1378,7 @@ var TransformerBlock = class {
|
|
|
1396
1378
|
(_, m) => dAdded2.reduce((s, da) => s + da[m], 0)
|
|
1397
1379
|
);
|
|
1398
1380
|
this.ff2.update(dW2, lr);
|
|
1399
|
-
|
|
1400
|
-
this.b2[m] = this.b2Opts[m].step(this.b2[m], db2[m], lr);
|
|
1381
|
+
this.b2.update(db2, lr);
|
|
1401
1382
|
const dFf1Pre = dFf1Out.map(
|
|
1402
1383
|
(d, i) => d.map((v, k) => ff1Pre[i][k] > 0 ? v : 0)
|
|
1403
1384
|
);
|
|
@@ -1419,8 +1400,7 @@ var TransformerBlock = class {
|
|
|
1419
1400
|
(_, k) => dFf1Pre.reduce((s, dp) => s + dp[k], 0)
|
|
1420
1401
|
);
|
|
1421
1402
|
this.ff1.update(dW1, lr);
|
|
1422
|
-
|
|
1423
|
-
this.b1[k] = this.b1Opts[k].step(this.b1[k], db1[k], lr);
|
|
1403
|
+
this.b1.update(db1, lr);
|
|
1424
1404
|
const dH1 = Array.from(
|
|
1425
1405
|
{ length: seqLen },
|
|
1426
1406
|
(_, i) => dH1_fromFf[i].map((v, m) => v + dAdded2[i][m])
|
|
@@ -1449,9 +1429,9 @@ var TransformerBlock = class {
|
|
|
1449
1429
|
w.push(...this.attn.getWeights());
|
|
1450
1430
|
w.push(...this.norm1.gamma, ...this.norm1.beta);
|
|
1451
1431
|
for (const row of this.ff1.W) w.push(...row);
|
|
1452
|
-
w.push(...this.b1);
|
|
1432
|
+
w.push(...this.b1.values);
|
|
1453
1433
|
for (const row of this.ff2.W) w.push(...row);
|
|
1454
|
-
w.push(...this.b2);
|
|
1434
|
+
w.push(...this.b2.values);
|
|
1455
1435
|
w.push(...this.norm2.gamma, ...this.norm2.beta);
|
|
1456
1436
|
return w;
|
|
1457
1437
|
}
|
|
@@ -1460,16 +1440,17 @@ var TransformerBlock = class {
|
|
|
1460
1440
|
const attnLen = this.attn.getWeights().length;
|
|
1461
1441
|
this.attn.setWeights(weights.slice(idx, idx + attnLen));
|
|
1462
1442
|
idx += attnLen;
|
|
1463
|
-
|
|
1464
|
-
|
|
1465
|
-
|
|
1466
|
-
|
|
1467
|
-
|
|
1468
|
-
|
|
1469
|
-
|
|
1470
|
-
|
|
1471
|
-
|
|
1472
|
-
|
|
1443
|
+
this.norm1.setWeights(weights.slice(idx, idx + this.norm1.getWeights().length));
|
|
1444
|
+
idx += this.norm1.getWeights().length;
|
|
1445
|
+
this.ff1.setWeights(weights.slice(idx, idx + this.ff1.getWeights().length));
|
|
1446
|
+
idx += this.ff1.getWeights().length;
|
|
1447
|
+
this.b1.setWeights(weights.slice(idx, idx + this.b1.values.length));
|
|
1448
|
+
idx += this.b1.values.length;
|
|
1449
|
+
this.ff2.setWeights(weights.slice(idx, idx + this.ff2.getWeights().length));
|
|
1450
|
+
idx += this.ff2.getWeights().length;
|
|
1451
|
+
this.b2.setWeights(weights.slice(idx, idx + this.b2.values.length));
|
|
1452
|
+
idx += this.b2.values.length;
|
|
1453
|
+
this.norm2.setWeights(weights.slice(idx, idx + this.norm2.getWeights().length));
|
|
1473
1454
|
}
|
|
1474
1455
|
};
|
|
1475
1456
|
|
|
@@ -1495,8 +1476,7 @@ var NetworkTransformer = class {
|
|
|
1495
1476
|
() => new TransformerBlock({ d_model, nHeads, d_ff })
|
|
1496
1477
|
);
|
|
1497
1478
|
this.outputProj = new WeightMatrix(nClasses, d_model);
|
|
1498
|
-
this.outputBias = new
|
|
1499
|
-
this.outBiasOpts = Array.from({ length: nClasses }, () => new Adam());
|
|
1479
|
+
this.outputBias = new BiasVector(nClasses);
|
|
1500
1480
|
}
|
|
1501
1481
|
// ── Forward pass ──────────────────────────────────────────────────────────
|
|
1502
1482
|
// tokens: seqLen integer ids → seqLen * nClasses logits (flattened)
|
|
@@ -1504,7 +1484,7 @@ var NetworkTransformer = class {
|
|
|
1504
1484
|
const h = this._forward(tokens);
|
|
1505
1485
|
return h.flatMap(
|
|
1506
1486
|
(hi) => this.outputProj.W.map(
|
|
1507
|
-
(row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
|
|
1487
|
+
(row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
|
|
1508
1488
|
)
|
|
1509
1489
|
);
|
|
1510
1490
|
}
|
|
@@ -1518,7 +1498,7 @@ var NetworkTransformer = class {
|
|
|
1518
1498
|
const h = this._forward(tokens);
|
|
1519
1499
|
const logits = h.map(
|
|
1520
1500
|
(hi) => this.outputProj.W.map(
|
|
1521
|
-
(row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
|
|
1501
|
+
(row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
|
|
1522
1502
|
)
|
|
1523
1503
|
);
|
|
1524
1504
|
let loss = 0;
|
|
@@ -1553,8 +1533,7 @@ var NetworkTransformer = class {
|
|
|
1553
1533
|
(_, c) => dLogits.reduce((s, dl) => s + dl[c], 0)
|
|
1554
1534
|
);
|
|
1555
1535
|
this.outputProj.update(dWout, lr);
|
|
1556
|
-
|
|
1557
|
-
this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
|
|
1536
|
+
this.outputBias.update(dBout, lr);
|
|
1558
1537
|
let dX = dH;
|
|
1559
1538
|
for (let b = this.blocks.length - 1; b >= 0; b--)
|
|
1560
1539
|
dX = this.blocks[b].backward(dX, lr);
|
|
@@ -1573,27 +1552,30 @@ var NetworkTransformer = class {
|
|
|
1573
1552
|
// Order: tokenEmb, posEmb, block0, block1, ..., blockN, outputProj, outputBias.
|
|
1574
1553
|
getWeights() {
|
|
1575
1554
|
const w = [];
|
|
1576
|
-
|
|
1577
|
-
|
|
1555
|
+
w.push(...this.tokenEmb.getWeights());
|
|
1556
|
+
w.push(...this.posEmb.getWeights());
|
|
1578
1557
|
for (const block of this.blocks) w.push(...block.getWeights());
|
|
1579
|
-
|
|
1580
|
-
w.push(...this.outputBias);
|
|
1558
|
+
w.push(...this.outputProj.getWeights());
|
|
1559
|
+
w.push(...this.outputBias.getWeights());
|
|
1581
1560
|
return w;
|
|
1582
1561
|
}
|
|
1583
1562
|
setWeights(weights) {
|
|
1584
1563
|
let idx = 0;
|
|
1585
|
-
|
|
1586
|
-
|
|
1587
|
-
|
|
1588
|
-
|
|
1564
|
+
const tokenEmbLen = this.tokenEmb.getWeights().length;
|
|
1565
|
+
this.tokenEmb.setWeights(weights.slice(idx, idx + tokenEmbLen));
|
|
1566
|
+
idx += tokenEmbLen;
|
|
1567
|
+
const posEmbLen = this.posEmb.getWeights().length;
|
|
1568
|
+
this.posEmb.setWeights(weights.slice(idx, idx + posEmbLen));
|
|
1569
|
+
idx += posEmbLen;
|
|
1589
1570
|
for (const block of this.blocks) {
|
|
1590
1571
|
const blockLen = block.getWeights().length;
|
|
1591
1572
|
block.setWeights(weights.slice(idx, idx + blockLen));
|
|
1592
1573
|
idx += blockLen;
|
|
1593
1574
|
}
|
|
1594
|
-
|
|
1595
|
-
|
|
1596
|
-
|
|
1575
|
+
const outProjLen = this.outputProj.getWeights().length;
|
|
1576
|
+
this.outputProj.setWeights(weights.slice(idx, idx + outProjLen));
|
|
1577
|
+
idx += outProjLen;
|
|
1578
|
+
this.outputBias.setWeights(weights.slice(idx, idx + this.outputBias.values.length));
|
|
1597
1579
|
}
|
|
1598
1580
|
// ── Internal ──────────────────────────────────────────────────────────────
|
|
1599
1581
|
// Shared embedding + block forward pass.
|
|
@@ -1635,8 +1617,7 @@ var NetworkTransformerRL = class {
|
|
|
1635
1617
|
() => new TransformerBlock({ d_model, nHeads, d_ff, causal: true })
|
|
1636
1618
|
);
|
|
1637
1619
|
this.outputProj = new WeightMatrix(nActions, d_model);
|
|
1638
|
-
this.outputBias = new
|
|
1639
|
-
this.outBiasOpts = Array.from({ length: nActions }, () => new Adam());
|
|
1620
|
+
this.outputBias = new BiasVector(nActions);
|
|
1640
1621
|
}
|
|
1641
1622
|
// ── Forward ────────────────────────────────────────────────────────────────
|
|
1642
1623
|
// sequence: seqLen × inputDim → nActions Q-values
|
|
@@ -1644,7 +1625,7 @@ var NetworkTransformerRL = class {
|
|
|
1644
1625
|
const h = this._forward(sequence);
|
|
1645
1626
|
const pooled = this._pool(h);
|
|
1646
1627
|
return this.outputProj.W.map(
|
|
1647
|
-
(row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
|
|
1628
|
+
(row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
|
|
1648
1629
|
);
|
|
1649
1630
|
}
|
|
1650
1631
|
// ── Training ────────────────────────────────────────────────────────────────
|
|
@@ -1656,7 +1637,7 @@ var NetworkTransformerRL = class {
|
|
|
1656
1637
|
const h = this._forward(sequence);
|
|
1657
1638
|
const pooled = this._pool(h);
|
|
1658
1639
|
const pred = this.outputProj.W.map(
|
|
1659
|
-
(row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
|
|
1640
|
+
(row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
|
|
1660
1641
|
);
|
|
1661
1642
|
const n = this.nActions;
|
|
1662
1643
|
let loss = 0;
|
|
@@ -1679,8 +1660,7 @@ var NetworkTransformerRL = class {
|
|
|
1679
1660
|
);
|
|
1680
1661
|
const dBout = dPred.slice();
|
|
1681
1662
|
this.outputProj.update(dWout, lr);
|
|
1682
|
-
|
|
1683
|
-
this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
|
|
1663
|
+
this.outputBias.update(dBout, lr);
|
|
1684
1664
|
let dH = this._distributePoolGradient(dPooled);
|
|
1685
1665
|
for (let b = this.blocks.length - 1; b >= 0; b--)
|
|
1686
1666
|
dH = this.blocks[b].backward(dH, lr);
|
|
@@ -1704,24 +1684,26 @@ var NetworkTransformerRL = class {
|
|
|
1704
1684
|
// Order: inputProj, block0, block1, ..., blockN, outputProj, outputBias.
|
|
1705
1685
|
getWeightsFlat() {
|
|
1706
1686
|
const w = [];
|
|
1707
|
-
|
|
1687
|
+
w.push(...this.inputProj.getWeights());
|
|
1708
1688
|
for (const block of this.blocks) w.push(...block.getWeights());
|
|
1709
|
-
|
|
1710
|
-
w.push(...this.outputBias);
|
|
1689
|
+
w.push(...this.outputProj.getWeights());
|
|
1690
|
+
w.push(...this.outputBias.getWeights());
|
|
1711
1691
|
return w;
|
|
1712
1692
|
}
|
|
1713
1693
|
setWeightsFlat(weights) {
|
|
1714
1694
|
let idx = 0;
|
|
1715
|
-
|
|
1716
|
-
|
|
1695
|
+
const inputProjLen = this.inputProj.getWeights().length;
|
|
1696
|
+
this.inputProj.setWeights(weights.slice(idx, idx + inputProjLen));
|
|
1697
|
+
idx += inputProjLen;
|
|
1717
1698
|
for (const block of this.blocks) {
|
|
1718
1699
|
const blockLen = block.getWeights().length;
|
|
1719
1700
|
block.setWeights(weights.slice(idx, idx + blockLen));
|
|
1720
1701
|
idx += blockLen;
|
|
1721
1702
|
}
|
|
1722
|
-
|
|
1723
|
-
|
|
1724
|
-
|
|
1703
|
+
const outProjLen = this.outputProj.getWeights().length;
|
|
1704
|
+
this.outputProj.setWeights(weights.slice(idx, idx + outProjLen));
|
|
1705
|
+
idx += outProjLen;
|
|
1706
|
+
this.outputBias.setWeights(weights.slice(idx, idx + this.outputBias.values.length));
|
|
1725
1707
|
}
|
|
1726
1708
|
getWeightsStructured() {
|
|
1727
1709
|
return {
|
|
@@ -1739,17 +1721,15 @@ var NetworkTransformerRL = class {
|
|
|
1739
1721
|
norm2: { gamma: [...b.norm2.gamma], beta: [...b.norm2.beta] },
|
|
1740
1722
|
ff1: b.ff1.W.map((r) => [...r]),
|
|
1741
1723
|
ff2: b.ff2.W.map((r) => [...r]),
|
|
1742
|
-
b1: [...b.b1],
|
|
1743
|
-
b2: [...b.b2]
|
|
1724
|
+
b1: [...b.b1.values],
|
|
1725
|
+
b2: [...b.b2.values]
|
|
1744
1726
|
})),
|
|
1745
1727
|
outputProj: this.outputProj.W.map((r) => [...r]),
|
|
1746
|
-
outputBias: [...this.outputBias]
|
|
1728
|
+
outputBias: [...this.outputBias.values]
|
|
1747
1729
|
};
|
|
1748
1730
|
}
|
|
1749
1731
|
setWeightsStructured(data) {
|
|
1750
|
-
data.inputProj.
|
|
1751
|
-
this.inputProj.W[i] = [...row];
|
|
1752
|
-
});
|
|
1732
|
+
this.inputProj.setWeights(data.inputProj.flat());
|
|
1753
1733
|
data.blocks.forEach((bd, b) => {
|
|
1754
1734
|
const blk = this.blocks[b];
|
|
1755
1735
|
bd.attn.heads.forEach((hd, h) => {
|
|
@@ -1764,11 +1744,11 @@ var NetworkTransformerRL = class {
|
|
|
1764
1744
|
blk.norm2.beta = [...bd.norm2.beta];
|
|
1765
1745
|
blk.ff1.W = bd.ff1.map((r) => [...r]);
|
|
1766
1746
|
blk.ff2.W = bd.ff2.map((r) => [...r]);
|
|
1767
|
-
blk.b1
|
|
1768
|
-
blk.b2
|
|
1747
|
+
blk.b1.setWeights(bd.b1);
|
|
1748
|
+
blk.b2.setWeights(bd.b2);
|
|
1769
1749
|
});
|
|
1770
1750
|
this.outputProj.W = data.outputProj.map((r) => [...r]);
|
|
1771
|
-
this.outputBias
|
|
1751
|
+
this.outputBias.setWeights(data.outputBias);
|
|
1772
1752
|
}
|
|
1773
1753
|
// ── Serializable interface (flat array) ────────────────────────────────────
|
|
1774
1754
|
// These satisfy the Serializable interface from ModelSaver, which requires
|
|
@@ -1927,7 +1907,7 @@ function tanhFn(x) {
|
|
|
1927
1907
|
var Gate2 = class {
|
|
1928
1908
|
constructor(inputSize, hSize, initBias = 0) {
|
|
1929
1909
|
const n = inputSize + hSize;
|
|
1930
|
-
const limit = Math.sqrt(2 / n);
|
|
1910
|
+
const limit = Math.sqrt(2 / (n + hSize));
|
|
1931
1911
|
this.W = Array.from(
|
|
1932
1912
|
{ length: hSize },
|
|
1933
1913
|
() => Array.from({ length: n }, () => (Math.random() * 2 - 1) * limit)
|
|
@@ -2699,6 +2679,7 @@ var ModelSaver = class _ModelSaver {
|
|
|
2699
2679
|
Adam,
|
|
2700
2680
|
AttentionHead,
|
|
2701
2681
|
BatchNorm,
|
|
2682
|
+
BiasVector,
|
|
2702
2683
|
ClipOptimizer,
|
|
2703
2684
|
ClippedOptimizerFactory,
|
|
2704
2685
|
Conv1D,
|
|
@@ -2727,6 +2708,7 @@ var ModelSaver = class _ModelSaver {
|
|
|
2727
2708
|
crossEntropy,
|
|
2728
2709
|
crossEntropyDelta,
|
|
2729
2710
|
crossEntropyDeltaRaw,
|
|
2711
|
+
defaultOptimizer,
|
|
2730
2712
|
elu,
|
|
2731
2713
|
leakyRelu,
|
|
2732
2714
|
linear,
|