@dniskav/neuron 0.2.5 → 0.2.7
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/README.md +53 -3
- package/dist/index.d.mts +17 -10
- package/dist/index.d.ts +17 -10
- package/dist/index.js +144 -162
- package/dist/index.mjs +142 -162
- package/package.json +1 -1
package/dist/index.mjs
CHANGED
|
@@ -127,6 +127,7 @@ function makeElu(alpha = 1) {
|
|
|
127
127
|
var elu = makeElu(1);
|
|
128
128
|
|
|
129
129
|
// src/optimizers.ts
|
|
130
|
+
var defaultOptimizer = () => new SGD();
|
|
130
131
|
var SGD = class {
|
|
131
132
|
step(weight, gradient, lr) {
|
|
132
133
|
return weight + lr * gradient;
|
|
@@ -175,7 +176,6 @@ var Adam = class {
|
|
|
175
176
|
};
|
|
176
177
|
|
|
177
178
|
// src/NeuronN.ts
|
|
178
|
-
var defaultOptimizer = () => new SGD();
|
|
179
179
|
var NeuronN = class {
|
|
180
180
|
constructor(nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
|
|
181
181
|
const limit = Math.sqrt(1 / nInputs);
|
|
@@ -204,9 +204,8 @@ var NeuronN = class {
|
|
|
204
204
|
};
|
|
205
205
|
|
|
206
206
|
// src/Layer.ts
|
|
207
|
-
var defaultOptimizer2 = () => new SGD();
|
|
208
207
|
var Layer = class {
|
|
209
|
-
constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer2) {
|
|
208
|
+
constructor(nNeurons, nInputs, activation = sigmoid2, optimizerFactory = defaultOptimizer) {
|
|
210
209
|
this.neurons = Array.from(
|
|
211
210
|
{ length: nNeurons },
|
|
212
211
|
() => new NeuronN(nInputs, activation, optimizerFactory)
|
|
@@ -226,7 +225,7 @@ var Network = class {
|
|
|
226
225
|
predict(inputs) {
|
|
227
226
|
validateArray(inputs, this.hiddenLayer.neurons[0].weights.length, "Network.predict");
|
|
228
227
|
const hiddenOut = this.hiddenLayer.predict(inputs);
|
|
229
|
-
return this.outputLayer.predict(hiddenOut)
|
|
228
|
+
return this.outputLayer.predict(hiddenOut);
|
|
230
229
|
}
|
|
231
230
|
// Trains on a single example. Returns the squared error.
|
|
232
231
|
train(inputs, target, lr) {
|
|
@@ -235,22 +234,17 @@ var Network = class {
|
|
|
235
234
|
validateNumber(lr, "Network.train");
|
|
236
235
|
const hiddenOut = this.hiddenLayer.predict(inputs);
|
|
237
236
|
const prediction = this.outputLayer.predict(hiddenOut)[0];
|
|
238
|
-
const outputError = target - prediction;
|
|
239
|
-
const outputDelta = outputError * prediction * (1 - prediction);
|
|
240
237
|
const outputNeuron = this.outputLayer.neurons[0];
|
|
238
|
+
const outputError = target - prediction;
|
|
239
|
+
const outputDelta = outputError * outputNeuron.activation.dfn(prediction);
|
|
241
240
|
const hiddenDeltas = this.hiddenLayer.neurons.map((neuron, i) => {
|
|
242
|
-
const hiddenOut_i = hiddenOut[i];
|
|
243
241
|
const hiddenError = outputDelta * outputNeuron.weights[i];
|
|
244
|
-
return hiddenError * hiddenOut_i * (1 - hiddenOut_i);
|
|
242
|
+
return hiddenError * neuron.activation.dfn(hiddenOut[i]);
|
|
245
243
|
});
|
|
246
244
|
this.hiddenLayer.neurons.forEach((neuron, i) => {
|
|
247
|
-
neuron.
|
|
248
|
-
neuron.bias += lr * hiddenDeltas[i];
|
|
245
|
+
neuron._update(inputs.map((inp) => hiddenDeltas[i] * inp), hiddenDeltas[i], lr);
|
|
249
246
|
});
|
|
250
|
-
outputNeuron.
|
|
251
|
-
(w, i) => w + lr * outputDelta * hiddenOut[i]
|
|
252
|
-
);
|
|
253
|
-
outputNeuron.bias += lr * outputDelta;
|
|
247
|
+
outputNeuron._update(hiddenOut.map((h) => outputDelta * h), outputDelta, lr);
|
|
254
248
|
return outputError * outputError;
|
|
255
249
|
}
|
|
256
250
|
// ── Flat weight serialization ─────────────────────────────────────────────
|
|
@@ -320,13 +314,12 @@ var Dropout = class {
|
|
|
320
314
|
};
|
|
321
315
|
|
|
322
316
|
// src/NetworkN.ts
|
|
323
|
-
var defaultOptimizer3 = () => new SGD();
|
|
324
317
|
var NetworkN = class {
|
|
325
318
|
constructor(structure, options = {}) {
|
|
326
319
|
this.structure = structure;
|
|
327
320
|
const nLayers = structure.length - 1;
|
|
328
321
|
const activations = options.activations ?? Array.from({ length: nLayers }, () => sigmoid2);
|
|
329
|
-
const optimizer = options.optimizer ?? defaultOptimizer3;
|
|
322
|
+
const optimizer = options.optimizer ?? defaultOptimizer;
|
|
330
323
|
const dropoutRate = options.dropoutRate ?? 0;
|
|
331
324
|
if (activations.length !== nLayers) {
|
|
332
325
|
throw new Error(`Expected ${nLayers} activations, got ${activations.length}`);
|
|
@@ -379,73 +372,69 @@ var NetworkN = class {
|
|
|
379
372
|
train(inputs, targets, lr) {
|
|
380
373
|
validateArray(inputs, this.structure[0], "NetworkN.train");
|
|
381
374
|
validateArray(targets, this.structure[this.structure.length - 1], "NetworkN.train");
|
|
382
|
-
const act =
|
|
383
|
-
for (let i = 0; i < this.layers.length; i++) {
|
|
384
|
-
const layerInput = act[act.length - 1];
|
|
385
|
-
const layerOutput = this.layers[i].predict(layerInput);
|
|
386
|
-
let current;
|
|
387
|
-
if (this._shouldResidual(i)) {
|
|
388
|
-
if (this.structure[i] === this.structure[i + 1]) {
|
|
389
|
-
current = layerOutput.map((v, j) => v + layerInput[j]);
|
|
390
|
-
} else {
|
|
391
|
-
current = [...layerOutput];
|
|
392
|
-
}
|
|
393
|
-
} else {
|
|
394
|
-
current = [...layerOutput];
|
|
395
|
-
}
|
|
396
|
-
if (i < this._dropouts.length) {
|
|
397
|
-
current = this._dropouts[i].forward(current, true);
|
|
398
|
-
}
|
|
399
|
-
act.push(current);
|
|
400
|
-
}
|
|
375
|
+
const act = this._forwardAll(inputs, true);
|
|
401
376
|
const pred = act[act.length - 1];
|
|
402
377
|
const outAct = this.layers[this.layers.length - 1].neurons[0].activation;
|
|
403
|
-
|
|
404
|
-
|
|
405
|
-
const layer = this.layers[l];
|
|
406
|
-
if (l < this._dropouts.length) {
|
|
407
|
-
deltas = this._dropouts[l].backward(deltas);
|
|
408
|
-
}
|
|
409
|
-
const layerIn = act[l];
|
|
410
|
-
const prevAct = l > 0 ? this.layers[l - 1].neurons[0].activation : null;
|
|
411
|
-
const prevDeltas = layerIn.map((out, j) => {
|
|
412
|
-
const errProp = layer.neurons.reduce((s, n, k) => s + deltas[k] * n.weights[j], 0);
|
|
413
|
-
return prevAct ? errProp * prevAct.dfn(out) : errProp;
|
|
414
|
-
});
|
|
415
|
-
if (this._shouldResidual(l) && this.structure[l] === this.structure[l + 1]) {
|
|
416
|
-
for (let j = 0; j < prevDeltas.length; j++) {
|
|
417
|
-
prevDeltas[j] += deltas[j];
|
|
418
|
-
}
|
|
419
|
-
}
|
|
420
|
-
layer.neurons.forEach((n, k) => {
|
|
421
|
-
n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
|
|
422
|
-
});
|
|
423
|
-
deltas = prevDeltas;
|
|
424
|
-
}
|
|
378
|
+
const deltas = pred.map((p, i) => (targets[i] - p) * outAct.dfn(p));
|
|
379
|
+
this._backpropLayers(act, deltas, lr);
|
|
425
380
|
return pred.reduce((s, p, i) => s + (targets[i] - p) ** 2, 0) / pred.length;
|
|
426
381
|
}
|
|
427
382
|
// Backprop with externally provided output-layer deltas.
|
|
428
383
|
// Useful for custom loss functions (e.g. physics-based gradients).
|
|
429
384
|
trainWithDeltas(inputs, outputDeltas, lr) {
|
|
385
|
+
const act = this._forwardAll(inputs, true);
|
|
386
|
+
this._backpropLayers(act, outputDeltas, lr);
|
|
387
|
+
}
|
|
388
|
+
// ── Flat weight serialization ─────────────────────────────────────────────
|
|
389
|
+
// Order: layer 0 (all neurons), layer 1, ..., layer N.
|
|
390
|
+
getWeights() {
|
|
391
|
+
for (const d of this._dropouts) d.resetMask();
|
|
392
|
+
const w = [];
|
|
393
|
+
for (const layer of this.layers) {
|
|
394
|
+
for (const n of layer.neurons) {
|
|
395
|
+
w.push(...n.weights, n.bias);
|
|
396
|
+
}
|
|
397
|
+
}
|
|
398
|
+
return w;
|
|
399
|
+
}
|
|
400
|
+
setWeights(weights) {
|
|
401
|
+
for (const d of this._dropouts) d.resetMask();
|
|
402
|
+
let idx = 0;
|
|
403
|
+
for (const layer of this.layers) {
|
|
404
|
+
for (const n of layer.neurons) {
|
|
405
|
+
for (let j = 0; j < n.weights.length; j++) n.weights[j] = weights[idx++];
|
|
406
|
+
n.bias = weights[idx++];
|
|
407
|
+
}
|
|
408
|
+
}
|
|
409
|
+
}
|
|
410
|
+
// ── Private helpers ──────────────────────────────────────────────────────
|
|
411
|
+
_shouldResidual(layerIndex) {
|
|
412
|
+
if (typeof this._residual === "function") return this._residual(layerIndex);
|
|
413
|
+
return this._residual;
|
|
414
|
+
}
|
|
415
|
+
// Forward pass storing activations at every layer boundary.
|
|
416
|
+
// Used by train(), trainWithDeltas(), and predict() shares the same logic.
|
|
417
|
+
_forwardAll(inputs, training) {
|
|
430
418
|
const act = [inputs];
|
|
431
419
|
for (let i = 0; i < this.layers.length; i++) {
|
|
432
420
|
const layerInput = act[act.length - 1];
|
|
433
421
|
const layerOutput = this.layers[i].predict(layerInput);
|
|
434
422
|
let current;
|
|
435
|
-
if (this._shouldResidual(i)) {
|
|
436
|
-
|
|
437
|
-
current = layerOutput.map((v, j) => v + layerInput[j]);
|
|
438
|
-
} else {
|
|
439
|
-
current = [...layerOutput];
|
|
440
|
-
}
|
|
423
|
+
if (this._shouldResidual(i) && this.structure[i] === this.structure[i + 1]) {
|
|
424
|
+
current = layerOutput.map((v, j) => v + layerInput[j]);
|
|
441
425
|
} else {
|
|
442
|
-
current = [...layerOutput];
|
|
426
|
+
current = layerOutput;
|
|
443
427
|
}
|
|
444
428
|
if (i < this._dropouts.length) {
|
|
445
|
-
current = this._dropouts[i].forward(current,
|
|
429
|
+
current = this._dropouts[i].forward(current, training);
|
|
446
430
|
}
|
|
447
431
|
act.push(current);
|
|
448
432
|
}
|
|
433
|
+
return act;
|
|
434
|
+
}
|
|
435
|
+
// Backward pass: updates all layer weights given the pre-computed activations
|
|
436
|
+
// and the initial output-layer deltas.
|
|
437
|
+
_backpropLayers(act, outputDeltas, lr) {
|
|
449
438
|
let deltas = outputDeltas;
|
|
450
439
|
for (let l = this.layers.length - 1; l >= 0; l--) {
|
|
451
440
|
const layer = this.layers[l];
|
|
@@ -459,9 +448,7 @@ var NetworkN = class {
|
|
|
459
448
|
return prevAct ? errProp * prevAct.dfn(out) : errProp;
|
|
460
449
|
});
|
|
461
450
|
if (this._shouldResidual(l) && this.structure[l] === this.structure[l + 1]) {
|
|
462
|
-
for (let j = 0; j < prevDeltas.length; j++) {
|
|
463
|
-
prevDeltas[j] += deltas[j];
|
|
464
|
-
}
|
|
451
|
+
for (let j = 0; j < prevDeltas.length; j++) prevDeltas[j] += deltas[j];
|
|
465
452
|
}
|
|
466
453
|
layer.neurons.forEach((n, k) => {
|
|
467
454
|
n._update(layerIn.map((inp) => deltas[k] * inp), deltas[k], lr);
|
|
@@ -469,33 +456,6 @@ var NetworkN = class {
|
|
|
469
456
|
deltas = prevDeltas;
|
|
470
457
|
}
|
|
471
458
|
}
|
|
472
|
-
// ── Flat weight serialization ─────────────────────────────────────────────
|
|
473
|
-
// Order: layer 0 (all neurons), layer 1, ..., layer N.
|
|
474
|
-
getWeights() {
|
|
475
|
-
for (const d of this._dropouts) d.resetMask();
|
|
476
|
-
const w = [];
|
|
477
|
-
for (const layer of this.layers) {
|
|
478
|
-
for (const n of layer.neurons) {
|
|
479
|
-
w.push(...n.weights, n.bias);
|
|
480
|
-
}
|
|
481
|
-
}
|
|
482
|
-
return w;
|
|
483
|
-
}
|
|
484
|
-
setWeights(weights) {
|
|
485
|
-
for (const d of this._dropouts) d.resetMask();
|
|
486
|
-
let idx = 0;
|
|
487
|
-
for (const layer of this.layers) {
|
|
488
|
-
for (const n of layer.neurons) {
|
|
489
|
-
for (let j = 0; j < n.weights.length; j++) n.weights[j] = weights[idx++];
|
|
490
|
-
n.bias = weights[idx++];
|
|
491
|
-
}
|
|
492
|
-
}
|
|
493
|
-
}
|
|
494
|
-
// ── Helper ───────────────────────────────────────────────────────────────
|
|
495
|
-
_shouldResidual(layerIndex) {
|
|
496
|
-
if (typeof this._residual === "function") return this._residual(layerIndex);
|
|
497
|
-
return this._residual;
|
|
498
|
-
}
|
|
499
459
|
};
|
|
500
460
|
|
|
501
461
|
// src/LSTMLayer.ts
|
|
@@ -510,7 +470,7 @@ var Gate = class {
|
|
|
510
470
|
// shape: [hSize]
|
|
511
471
|
constructor(inputSize, hSize, initBias = 0) {
|
|
512
472
|
const n = inputSize + hSize;
|
|
513
|
-
const limit = Math.sqrt(2 / n);
|
|
473
|
+
const limit = Math.sqrt(2 / (n + hSize));
|
|
514
474
|
this.W = Array.from(
|
|
515
475
|
{ length: hSize },
|
|
516
476
|
() => Array.from({ length: n }, () => (Math.random() * 2 - 1) * limit)
|
|
@@ -709,7 +669,6 @@ var LSTMLayer = class {
|
|
|
709
669
|
};
|
|
710
670
|
|
|
711
671
|
// src/NetworkLSTM.ts
|
|
712
|
-
var defaultOptimizer4 = () => new SGD();
|
|
713
672
|
var NetworkLSTM = class {
|
|
714
673
|
// [T][layer+1][neuron]
|
|
715
674
|
constructor(inputSize, hiddenSize, denseStructure, options = {}) {
|
|
@@ -717,7 +676,7 @@ var NetworkLSTM = class {
|
|
|
717
676
|
this.hiddenSize = hiddenSize;
|
|
718
677
|
this.lstm = new LSTMLayer(inputSize, hiddenSize);
|
|
719
678
|
const activation = options.denseActivation ?? sigmoid2;
|
|
720
|
-
const optimizer = options.optimizer ?? defaultOptimizer4;
|
|
679
|
+
const optimizer = options.optimizer ?? defaultOptimizer;
|
|
721
680
|
this.denseLayers = [];
|
|
722
681
|
const sizes = [hiddenSize, ...denseStructure];
|
|
723
682
|
for (let i = 1; i < sizes.length; i++) {
|
|
@@ -904,6 +863,22 @@ var WeightMatrix = class {
|
|
|
904
863
|
for (let j = 0; j < this.W[i].length; j++) this.W[i][j] = weights[idx++];
|
|
905
864
|
}
|
|
906
865
|
};
|
|
866
|
+
var BiasVector = class {
|
|
867
|
+
constructor(size) {
|
|
868
|
+
this.values = new Array(size).fill(0);
|
|
869
|
+
this.opts = Array.from({ length: size }, () => new Adam());
|
|
870
|
+
}
|
|
871
|
+
update(grad, lr) {
|
|
872
|
+
for (let i = 0; i < this.values.length; i++)
|
|
873
|
+
this.values[i] = this.opts[i].step(this.values[i], grad[i], lr);
|
|
874
|
+
}
|
|
875
|
+
getWeights() {
|
|
876
|
+
return [...this.values];
|
|
877
|
+
}
|
|
878
|
+
setWeights(weights) {
|
|
879
|
+
for (let i = 0; i < this.values.length; i++) this.values[i] = weights[i];
|
|
880
|
+
}
|
|
881
|
+
};
|
|
907
882
|
var EmbeddingMatrix = class {
|
|
908
883
|
constructor(vocabSize, d_model) {
|
|
909
884
|
const limit = Math.sqrt(1 / d_model);
|
|
@@ -989,6 +964,7 @@ var AttentionHead = class {
|
|
|
989
964
|
// 5. dWq = dQ^T @ X, dWk = dK^T @ X, dWv = dV^T @ X
|
|
990
965
|
// 6. dX = dQ @ Wq + dK @ Wk + dV @ Wv
|
|
991
966
|
backward(dOut, lr) {
|
|
967
|
+
if (!this.cache) throw new Error("AttentionHead.backward() called before predict()");
|
|
992
968
|
const { X, Q, K, V, attn } = this.cache;
|
|
993
969
|
const seqLen = X.length;
|
|
994
970
|
const d_model = X[0].length;
|
|
@@ -1116,6 +1092,7 @@ var MultiHeadAttention = class {
|
|
|
1116
1092
|
// ── Backward ──────────────────────────────────────────────────────────────
|
|
1117
1093
|
// dOut: seqLen × d_model → dX: seqLen × d_model
|
|
1118
1094
|
backward(dOut, lr) {
|
|
1095
|
+
if (!this._concat) throw new Error("MultiHeadAttention.backward() called before predict()");
|
|
1119
1096
|
const seqLen = dOut.length;
|
|
1120
1097
|
const concatD = this.nHeads * this.d_k;
|
|
1121
1098
|
const d_model = this.d_model;
|
|
@@ -1220,11 +1197,12 @@ var LayerNorm = class {
|
|
|
1220
1197
|
backwardOne(dOut, pos, lr) {
|
|
1221
1198
|
const { x_norm, std } = this._cache[pos];
|
|
1222
1199
|
const N = dOut.length;
|
|
1200
|
+
const gammaOld = this.gamma.slice();
|
|
1223
1201
|
for (let i = 0; i < N; i++) {
|
|
1224
1202
|
this.gamma[i] += lr * dOut[i] * x_norm[i];
|
|
1225
1203
|
this.beta[i] += lr * dOut[i];
|
|
1226
1204
|
}
|
|
1227
|
-
const D = dOut.map((d, i) => d *
|
|
1205
|
+
const D = dOut.map((d, i) => d * gammaOld[i]);
|
|
1228
1206
|
const mD = D.reduce((s, v) => s + v, 0) / N;
|
|
1229
1207
|
const mDxn = D.reduce((s, d, i) => s + d * x_norm[i], 0) / N;
|
|
1230
1208
|
return D.map((d, i) => (d - mD - x_norm[i] * mDxn) / std);
|
|
@@ -1244,6 +1222,7 @@ var LayerNorm = class {
|
|
|
1244
1222
|
// src/TransformerBlock.ts
|
|
1245
1223
|
var TransformerBlock = class {
|
|
1246
1224
|
constructor({ d_model, nHeads, d_ff, causal = false }) {
|
|
1225
|
+
// d_model
|
|
1247
1226
|
// Forward caches (needed for backprop)
|
|
1248
1227
|
this._X = null;
|
|
1249
1228
|
this._attnOut = null;
|
|
@@ -1260,10 +1239,8 @@ var TransformerBlock = class {
|
|
|
1260
1239
|
this.norm2 = new LayerNorm(d_model);
|
|
1261
1240
|
this.ff1 = new WeightMatrix(d_ff, d_model);
|
|
1262
1241
|
this.ff2 = new WeightMatrix(d_model, d_ff);
|
|
1263
|
-
this.b1 = new
|
|
1264
|
-
this.b2 = new
|
|
1265
|
-
this.b1Opts = Array.from({ length: d_ff }, () => new Adam());
|
|
1266
|
-
this.b2Opts = Array.from({ length: d_model }, () => new Adam());
|
|
1242
|
+
this.b1 = new BiasVector(d_ff);
|
|
1243
|
+
this.b2 = new BiasVector(d_model);
|
|
1267
1244
|
}
|
|
1268
1245
|
// ── Forward ───────────────────────────────────────────────────────────────
|
|
1269
1246
|
// X: seqLen × d_model → out: seqLen × d_model
|
|
@@ -1276,11 +1253,11 @@ var TransformerBlock = class {
|
|
|
1276
1253
|
return this.norm1.predictOne(added, i);
|
|
1277
1254
|
});
|
|
1278
1255
|
const ff1Pre = h1.map(
|
|
1279
|
-
(h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1[k]))
|
|
1256
|
+
(h) => this.ff1.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b1.values[k]))
|
|
1280
1257
|
);
|
|
1281
1258
|
const ff1Out = ff1Pre.map((pre) => pre.map((v) => Math.max(0, v)));
|
|
1282
1259
|
const ff2Out = ff1Out.map(
|
|
1283
|
-
(h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2[k]))
|
|
1260
|
+
(h) => this.ff2.W.map((row, k) => row.reduce((s, w, m) => s + w * h[m], this.b2.values[k]))
|
|
1284
1261
|
);
|
|
1285
1262
|
this.norm2.resetCache(seqLen);
|
|
1286
1263
|
const out = h1.map((h, i) => {
|
|
@@ -1298,6 +1275,9 @@ var TransformerBlock = class {
|
|
|
1298
1275
|
// ── Backward ──────────────────────────────────────────────────────────────
|
|
1299
1276
|
// dOut: seqLen × d_model → dX: seqLen × d_model
|
|
1300
1277
|
backward(dOut, lr) {
|
|
1278
|
+
if (!this._h1 || !this._ff1Out || !this._ff1Pre) {
|
|
1279
|
+
throw new Error("TransformerBlock.backward() called before predict()");
|
|
1280
|
+
}
|
|
1301
1281
|
const seqLen = dOut.length;
|
|
1302
1282
|
const d_model = this.d_model;
|
|
1303
1283
|
const h1 = this._h1;
|
|
@@ -1322,8 +1302,7 @@ var TransformerBlock = class {
|
|
|
1322
1302
|
(_, m) => dAdded2.reduce((s, da) => s + da[m], 0)
|
|
1323
1303
|
);
|
|
1324
1304
|
this.ff2.update(dW2, lr);
|
|
1325
|
-
|
|
1326
|
-
this.b2[m] = this.b2Opts[m].step(this.b2[m], db2[m], lr);
|
|
1305
|
+
this.b2.update(db2, lr);
|
|
1327
1306
|
const dFf1Pre = dFf1Out.map(
|
|
1328
1307
|
(d, i) => d.map((v, k) => ff1Pre[i][k] > 0 ? v : 0)
|
|
1329
1308
|
);
|
|
@@ -1345,8 +1324,7 @@ var TransformerBlock = class {
|
|
|
1345
1324
|
(_, k) => dFf1Pre.reduce((s, dp) => s + dp[k], 0)
|
|
1346
1325
|
);
|
|
1347
1326
|
this.ff1.update(dW1, lr);
|
|
1348
|
-
|
|
1349
|
-
this.b1[k] = this.b1Opts[k].step(this.b1[k], db1[k], lr);
|
|
1327
|
+
this.b1.update(db1, lr);
|
|
1350
1328
|
const dH1 = Array.from(
|
|
1351
1329
|
{ length: seqLen },
|
|
1352
1330
|
(_, i) => dH1_fromFf[i].map((v, m) => v + dAdded2[i][m])
|
|
@@ -1375,9 +1353,9 @@ var TransformerBlock = class {
|
|
|
1375
1353
|
w.push(...this.attn.getWeights());
|
|
1376
1354
|
w.push(...this.norm1.gamma, ...this.norm1.beta);
|
|
1377
1355
|
for (const row of this.ff1.W) w.push(...row);
|
|
1378
|
-
w.push(...this.b1);
|
|
1356
|
+
w.push(...this.b1.values);
|
|
1379
1357
|
for (const row of this.ff2.W) w.push(...row);
|
|
1380
|
-
w.push(...this.b2);
|
|
1358
|
+
w.push(...this.b2.values);
|
|
1381
1359
|
w.push(...this.norm2.gamma, ...this.norm2.beta);
|
|
1382
1360
|
return w;
|
|
1383
1361
|
}
|
|
@@ -1386,16 +1364,17 @@ var TransformerBlock = class {
|
|
|
1386
1364
|
const attnLen = this.attn.getWeights().length;
|
|
1387
1365
|
this.attn.setWeights(weights.slice(idx, idx + attnLen));
|
|
1388
1366
|
idx += attnLen;
|
|
1389
|
-
|
|
1390
|
-
|
|
1391
|
-
|
|
1392
|
-
|
|
1393
|
-
|
|
1394
|
-
|
|
1395
|
-
|
|
1396
|
-
|
|
1397
|
-
|
|
1398
|
-
|
|
1367
|
+
this.norm1.setWeights(weights.slice(idx, idx + this.norm1.getWeights().length));
|
|
1368
|
+
idx += this.norm1.getWeights().length;
|
|
1369
|
+
this.ff1.setWeights(weights.slice(idx, idx + this.ff1.getWeights().length));
|
|
1370
|
+
idx += this.ff1.getWeights().length;
|
|
1371
|
+
this.b1.setWeights(weights.slice(idx, idx + this.b1.values.length));
|
|
1372
|
+
idx += this.b1.values.length;
|
|
1373
|
+
this.ff2.setWeights(weights.slice(idx, idx + this.ff2.getWeights().length));
|
|
1374
|
+
idx += this.ff2.getWeights().length;
|
|
1375
|
+
this.b2.setWeights(weights.slice(idx, idx + this.b2.values.length));
|
|
1376
|
+
idx += this.b2.values.length;
|
|
1377
|
+
this.norm2.setWeights(weights.slice(idx, idx + this.norm2.getWeights().length));
|
|
1399
1378
|
}
|
|
1400
1379
|
};
|
|
1401
1380
|
|
|
@@ -1421,8 +1400,7 @@ var NetworkTransformer = class {
|
|
|
1421
1400
|
() => new TransformerBlock({ d_model, nHeads, d_ff })
|
|
1422
1401
|
);
|
|
1423
1402
|
this.outputProj = new WeightMatrix(nClasses, d_model);
|
|
1424
|
-
this.outputBias = new
|
|
1425
|
-
this.outBiasOpts = Array.from({ length: nClasses }, () => new Adam());
|
|
1403
|
+
this.outputBias = new BiasVector(nClasses);
|
|
1426
1404
|
}
|
|
1427
1405
|
// ── Forward pass ──────────────────────────────────────────────────────────
|
|
1428
1406
|
// tokens: seqLen integer ids → seqLen * nClasses logits (flattened)
|
|
@@ -1430,7 +1408,7 @@ var NetworkTransformer = class {
|
|
|
1430
1408
|
const h = this._forward(tokens);
|
|
1431
1409
|
return h.flatMap(
|
|
1432
1410
|
(hi) => this.outputProj.W.map(
|
|
1433
|
-
(row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
|
|
1411
|
+
(row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
|
|
1434
1412
|
)
|
|
1435
1413
|
);
|
|
1436
1414
|
}
|
|
@@ -1444,7 +1422,7 @@ var NetworkTransformer = class {
|
|
|
1444
1422
|
const h = this._forward(tokens);
|
|
1445
1423
|
const logits = h.map(
|
|
1446
1424
|
(hi) => this.outputProj.W.map(
|
|
1447
|
-
(row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias[c])
|
|
1425
|
+
(row, c) => row.reduce((s, w, m) => s + w * hi[m], this.outputBias.values[c])
|
|
1448
1426
|
)
|
|
1449
1427
|
);
|
|
1450
1428
|
let loss = 0;
|
|
@@ -1479,8 +1457,7 @@ var NetworkTransformer = class {
|
|
|
1479
1457
|
(_, c) => dLogits.reduce((s, dl) => s + dl[c], 0)
|
|
1480
1458
|
);
|
|
1481
1459
|
this.outputProj.update(dWout, lr);
|
|
1482
|
-
|
|
1483
|
-
this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
|
|
1460
|
+
this.outputBias.update(dBout, lr);
|
|
1484
1461
|
let dX = dH;
|
|
1485
1462
|
for (let b = this.blocks.length - 1; b >= 0; b--)
|
|
1486
1463
|
dX = this.blocks[b].backward(dX, lr);
|
|
@@ -1499,27 +1476,30 @@ var NetworkTransformer = class {
|
|
|
1499
1476
|
// Order: tokenEmb, posEmb, block0, block1, ..., blockN, outputProj, outputBias.
|
|
1500
1477
|
getWeights() {
|
|
1501
1478
|
const w = [];
|
|
1502
|
-
|
|
1503
|
-
|
|
1479
|
+
w.push(...this.tokenEmb.getWeights());
|
|
1480
|
+
w.push(...this.posEmb.getWeights());
|
|
1504
1481
|
for (const block of this.blocks) w.push(...block.getWeights());
|
|
1505
|
-
|
|
1506
|
-
w.push(...this.outputBias);
|
|
1482
|
+
w.push(...this.outputProj.getWeights());
|
|
1483
|
+
w.push(...this.outputBias.getWeights());
|
|
1507
1484
|
return w;
|
|
1508
1485
|
}
|
|
1509
1486
|
setWeights(weights) {
|
|
1510
1487
|
let idx = 0;
|
|
1511
|
-
|
|
1512
|
-
|
|
1513
|
-
|
|
1514
|
-
|
|
1488
|
+
const tokenEmbLen = this.tokenEmb.getWeights().length;
|
|
1489
|
+
this.tokenEmb.setWeights(weights.slice(idx, idx + tokenEmbLen));
|
|
1490
|
+
idx += tokenEmbLen;
|
|
1491
|
+
const posEmbLen = this.posEmb.getWeights().length;
|
|
1492
|
+
this.posEmb.setWeights(weights.slice(idx, idx + posEmbLen));
|
|
1493
|
+
idx += posEmbLen;
|
|
1515
1494
|
for (const block of this.blocks) {
|
|
1516
1495
|
const blockLen = block.getWeights().length;
|
|
1517
1496
|
block.setWeights(weights.slice(idx, idx + blockLen));
|
|
1518
1497
|
idx += blockLen;
|
|
1519
1498
|
}
|
|
1520
|
-
|
|
1521
|
-
|
|
1522
|
-
|
|
1499
|
+
const outProjLen = this.outputProj.getWeights().length;
|
|
1500
|
+
this.outputProj.setWeights(weights.slice(idx, idx + outProjLen));
|
|
1501
|
+
idx += outProjLen;
|
|
1502
|
+
this.outputBias.setWeights(weights.slice(idx, idx + this.outputBias.values.length));
|
|
1523
1503
|
}
|
|
1524
1504
|
// ── Internal ──────────────────────────────────────────────────────────────
|
|
1525
1505
|
// Shared embedding + block forward pass.
|
|
@@ -1561,8 +1541,7 @@ var NetworkTransformerRL = class {
|
|
|
1561
1541
|
() => new TransformerBlock({ d_model, nHeads, d_ff, causal: true })
|
|
1562
1542
|
);
|
|
1563
1543
|
this.outputProj = new WeightMatrix(nActions, d_model);
|
|
1564
|
-
this.outputBias = new
|
|
1565
|
-
this.outBiasOpts = Array.from({ length: nActions }, () => new Adam());
|
|
1544
|
+
this.outputBias = new BiasVector(nActions);
|
|
1566
1545
|
}
|
|
1567
1546
|
// ── Forward ────────────────────────────────────────────────────────────────
|
|
1568
1547
|
// sequence: seqLen × inputDim → nActions Q-values
|
|
@@ -1570,7 +1549,7 @@ var NetworkTransformerRL = class {
|
|
|
1570
1549
|
const h = this._forward(sequence);
|
|
1571
1550
|
const pooled = this._pool(h);
|
|
1572
1551
|
return this.outputProj.W.map(
|
|
1573
|
-
(row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
|
|
1552
|
+
(row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
|
|
1574
1553
|
);
|
|
1575
1554
|
}
|
|
1576
1555
|
// ── Training ────────────────────────────────────────────────────────────────
|
|
@@ -1582,7 +1561,7 @@ var NetworkTransformerRL = class {
|
|
|
1582
1561
|
const h = this._forward(sequence);
|
|
1583
1562
|
const pooled = this._pool(h);
|
|
1584
1563
|
const pred = this.outputProj.W.map(
|
|
1585
|
-
(row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias[c])
|
|
1564
|
+
(row, c) => row.reduce((s, w, m) => s + w * pooled[m], this.outputBias.values[c])
|
|
1586
1565
|
);
|
|
1587
1566
|
const n = this.nActions;
|
|
1588
1567
|
let loss = 0;
|
|
@@ -1605,8 +1584,7 @@ var NetworkTransformerRL = class {
|
|
|
1605
1584
|
);
|
|
1606
1585
|
const dBout = dPred.slice();
|
|
1607
1586
|
this.outputProj.update(dWout, lr);
|
|
1608
|
-
|
|
1609
|
-
this.outputBias[c] = this.outBiasOpts[c].step(this.outputBias[c], dBout[c], lr);
|
|
1587
|
+
this.outputBias.update(dBout, lr);
|
|
1610
1588
|
let dH = this._distributePoolGradient(dPooled);
|
|
1611
1589
|
for (let b = this.blocks.length - 1; b >= 0; b--)
|
|
1612
1590
|
dH = this.blocks[b].backward(dH, lr);
|
|
@@ -1630,24 +1608,26 @@ var NetworkTransformerRL = class {
|
|
|
1630
1608
|
// Order: inputProj, block0, block1, ..., blockN, outputProj, outputBias.
|
|
1631
1609
|
getWeightsFlat() {
|
|
1632
1610
|
const w = [];
|
|
1633
|
-
|
|
1611
|
+
w.push(...this.inputProj.getWeights());
|
|
1634
1612
|
for (const block of this.blocks) w.push(...block.getWeights());
|
|
1635
|
-
|
|
1636
|
-
w.push(...this.outputBias);
|
|
1613
|
+
w.push(...this.outputProj.getWeights());
|
|
1614
|
+
w.push(...this.outputBias.getWeights());
|
|
1637
1615
|
return w;
|
|
1638
1616
|
}
|
|
1639
1617
|
setWeightsFlat(weights) {
|
|
1640
1618
|
let idx = 0;
|
|
1641
|
-
|
|
1642
|
-
|
|
1619
|
+
const inputProjLen = this.inputProj.getWeights().length;
|
|
1620
|
+
this.inputProj.setWeights(weights.slice(idx, idx + inputProjLen));
|
|
1621
|
+
idx += inputProjLen;
|
|
1643
1622
|
for (const block of this.blocks) {
|
|
1644
1623
|
const blockLen = block.getWeights().length;
|
|
1645
1624
|
block.setWeights(weights.slice(idx, idx + blockLen));
|
|
1646
1625
|
idx += blockLen;
|
|
1647
1626
|
}
|
|
1648
|
-
|
|
1649
|
-
|
|
1650
|
-
|
|
1627
|
+
const outProjLen = this.outputProj.getWeights().length;
|
|
1628
|
+
this.outputProj.setWeights(weights.slice(idx, idx + outProjLen));
|
|
1629
|
+
idx += outProjLen;
|
|
1630
|
+
this.outputBias.setWeights(weights.slice(idx, idx + this.outputBias.values.length));
|
|
1651
1631
|
}
|
|
1652
1632
|
getWeightsStructured() {
|
|
1653
1633
|
return {
|
|
@@ -1665,17 +1645,15 @@ var NetworkTransformerRL = class {
|
|
|
1665
1645
|
norm2: { gamma: [...b.norm2.gamma], beta: [...b.norm2.beta] },
|
|
1666
1646
|
ff1: b.ff1.W.map((r) => [...r]),
|
|
1667
1647
|
ff2: b.ff2.W.map((r) => [...r]),
|
|
1668
|
-
b1: [...b.b1],
|
|
1669
|
-
b2: [...b.b2]
|
|
1648
|
+
b1: [...b.b1.values],
|
|
1649
|
+
b2: [...b.b2.values]
|
|
1670
1650
|
})),
|
|
1671
1651
|
outputProj: this.outputProj.W.map((r) => [...r]),
|
|
1672
|
-
outputBias: [...this.outputBias]
|
|
1652
|
+
outputBias: [...this.outputBias.values]
|
|
1673
1653
|
};
|
|
1674
1654
|
}
|
|
1675
1655
|
setWeightsStructured(data) {
|
|
1676
|
-
data.inputProj.
|
|
1677
|
-
this.inputProj.W[i] = [...row];
|
|
1678
|
-
});
|
|
1656
|
+
this.inputProj.setWeights(data.inputProj.flat());
|
|
1679
1657
|
data.blocks.forEach((bd, b) => {
|
|
1680
1658
|
const blk = this.blocks[b];
|
|
1681
1659
|
bd.attn.heads.forEach((hd, h) => {
|
|
@@ -1690,11 +1668,11 @@ var NetworkTransformerRL = class {
|
|
|
1690
1668
|
blk.norm2.beta = [...bd.norm2.beta];
|
|
1691
1669
|
blk.ff1.W = bd.ff1.map((r) => [...r]);
|
|
1692
1670
|
blk.ff2.W = bd.ff2.map((r) => [...r]);
|
|
1693
|
-
blk.b1
|
|
1694
|
-
blk.b2
|
|
1671
|
+
blk.b1.setWeights(bd.b1);
|
|
1672
|
+
blk.b2.setWeights(bd.b2);
|
|
1695
1673
|
});
|
|
1696
1674
|
this.outputProj.W = data.outputProj.map((r) => [...r]);
|
|
1697
|
-
this.outputBias
|
|
1675
|
+
this.outputBias.setWeights(data.outputBias);
|
|
1698
1676
|
}
|
|
1699
1677
|
// ── Serializable interface (flat array) ────────────────────────────────────
|
|
1700
1678
|
// These satisfy the Serializable interface from ModelSaver, which requires
|
|
@@ -1853,7 +1831,7 @@ function tanhFn(x) {
|
|
|
1853
1831
|
var Gate2 = class {
|
|
1854
1832
|
constructor(inputSize, hSize, initBias = 0) {
|
|
1855
1833
|
const n = inputSize + hSize;
|
|
1856
|
-
const limit = Math.sqrt(2 / n);
|
|
1834
|
+
const limit = Math.sqrt(2 / (n + hSize));
|
|
1857
1835
|
this.W = Array.from(
|
|
1858
1836
|
{ length: hSize },
|
|
1859
1837
|
() => Array.from({ length: n }, () => (Math.random() * 2 - 1) * limit)
|
|
@@ -2624,6 +2602,7 @@ export {
|
|
|
2624
2602
|
Adam,
|
|
2625
2603
|
AttentionHead,
|
|
2626
2604
|
BatchNorm,
|
|
2605
|
+
BiasVector,
|
|
2627
2606
|
ClipOptimizer,
|
|
2628
2607
|
ClippedOptimizerFactory,
|
|
2629
2608
|
Conv1D,
|
|
@@ -2652,6 +2631,7 @@ export {
|
|
|
2652
2631
|
crossEntropy,
|
|
2653
2632
|
crossEntropyDelta,
|
|
2654
2633
|
crossEntropyDeltaRaw,
|
|
2634
|
+
defaultOptimizer,
|
|
2655
2635
|
elu,
|
|
2656
2636
|
leakyRelu,
|
|
2657
2637
|
linear,
|
package/package.json
CHANGED