scalar-autograd 0.1.3 → 0.1.5
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- package/dist/Losses.js +145 -0
- package/dist/Losses.spec.js +54 -0
- package/dist/Optimizers.edge-cases.spec.js +29 -0
- package/dist/Optimizers.js +177 -0
- package/dist/Optimizers.spec.js +56 -0
- package/dist/V.js +0 -0
- package/dist/Value.edge-cases.spec.js +54 -0
- package/dist/Value.grad-flow.spec.js +24 -0
- package/dist/Value.js +424 -0
- package/dist/Value.losses-edge-cases.spec.js +30 -0
- package/dist/Value.memory.spec.js +23 -0
- package/dist/Value.nn.spec.js +111 -0
- package/dist/Value.spec.js +245 -0
- package/dist/ValueActivation.js +34 -0
- package/dist/ValueArithmetic.js +180 -0
- package/dist/ValueComparison.js +47 -0
- package/dist/ValueTrig.js +49 -0
- package/package.json +4 -12
- package/Losses.ts +0 -145
- package/Optimizers.ts +0 -222
- package/V.ts +0 -0
- package/Value.edge-cases.spec.ts +0 -60
- package/Value.grad-flow.spec.ts +0 -24
- package/Value.losses-edge-cases.spec.ts +0 -32
- package/Value.memory.spec.ts +0 -25
- package/Value.nn.spec.ts +0 -109
- package/Value.spec.ts +0 -268
- package/Value.ts +0 -460
- package/ValueActivation.ts +0 -51
- package/ValueArithmetic.ts +0 -272
- package/ValueComparison.ts +0 -85
- package/ValueTrig.ts +0 -70
package/dist/Losses.js
ADDED
@@ -0,0 +1,145 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.Losses = void 0;
+const Value_1 = require("./Value");
+const V_1 = require("./V");
+/**
+ * Throws an error if outputs and targets length do not match.
+ * @param outputs Array of output Values.
+ * @param targets Array of target Values.
+ */
+function checkLengthMatch(outputs, targets) {
+    if (outputs.length !== targets.length) {
+        throw new Error('Outputs and targets must have the same length');
+    }
+}
+class Losses {
+    /**
+     * Computes mean squared error (MSE) loss between outputs and targets.
+     * @param outputs Array of Value predictions.
+     * @param targets Array of Value targets.
+     * @returns Mean squared error as a Value.
+     */
+    static mse(outputs, targets) {
+        checkLengthMatch(outputs, targets);
+        if (!Array.isArray(outputs) || !Array.isArray(targets))
+            throw new TypeError('mse expects Value[] for both arguments.');
+        if (!outputs.length)
+            return new Value_1.Value(0);
+        const diffs = outputs.map((out, i) => out.sub(targets[i]).square());
+        return Value_1.Value.mean(diffs);
+    }
+    /**
+     * Computes mean absolute error (MAE) loss between outputs and targets.
+     * @param outputs Array of Value predictions.
+     * @param targets Array of Value targets.
+     * @returns Mean absolute error as a Value.
+     */
+    static mae(outputs, targets) {
+        checkLengthMatch(outputs, targets);
+        if (!Array.isArray(outputs) || !Array.isArray(targets))
+            throw new TypeError('mae expects Value[] for both arguments.');
+        if (!outputs.length)
+            return new Value_1.Value(0);
+        const diffs = outputs.map((out, i) => out.sub(targets[i]).abs());
+        return Value_1.Value.mean(diffs);
+    }
+    static EPS = 1e-12;
+    /**
+     * Computes binary cross-entropy loss between predicted outputs and targets (after sigmoid).
+     * @param outputs Array of Value predictions (expected in (0,1)).
+     * @param targets Array of Value targets (typically 0 or 1).
+     * @returns Binary cross-entropy loss as a Value.
+     */
+    static binaryCrossEntropy(outputs, targets) {
+        checkLengthMatch(outputs, targets);
+        if (!Array.isArray(outputs) || !Array.isArray(targets))
+            throw new TypeError('binaryCrossEntropy expects Value[] for both arguments.');
+        if (!outputs.length)
+            return new Value_1.Value(0);
+        const eps = Losses.EPS;
+        const one = new Value_1.Value(1);
+        const losses = outputs.map((out, i) => {
+            const t = targets[i];
+            const outClamped = out.clamp(eps, 1 - eps); // sigmoid should output (0,1)
+            return t.mul(outClamped.log()).add(one.sub(t).mul(one.sub(outClamped).log()));
+        });
+        return Value_1.Value.mean(losses).mul(-1);
+    }
+    /**
+     * Computes categorical cross-entropy loss between outputs (logits) and integer target classes.
+     * @param outputs Array of Value logits for each class.
+     * @param targets Array of integer class indices (0-based, one per sample).
+     * @returns Categorical cross-entropy loss as a Value.
+     */
+    static categoricalCrossEntropy(outputs, targets) {
+        // targets: integer encoded class indices
+        if (!Array.isArray(outputs) || !Array.isArray(targets))
+            throw new TypeError('categoricalCrossEntropy expects Value[] and number[].');
+        if (!outputs.length || !targets.length)
+            return new Value_1.Value(0);
+        if (targets.some(t => typeof t !== 'number' || !isFinite(t) || t < 0 || t >= outputs.length || Math.floor(t) !== t)) {
+            throw new Error('Target indices must be valid integers in [0, outputs.length)');
+        }
+        const eps = Losses.EPS;
+        const maxLogit = outputs.reduce((a, b) => a.data > b.data ? a : b);
+        const exps = outputs.map(out => out.sub(maxLogit).exp());
+        const sumExp = Value_1.Value.sum(exps).add(eps);
+        const softmax = exps.map(e => e.div(sumExp));
+        const tIndices = targets.map((t, i) => softmax[t]);
+        return Value_1.Value.mean(tIndices.map(sm => sm.add(eps).log().mul(-1)));
+    }
+    /**
+     * Computes Huber loss between outputs and targets.
+     * Combines quadratic loss for small residuals and linear loss for large residuals.
+     * @param outputs Array of Value predictions.
+     * @param targets Array of Value targets.
+     * @param delta Threshold at which to switch from quadratic to linear (default: 1.0).
+     * @returns Huber loss as a Value.
+     */
+    static huber(outputs, targets, delta = 1.0) {
+        checkLengthMatch(outputs, targets);
+        if (!Array.isArray(outputs) || !Array.isArray(targets))
+            throw new TypeError('huber expects Value[] for both arguments.');
+        if (!outputs.length)
+            return new Value_1.Value(0);
+        const deltaValue = new Value_1.Value(delta);
+        const half = new Value_1.Value(0.5);
+        const losses = outputs.map((out, i) => {
+            const residual = V_1.V.abs(V_1.V.sub(out, targets[i]));
+            const condition = V_1.V.lt(residual, deltaValue);
+            const quadraticLoss = V_1.V.mul(half, V_1.V.square(residual));
+            const linearLoss = V_1.V.mul(deltaValue, V_1.V.sub(residual, V_1.V.mul(half, deltaValue)));
+            return V_1.V.ifThenElse(condition, quadraticLoss, linearLoss);
+        });
+        return V_1.V.mean(losses);
+    }
+    /**
+     * Computes Tukey loss between outputs and targets.
+     * This robust loss function saturates for large residuals.
+     *
+     * @param outputs Array of Value predictions.
+     * @param targets Array of Value targets.
+     * @param c Threshold constant (typically 4.685).
+     * @returns Tukey loss as a Value.
+     */
+    static tukey(outputs, targets, c = 4.685) {
+        checkLengthMatch(outputs, targets);
+        const c2_over_6 = (c * c) / 6;
+        const cValue = V_1.V.C(c);
+        const c2_over_6_Value = V_1.V.C(c2_over_6);
+        const losses = outputs.map((out, i) => {
+            const diff = V_1.V.abs(V_1.V.sub(out, targets[i]));
+            const inlier = V_1.V.lte(diff, cValue);
+            const rc = V_1.V.div(diff, cValue);
+            const rc2 = V_1.V.square(rc);
+            const oneMinusRC2 = V_1.V.sub(1, rc2);
+            const inner = V_1.V.pow(oneMinusRC2, 3);
+            const inlierLoss = V_1.V.mul(c2_over_6_Value, V_1.V.sub(1, inner));
+            const loss = V_1.V.ifThenElse(inlier, inlierLoss, c2_over_6_Value);
+            return loss;
+        });
+        return V_1.V.mean(losses);
+    }
+}
+exports.Losses = Losses;
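For orientation, here is a minimal usage sketch of the Losses API added above. It mirrors the calls made in the bundled spec files and requires the dist modules directly; the package's public entry point is not shown in this diff, so treat the require paths as an assumption.

    const { Value } = require("./Value");
    const { Losses } = require("./Losses");

    // Two trainable predictions and two fixed targets.
    const p1 = new Value(2, "p1", true);
    const p2 = new Value(3, "p2", true);
    const t1 = new Value(5, "t1");
    const t2 = new Value(1, "t2");

    // mean((2-5)^2, (3-1)^2) = 6.5
    const loss = Losses.mse([p1, p2], [t1, t2]);
    console.log(loss.data); // 6.5

    // Backprop fills .grad on the trainable inputs:
    // d(loss)/dp1 = (2-5) = -3, d(loss)/dp2 = (3-1) = 2.
    loss.backward();
    console.log(p1.grad, p2.grad); // -3 2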
package/dist/Losses.spec.js
ADDED
@@ -0,0 +1,54 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+const Value_1 = require("./Value");
+const Losses_1 = require("./Losses");
+describe("Losses", () => {
+    it("mse computes value and gradients correctly", () => {
+        const x = new Value_1.Value(2, "x", true);
+        const y = new Value_1.Value(3, "y", true);
+        const tx = new Value_1.Value(5, "tx");
+        const ty = new Value_1.Value(1, "ty");
+        const loss = Losses_1.Losses.mse([x, y], [tx, ty]); // (1/2)*((2-5)^2 + (3-1)^2) = (1/2)*(9+4) = 6.5
+        expect(loss.data).toBeCloseTo(6.5);
+        loss.backward();
+        expect(x.grad).toBeCloseTo(-3);
+        expect(y.grad).toBeCloseTo(2);
+    });
+    it("mae computes value and gradients correctly", () => {
+        const x = new Value_1.Value(2, "x", true);
+        const y = new Value_1.Value(-3, "y", true);
+        const tx = new Value_1.Value(5, "tx");
+        const ty = new Value_1.Value(2, "ty");
+        const loss = Losses_1.Losses.mae([x, y], [tx, ty]); // (1/2)*(abs(2-5)+abs(-3-2)) = (1/2)*(3+5)=4
+        expect(loss.data).toBeCloseTo(4);
+        loss.backward();
+        expect(x.grad).toBeCloseTo(-0.5);
+        expect(y.grad).toBeCloseTo(-0.5);
+    });
+    it("binaryCrossEntropy computes value and gradients correctly for easy case", () => {
+        const out = new Value_1.Value(0.9, "out", true);
+        const target = new Value_1.Value(1, "target");
+        const loss = Losses_1.Losses.binaryCrossEntropy([out], [target]);
+        expect(loss.data).toBeCloseTo(-Math.log(0.9));
+        loss.backward();
+        expect(out.grad).toBeCloseTo(-1 / 0.9, 4);
+    });
+    it("categoricalCrossEntropy computes value and gradients (softmax+NLL)", () => {
+        // logits: [2, 1, 0], true = 0
+        const a = new Value_1.Value(2, "a", true);
+        const b = new Value_1.Value(1, "b", true);
+        const c = new Value_1.Value(0, "c", true);
+        const targets = [0];
+        const loss = Losses_1.Losses.categoricalCrossEntropy([a, b, c], targets);
+        const softmax = [
+            Math.exp(2) / (Math.exp(2) + Math.exp(1) + Math.exp(0)),
+            Math.exp(1) / (Math.exp(2) + Math.exp(1) + Math.exp(0)),
+            Math.exp(0) / (Math.exp(2) + Math.exp(1) + Math.exp(0))
+        ];
+        expect(loss.data).toBeCloseTo(-Math.log(softmax[0]), 4);
+        loss.backward();
+        expect(a.grad).toBeCloseTo(softmax[0] - 1, 4);
+        expect(b.grad).toBeCloseTo(softmax[1], 4);
+        expect(c.grad).toBeCloseTo(softmax[2], 4);
+    });
+});
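Aside: the gradient expectations in the categoricalCrossEntropy test above follow from the standard softmax cross-entropy identity, a general fact rather than anything specific to this package. For logits z_k, softmax probabilities p_k, and true class t:

    \frac{\partial}{\partial z_k}\bigl(-\log p_t\bigr) = p_k - \mathbf{1}[k = t],
    \qquad p_k = \frac{e^{z_k}}{\sum_j e^{z_j}}

With logits [2, 1, 0] and true class 0, the expected gradients are softmax[0] - 1, softmax[1], and softmax[2], which is exactly what the assertions check.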
package/dist/Optimizers.edge-cases.spec.js
ADDED
@@ -0,0 +1,29 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+const Value_1 = require("./Value");
+const Optimizers_1 = require("./Optimizers");
+describe('Optimizer edge cases', () => {
+    it('handles empty parameter list', () => {
+        const opt = new Optimizers_1.SGD([], { learningRate: 0.1 });
+        expect(() => opt.step()).not.toThrow();
+    });
+    it('filters out non-trainable parameters', () => {
+        const x = new Value_1.Value(1, 'x', true);
+        const y = new Value_1.Value(2, 'y', false);
+        const opt = new Optimizers_1.SGD([x, y], { learningRate: 0.1 });
+        x.grad = 1;
+        y.grad = 1;
+        opt.step();
+        expect(x.data).toBe(0.9);
+        expect(y.data).toBe(2); // unchanged
+    });
+    it('Adam handles zero gradients correctly', () => {
+        const x = new Value_1.Value(1, 'x', true);
+        const opt = new Optimizers_1.Adam([x], { learningRate: 0.1 });
+        x.grad = 0;
+        for (let i = 0; i < 10; i++) {
+            opt.step();
+        }
+        expect(x.data).toBe(1); // unchanged
+    });
+});
package/dist/Optimizers.js
ADDED
@@ -0,0 +1,177 @@
+"use strict";
+// Optimizers.ts
+Object.defineProperty(exports, "__esModule", { value: true });
+exports.AdamW = exports.Adam = exports.SGD = exports.Optimizer = void 0;
+/**
+ * Abstract base class for all optimizers.
+ * Ensures only requiresGrad parameters are optimized.
+ */
+class Optimizer {
+    trainables;
+    learningRate;
+    /**
+     * Constructs an Optimizer.
+     * @param trainables Array of Value parameters to optimize.
+     * @param learningRate Learning rate for updates.
+     */
+    constructor(trainables, learningRate) {
+        this.trainables = trainables.filter(v => v.requiresGrad);
+        this.learningRate = learningRate;
+    }
+    /**
+     * Sets grads of all trainables to zero.
+     */
+    zeroGrad() {
+        for (const v of this.trainables)
+            v.grad = 0;
+    }
+    /**
+     * Clips global norm of gradients as regularization.
+     * @param maxNorm Maximum allowed norm for gradients.
+     */
+    clipGradients(maxNorm) {
+        const totalNorm = Math.sqrt(this.trainables.reduce((sum, v) => sum + v.grad * v.grad, 0));
+        if (totalNorm > maxNorm) {
+            const scale = maxNorm / (totalNorm + 1e-6);
+            for (const v of this.trainables)
+                v.grad *= scale;
+        }
+    }
+}
+exports.Optimizer = Optimizer;
+/**
+ * Stochastic Gradient Descent (SGD) optimizer. Accepts weightDecay and gradientClip for API consistency (ignored).
+ */
+class SGD extends Optimizer {
+    weightDecay;
+    gradientClip;
+    /**
+     * Constructs an SGD optimizer.
+     * @param trainables Array of Value parameters to optimize.
+     * @param opts Optional parameters (learningRate, weightDecay, gradientClip).
+     */
+    constructor(trainables, opts = {}) {
+        super(trainables, opts.learningRate ?? 1e-2);
+        this.weightDecay = opts.weightDecay ?? 0;
+        this.gradientClip = opts.gradientClip ?? 0;
+    }
+    /**
+     * Performs a parameter update using standard SGD.
+     */
+    step() {
+        // Intentionally ignoring weightDecay/gradientClip for SGD
+        for (const v of this.trainables) {
+            v.data -= this.learningRate * v.grad;
+        }
+    }
+}
+exports.SGD = SGD;
+/**
+ * Adam optimizer, supports decoupled weight decay and gradient clipping.
+ */
+class Adam extends Optimizer {
+    beta1;
+    beta2;
+    epsilon;
+    weightDecay;
+    gradientClip;
+    m = new Map();
+    v = new Map();
+    stepCount = 0;
+    /**
+     * Constructs an Adam optimizer.
+     * @param trainables Array of Value parameters to optimize.
+     * @param opts Optional parameters (learningRate, weightDecay, gradientClip, beta1, beta2, epsilon).
+     */
+    constructor(trainables, opts = {}) {
+        super(trainables, opts.learningRate ?? 0.001);
+        this.beta1 = opts.beta1 ?? 0.9;
+        this.beta2 = opts.beta2 ?? 0.999;
+        this.epsilon = opts.epsilon ?? 1e-8;
+        this.weightDecay = opts.weightDecay ?? 0;
+        this.gradientClip = opts.gradientClip ?? 0;
+        for (const v of this.trainables) {
+            this.m.set(v, 0);
+            this.v.set(v, 0);
+        }
+    }
+    /**
+     * Performs a parameter update using Adam optimization.
+     */
+    step() {
+        this.stepCount++;
+        for (const v of this.trainables) {
+            let grad = v.grad;
+            if (this.weightDecay > 0)
+                grad += this.weightDecay * v.data;
+            let m = this.m.get(v);
+            let vVal = this.v.get(v);
+            m = this.beta1 * m + (1 - this.beta1) * grad;
+            vVal = this.beta2 * vVal + (1 - this.beta2) * grad * grad;
+            const mHat = m / (1 - Math.pow(this.beta1, this.stepCount));
+            const vHat = vVal / (1 - Math.pow(this.beta2, this.stepCount));
+            let update = mHat / (Math.sqrt(vHat) + this.epsilon);
+            if (this.gradientClip > 0) {
+                update = Math.max(-this.gradientClip, Math.min(update, this.gradientClip));
+            }
+            v.data -= this.learningRate * update;
+            this.m.set(v, m);
+            this.v.set(v, vVal);
+        }
+    }
+}
+exports.Adam = Adam;
+/**
+ * AdamW optimizer, supports decoupled weight decay and gradient clipping (same options as Adam).
+ */
+class AdamW extends Optimizer {
+    beta1;
+    beta2;
+    epsilon;
+    weightDecay;
+    gradientClip;
+    m = new Map();
+    v = new Map();
+    stepCount = 0;
+    /**
+     * Constructs an AdamW optimizer.
+     * @param trainables Array of Value parameters to optimize.
+     * @param opts Optional parameters (learningRate, weightDecay, gradientClip, beta1, beta2, epsilon).
+     */
+    constructor(trainables, opts = {}) {
+        super(trainables, opts.learningRate ?? 0.001);
+        this.beta1 = opts.beta1 ?? 0.9;
+        this.beta2 = opts.beta2 ?? 0.999;
+        this.epsilon = opts.epsilon ?? 1e-8;
+        this.weightDecay = opts.weightDecay ?? 0.01;
+        this.gradientClip = opts.gradientClip ?? 0;
+        for (const v of this.trainables) {
+            this.m.set(v, 0);
+            this.v.set(v, 0);
+        }
+    }
+    /**
+     * Performs a parameter update using AdamW optimization (decoupled weight decay).
+     */
+    step() {
+        this.stepCount++;
+        for (const v of this.trainables) {
+            let grad = v.grad;
+            let m = this.m.get(v);
+            let vVal = this.v.get(v);
+            m = this.beta1 * m + (1 - this.beta1) * grad;
+            vVal = this.beta2 * vVal + (1 - this.beta2) * grad * grad;
+            const mHat = m / (1 - Math.pow(this.beta1, this.stepCount));
+            const vHat = vVal / (1 - Math.pow(this.beta2, this.stepCount));
+            let update = mHat / (Math.sqrt(vHat) + this.epsilon);
+            if (this.gradientClip > 0) {
+                update = Math.max(-this.gradientClip, Math.min(update, this.gradientClip));
+            }
+            // Weight decay is decoupled as in AdamW paper:
+            v.data -= this.learningRate * update + this.learningRate * this.weightDecay * v.data;
+            this.m.set(v, m);
+            this.v.set(v, vVal);
+        }
+    }
+}
+exports.AdamW = AdamW;
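A minimal training-loop sketch for the optimizers above, following the zeroGrad/backward/step pattern used in the spec file that follows. The require paths, learning rate, and iteration count are illustrative assumptions, not values mandated by the package.

    const { Value } = require("./Value");
    const { Adam } = require("./Optimizers");

    // Fit x to minimize (x - 5)^2.
    const x = new Value(0, "x", true);
    const opt = new Adam([x], { learningRate: 0.1 });

    for (let i = 0; i < 200; i++) {
        const loss = x.sub(new Value(5)).square(); // rebuild the graph each iteration
        opt.zeroGrad();   // clear accumulated gradients
        loss.backward();  // populate x.grad
        opt.step();       // Adam update of x.data
    }
    console.log(x.data); // approximately 5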
package/dist/Optimizers.spec.js
ADDED
@@ -0,0 +1,56 @@
+"use strict";
+// Optimizers.spec.ts
+Object.defineProperty(exports, "__esModule", { value: true });
+const Value_1 = require("./Value");
+const Optimizers_1 = require("./Optimizers");
+function createLoss(x) {
+    // Loss = (x - 5)^2
+    const target = new Value_1.Value(5);
+    const diff = x.sub(target);
+    return diff.mul(diff);
+}
+describe("Optimizers", () => {
+    it("SGD minimizes simple quadratic loss", () => {
+        const x = new Value_1.Value(0, "x", true);
+        const opt = new Optimizers_1.SGD([x], { learningRate: 0.1 });
+        let lossVal;
+        for (let i = 0; i < 100; i++) {
+            const loss = createLoss(x);
+            lossVal = loss.data;
+            if (lossVal < 1e-6)
+                break;
+            opt.zeroGrad();
+            loss.backward();
+            opt.step();
+        }
+        // AdamW does true weight decay, so the final x is slightly under target
+        expect(x.data).toBeLessThan(5.0);
+        expect(x.data).toBeGreaterThan(4.8);
+    });
+    it("Adam minimizes simple quadratic loss", () => {
+        const x = new Value_1.Value(0, "x", true);
+        const opt = new Optimizers_1.Adam([x], { learningRate: 0.1 });
+        let lossVal;
+        for (let i = 0; i < 100; i++) {
+            const loss = createLoss(x);
+            lossVal = loss.data;
+            if (lossVal < 1e-6)
+                break;
+            opt.zeroGrad();
+            loss.backward();
+            opt.step();
+        }
+        expect(x.data).toBeCloseTo(5.0, 1);
+    });
+    it("AdamW minimizes simple quadratic loss with weight decay", () => {
+        const x = new Value_1.Value(0, "x", true);
+        const opt = new Optimizers_1.AdamW([x], { learningRate: 0.1, beta1: 0.9, beta2: 0.999, weightDecay: 0.005 });
+        for (let i = 0; i < 100; i++) {
+            const loss = createLoss(x);
+            opt.zeroGrad();
+            loss.backward();
+            opt.step();
+        }
+        expect(x.data).toBeCloseTo(5.0, 1);
+    });
+});
package/dist/V.js
ADDED
Binary file
package/dist/Value.edge-cases.spec.js
ADDED
@@ -0,0 +1,54 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+const Value_1 = require("./Value");
+// Edge cases and error handling
+describe('Value edge cases and error handling', () => {
+    it('throws on invalid numeric inputs', () => {
+        expect(() => new Value_1.Value(NaN)).toThrow();
+        expect(() => new Value_1.Value(Infinity)).toThrow();
+        expect(() => new Value_1.Value(-Infinity)).toThrow();
+    });
+    it('handles gradient accumulation correctly', () => {
+        const x = new Value_1.Value(2, 'x', true);
+        const y = x.mul(3);
+        const z = x.mul(4);
+        const out = y.add(z);
+        out.backward();
+        expect(x.grad).toBe(7); // 3 + 4
+    });
+    it('handles repeated use of same value in expression', () => {
+        const x = new Value_1.Value(3, 'x', true);
+        const y = x.mul(x).mul(x); // x^3
+        y.backward();
+        expect(x.grad).toBeCloseTo(27); // 3*x^2 = 27
+    });
+    it('throws on division by zero', () => {
+        const a = new Value_1.Value(1);
+        const b = new Value_1.Value(0);
+        expect(() => a.div(b)).toThrow();
+    });
+    it('throws on log of negative number', () => {
+        const x = new Value_1.Value(-1);
+        expect(() => x.log()).toThrow();
+    });
+    it('throws on negative base with fractional exponent', () => {
+        const x = new Value_1.Value(-2);
+        expect(() => x.pow(0.5)).toThrow();
+    });
+});
+// Complex expressions
+describe('Complex mathematical expressions', () => {
+    it('computes gradient of complex expression', () => {
+        const x = new Value_1.Value(0.5, 'x', true);
+        const y = x.sin().mul(x.cos()).add(x.exp());
+        y.backward();
+        const expected = Math.cos(0.5) ** 2 - Math.sin(0.5) ** 2 + Math.exp(0.5);
+        expect(x.grad).toBeCloseTo(expected, 4);
+    });
+    it('handles nested activation functions', () => {
+        const x = new Value_1.Value(0.5, 'x', true);
+        const y = x.tanh().sigmoid().relu();
+        y.backward();
+        expect(x.grad).toBeGreaterThan(0);
+    });
+});
package/dist/Value.grad-flow.spec.js
ADDED
@@ -0,0 +1,24 @@
+"use strict";
+Object.defineProperty(exports, "__esModule", { value: true });
+const Value_1 = require("./Value");
+describe('Gradient flow control', () => {
+    it('stops gradient at non-requiresGrad nodes', () => {
+        const x = new Value_1.Value(2, 'x', true);
+        const y = new Value_1.Value(3, 'y', false);
+        const z = new Value_1.Value(4, 'z', true);
+        const out = x.mul(y).add(z);
+        out.backward();
+        expect(x.grad).toBe(3);
+        expect(y.grad).toBe(0);
+        expect(z.grad).toBe(1);
+    });
+    it('handles detached computation graphs', () => {
+        const x = new Value_1.Value(2, 'x', true);
+        const y = x.mul(3);
+        const z = new Value_1.Value(y.data, 'z', true); // detached
+        const out = z.mul(4);
+        out.backward();
+        expect(z.grad).toBe(4);
+        expect(x.grad).toBe(0); // no gradient flows to x
+    });
+});