scalar-autograd 0.1.4 → 0.1.6

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Files changed (49)
  1. package/dist/Losses.d.ts +51 -0
  2. package/dist/Losses.js +145 -0
  3. package/dist/Losses.spec.d.ts +1 -0
  4. package/dist/Losses.spec.js +54 -0
  5. package/dist/Optimizers.d.ts +114 -0
  6. package/dist/Optimizers.edge-cases.spec.d.ts +1 -0
  7. package/dist/Optimizers.edge-cases.spec.js +29 -0
  8. package/dist/Optimizers.js +177 -0
  9. package/dist/Optimizers.spec.d.ts +1 -0
  10. package/dist/Optimizers.spec.js +56 -0
  11. package/dist/V.d.ts +0 -0
  12. package/dist/V.js +0 -0
  13. package/dist/Value.d.ts +260 -0
  14. package/dist/Value.edge-cases.spec.d.ts +1 -0
  15. package/dist/Value.edge-cases.spec.js +54 -0
  16. package/dist/Value.grad-flow.spec.d.ts +1 -0
  17. package/dist/Value.grad-flow.spec.js +24 -0
  18. package/dist/Value.js +424 -0
  19. package/dist/Value.losses-edge-cases.spec.d.ts +1 -0
  20. package/dist/Value.losses-edge-cases.spec.js +30 -0
  21. package/dist/Value.memory.spec.d.ts +1 -0
  22. package/dist/Value.memory.spec.js +23 -0
  23. package/dist/Value.nn.spec.d.ts +1 -0
  24. package/dist/Value.nn.spec.js +111 -0
  25. package/dist/Value.spec.d.ts +1 -0
  26. package/dist/Value.spec.js +245 -0
  27. package/dist/ValueActivation.d.ts +7 -0
  28. package/dist/ValueActivation.js +34 -0
  29. package/dist/ValueArithmetic.d.ts +26 -0
  30. package/dist/ValueArithmetic.js +180 -0
  31. package/dist/ValueComparison.d.ts +10 -0
  32. package/dist/ValueComparison.js +47 -0
  33. package/dist/ValueTrig.d.ts +9 -0
  34. package/dist/ValueTrig.js +49 -0
  35. package/package.json +4 -12
  36. package/Losses.ts +0 -145
  37. package/Optimizers.ts +0 -222
  38. package/V.ts +0 -0
  39. package/Value.edge-cases.spec.ts +0 -60
  40. package/Value.grad-flow.spec.ts +0 -24
  41. package/Value.losses-edge-cases.spec.ts +0 -32
  42. package/Value.memory.spec.ts +0 -25
  43. package/Value.nn.spec.ts +0 -109
  44. package/Value.spec.ts +0 -268
  45. package/Value.ts +0 -461
  46. package/ValueActivation.ts +0 -51
  47. package/ValueArithmetic.ts +0 -272
  48. package/ValueComparison.ts +0 -85
  49. package/ValueTrig.ts +0 -70
package/dist/Losses.d.ts ADDED
@@ -0,0 +1,51 @@
+ import { Value } from "./Value";
+ export declare class Losses {
+ /**
+ * Computes mean squared error (MSE) loss between outputs and targets.
+ * @param outputs Array of Value predictions.
+ * @param targets Array of Value targets.
+ * @returns Mean squared error as a Value.
+ */
+ static mse(outputs: Value[], targets: Value[]): Value;
+ /**
+ * Computes mean absolute error (MAE) loss between outputs and targets.
+ * @param outputs Array of Value predictions.
+ * @param targets Array of Value targets.
+ * @returns Mean absolute error as a Value.
+ */
+ static mae(outputs: Value[], targets: Value[]): Value;
+ static EPS: number;
+ /**
+ * Computes binary cross-entropy loss between predicted outputs and targets (after sigmoid).
+ * @param outputs Array of Value predictions (expected in (0,1)).
+ * @param targets Array of Value targets (typically 0 or 1).
+ * @returns Binary cross-entropy loss as a Value.
+ */
+ static binaryCrossEntropy(outputs: Value[], targets: Value[]): Value;
+ /**
+ * Computes categorical cross-entropy loss between outputs (logits) and integer target classes.
+ * @param outputs Array of Value logits for each class.
+ * @param targets Array of integer class indices (0-based, one per sample).
+ * @returns Categorical cross-entropy loss as a Value.
+ */
+ static categoricalCrossEntropy(outputs: Value[], targets: number[]): Value;
+ /**
+ * Computes Huber loss between outputs and targets.
+ * Combines quadratic loss for small residuals and linear loss for large residuals.
+ * @param outputs Array of Value predictions.
+ * @param targets Array of Value targets.
+ * @param delta Threshold at which to switch from quadratic to linear (default: 1.0).
+ * @returns Huber loss as a Value.
+ */
+ static huber(outputs: Value[], targets: Value[], delta?: number): Value;
+ /**
+ * Computes Tukey loss between outputs and targets.
+ * This robust loss function saturates for large residuals.
+ *
+ * @param outputs Array of Value predictions.
+ * @param targets Array of Value targets.
+ * @param c Threshold constant (typically 4.685).
+ * @returns Tukey loss as a Value.
+ */
+ static tukey(outputs: Value[], targets: Value[], c?: number): Value;
+ }
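The declarations above make up the loss API: every method is static, takes arrays of Value, and returns a single Value so the result can be back-propagated. A minimal usage sketch, assuming the package entry point re-exports Value and Losses (the import path and variable names below are illustrative, not part of this diff):

// Illustrative only; the entry-point import path is an assumption.
import { Value, Losses } from "scalar-autograd";

const preds = [new Value(2, "p0", true), new Value(3, "p1", true)]; // trainable predictions
const targets = [new Value(5), new Value(1)];                       // fixed targets

const loss = Losses.mse(preds, targets); // (1/2)*((2-5)^2 + (3-1)^2) = 6.5
loss.backward();                         // fills preds[i].grad
console.log(loss.data, preds[0].grad, preds[1].grad);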
package/dist/Losses.js ADDED
@@ -0,0 +1,145 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.Losses = void 0;
+ const Value_1 = require("./Value");
+ const V_1 = require("./V");
+ /**
+ * Throws an error if outputs and targets length do not match.
+ * @param outputs Array of output Values.
+ * @param targets Array of target Values.
+ */
+ function checkLengthMatch(outputs, targets) {
+ if (outputs.length !== targets.length) {
+ throw new Error('Outputs and targets must have the same length');
+ }
+ }
+ class Losses {
+ /**
+ * Computes mean squared error (MSE) loss between outputs and targets.
+ * @param outputs Array of Value predictions.
+ * @param targets Array of Value targets.
+ * @returns Mean squared error as a Value.
+ */
+ static mse(outputs, targets) {
+ checkLengthMatch(outputs, targets);
+ if (!Array.isArray(outputs) || !Array.isArray(targets))
+ throw new TypeError('mse expects Value[] for both arguments.');
+ if (!outputs.length)
+ return new Value_1.Value(0);
+ const diffs = outputs.map((out, i) => out.sub(targets[i]).square());
+ return Value_1.Value.mean(diffs);
+ }
+ /**
+ * Computes mean absolute error (MAE) loss between outputs and targets.
+ * @param outputs Array of Value predictions.
+ * @param targets Array of Value targets.
+ * @returns Mean absolute error as a Value.
+ */
+ static mae(outputs, targets) {
+ checkLengthMatch(outputs, targets);
+ if (!Array.isArray(outputs) || !Array.isArray(targets))
+ throw new TypeError('mae expects Value[] for both arguments.');
+ if (!outputs.length)
+ return new Value_1.Value(0);
+ const diffs = outputs.map((out, i) => out.sub(targets[i]).abs());
+ return Value_1.Value.mean(diffs);
+ }
+ static EPS = 1e-12;
+ /**
+ * Computes binary cross-entropy loss between predicted outputs and targets (after sigmoid).
+ * @param outputs Array of Value predictions (expected in (0,1)).
+ * @param targets Array of Value targets (typically 0 or 1).
+ * @returns Binary cross-entropy loss as a Value.
+ */
+ static binaryCrossEntropy(outputs, targets) {
+ checkLengthMatch(outputs, targets);
+ if (!Array.isArray(outputs) || !Array.isArray(targets))
+ throw new TypeError('binaryCrossEntropy expects Value[] for both arguments.');
+ if (!outputs.length)
+ return new Value_1.Value(0);
+ const eps = Losses.EPS;
+ const one = new Value_1.Value(1);
+ const losses = outputs.map((out, i) => {
+ const t = targets[i];
+ const outClamped = out.clamp(eps, 1 - eps); // sigmoid should output (0,1)
+ return t.mul(outClamped.log()).add(one.sub(t).mul(one.sub(outClamped).log()));
+ });
+ return Value_1.Value.mean(losses).mul(-1);
+ }
+ /**
+ * Computes categorical cross-entropy loss between outputs (logits) and integer target classes.
+ * @param outputs Array of Value logits for each class.
+ * @param targets Array of integer class indices (0-based, one per sample).
+ * @returns Categorical cross-entropy loss as a Value.
+ */
+ static categoricalCrossEntropy(outputs, targets) {
+ // targets: integer encoded class indices
+ if (!Array.isArray(outputs) || !Array.isArray(targets))
+ throw new TypeError('categoricalCrossEntropy expects Value[] and number[].');
+ if (!outputs.length || !targets.length)
+ return new Value_1.Value(0);
+ if (targets.some(t => typeof t !== 'number' || !isFinite(t) || t < 0 || t >= outputs.length || Math.floor(t) !== t)) {
+ throw new Error('Target indices must be valid integers in [0, outputs.length)');
+ }
+ const eps = Losses.EPS;
+ const maxLogit = outputs.reduce((a, b) => a.data > b.data ? a : b);
+ const exps = outputs.map(out => out.sub(maxLogit).exp());
+ const sumExp = Value_1.Value.sum(exps).add(eps);
+ const softmax = exps.map(e => e.div(sumExp));
+ const tIndices = targets.map((t, i) => softmax[t]);
+ return Value_1.Value.mean(tIndices.map(sm => sm.add(eps).log().mul(-1)));
+ }
+ /**
+ * Computes Huber loss between outputs and targets.
+ * Combines quadratic loss for small residuals and linear loss for large residuals.
+ * @param outputs Array of Value predictions.
+ * @param targets Array of Value targets.
+ * @param delta Threshold at which to switch from quadratic to linear (default: 1.0).
+ * @returns Huber loss as a Value.
+ */
+ static huber(outputs, targets, delta = 1.0) {
+ checkLengthMatch(outputs, targets);
+ if (!Array.isArray(outputs) || !Array.isArray(targets))
+ throw new TypeError('huber expects Value[] for both arguments.');
+ if (!outputs.length)
+ return new Value_1.Value(0);
+ const deltaValue = new Value_1.Value(delta);
+ const half = new Value_1.Value(0.5);
+ const losses = outputs.map((out, i) => {
+ const residual = V_1.V.abs(V_1.V.sub(out, targets[i]));
+ const condition = V_1.V.lt(residual, deltaValue);
+ const quadraticLoss = V_1.V.mul(half, V_1.V.square(residual));
+ const linearLoss = V_1.V.mul(deltaValue, V_1.V.sub(residual, V_1.V.mul(half, deltaValue)));
+ return V_1.V.ifThenElse(condition, quadraticLoss, linearLoss);
+ });
+ return V_1.V.mean(losses);
+ }
+ /**
+ * Computes Tukey loss between outputs and targets.
+ * This robust loss function saturates for large residuals.
+ *
+ * @param outputs Array of Value predictions.
+ * @param targets Array of Value targets.
+ * @param c Threshold constant (typically 4.685).
+ * @returns Tukey loss as a Value.
+ */
+ static tukey(outputs, targets, c = 4.685) {
+ checkLengthMatch(outputs, targets);
+ const c2_over_6 = (c * c) / 6;
+ const cValue = V_1.V.C(c);
+ const c2_over_6_Value = V_1.V.C(c2_over_6);
+ const losses = outputs.map((out, i) => {
+ const diff = V_1.V.abs(V_1.V.sub(out, targets[i]));
+ const inlier = V_1.V.lte(diff, cValue);
+ const rc = V_1.V.div(diff, cValue);
+ const rc2 = V_1.V.square(rc);
+ const oneMinusRC2 = V_1.V.sub(1, rc2);
+ const inner = V_1.V.pow(oneMinusRC2, 3);
+ const inlierLoss = V_1.V.mul(c2_over_6_Value, V_1.V.sub(1, inner));
+ const loss = V_1.V.ifThenElse(inlier, inlierLoss, c2_over_6_Value);
+ return loss;
+ });
+ return V_1.V.mean(losses);
+ }
+ }
+ exports.Losses = Losses;
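For reference, the scalar formulas that the V-based graph code above builds: Huber is quadratic for residuals below delta and linear beyond it, while Tukey rises smoothly up to |r| = c and then saturates at c²/6. A plain-number sketch of the same piecewise definitions (not part of the package, shown only to make the branches explicit):

// Reference-only scalar versions of the losses implemented above.
function huberScalar(residual: number, delta = 1.0): number {
  const r = Math.abs(residual);
  return r < delta ? 0.5 * r * r : delta * (r - 0.5 * delta);
}

function tukeyScalar(residual: number, c = 4.685): number {
  const r = Math.abs(residual);
  const c2over6 = (c * c) / 6;
  if (r > c) return c2over6;          // outliers: constant loss, zero gradient
  const t = 1 - (r / c) ** 2;
  return c2over6 * (1 - t * t * t);   // inliers: smooth, saturating bowl
}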
package/dist/Losses.spec.d.ts ADDED
@@ -0,0 +1 @@
+ export {};
package/dist/Losses.spec.js ADDED
@@ -0,0 +1,54 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const Value_1 = require("./Value");
+ const Losses_1 = require("./Losses");
+ describe("Losses", () => {
+ it("mse computes value and gradients correctly", () => {
+ const x = new Value_1.Value(2, "x", true);
+ const y = new Value_1.Value(3, "y", true);
+ const tx = new Value_1.Value(5, "tx");
+ const ty = new Value_1.Value(1, "ty");
+ const loss = Losses_1.Losses.mse([x, y], [tx, ty]); // (1/2)*((2-5)^2 + (3-1)^2) = (1/2)*(9+4) = 6.5
+ expect(loss.data).toBeCloseTo(6.5);
+ loss.backward();
+ expect(x.grad).toBeCloseTo(-3);
+ expect(y.grad).toBeCloseTo(2);
+ });
+ it("mae computes value and gradients correctly", () => {
+ const x = new Value_1.Value(2, "x", true);
+ const y = new Value_1.Value(-3, "y", true);
+ const tx = new Value_1.Value(5, "tx");
+ const ty = new Value_1.Value(2, "ty");
+ const loss = Losses_1.Losses.mae([x, y], [tx, ty]); // (1/2)*(abs(2-5)+abs(-3-2)) = (1/2)*(3+5)=4
+ expect(loss.data).toBeCloseTo(4);
+ loss.backward();
+ expect(x.grad).toBeCloseTo(-0.5);
+ expect(y.grad).toBeCloseTo(-0.5);
+ });
+ it("binaryCrossEntropy computes value and gradients correctly for easy case", () => {
+ const out = new Value_1.Value(0.9, "out", true);
+ const target = new Value_1.Value(1, "target");
+ const loss = Losses_1.Losses.binaryCrossEntropy([out], [target]);
+ expect(loss.data).toBeCloseTo(-Math.log(0.9));
+ loss.backward();
+ expect(out.grad).toBeCloseTo(-1 / 0.9, 4);
+ });
+ it("categoricalCrossEntropy computes value and gradients (softmax+NLL)", () => {
+ // logits: [2, 1, 0], true = 0
+ const a = new Value_1.Value(2, "a", true);
+ const b = new Value_1.Value(1, "b", true);
+ const c = new Value_1.Value(0, "c", true);
+ const targets = [0];
+ const loss = Losses_1.Losses.categoricalCrossEntropy([a, b, c], targets);
+ const softmax = [
+ Math.exp(2) / (Math.exp(2) + Math.exp(1) + Math.exp(0)),
+ Math.exp(1) / (Math.exp(2) + Math.exp(1) + Math.exp(0)),
+ Math.exp(0) / (Math.exp(2) + Math.exp(1) + Math.exp(0))
+ ];
+ expect(loss.data).toBeCloseTo(-Math.log(softmax[0]), 4);
+ loss.backward();
+ expect(a.grad).toBeCloseTo(softmax[0] - 1, 4);
+ expect(b.grad).toBeCloseTo(softmax[1], 4);
+ expect(c.grad).toBeCloseTo(softmax[2], 4);
+ });
+ });
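The categoricalCrossEntropy test leans on the standard softmax/negative-log-likelihood gradient identity: for logits z and true class t, dLoss/dz_i = softmax(z)_i - (i === t ? 1 : 0), which is exactly what the three grad assertions check. A plain-number replay of the expected values for logits [2, 1, 0] and true class 0:

// Expected loss and gradients for the test above, computed without autograd.
const logits = [2, 1, 0];
const trueClass = 0;
const expSum = logits.reduce((s, z) => s + Math.exp(z), 0);
const softmax = logits.map(z => Math.exp(z) / expSum);
const loss = -Math.log(softmax[trueClass]);                         // ≈ 0.408
const grads = softmax.map((p, i) => p - (i === trueClass ? 1 : 0)); // ≈ [-0.335, 0.245, 0.090]
console.log(loss, grads);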
package/dist/Optimizers.d.ts ADDED
@@ -0,0 +1,114 @@
+ import { Value } from "./Value";
+ /**
+ * Abstract base class for all optimizers.
+ * Ensures only requiresGrad parameters are optimized.
+ */
+ export declare abstract class Optimizer {
+ protected trainables: Value[];
+ learningRate: number;
+ /**
+ * Constructs an Optimizer.
+ * @param trainables Array of Value parameters to optimize.
+ * @param learningRate Learning rate for updates.
+ */
+ constructor(trainables: Value[], learningRate: number);
+ /**
+ * Performs a parameter update step.
+ */
+ abstract step(): void;
+ /**
+ * Sets grads of all trainables to zero.
+ */
+ zeroGrad(): void;
+ /**
+ * Clips global norm of gradients as regularization.
+ * @param maxNorm Maximum allowed norm for gradients.
+ */
+ clipGradients(maxNorm: number): void;
+ }
+ /**
+ * Optional arguments for basic optimizers.
+ * @property learningRate: Overrides the step size for parameter updates (default varies by optimizer).
+ * @property weightDecay: L2 regularization multiplier (default 0). Ignored for plain SGD.
+ * @property gradientClip: Maximum absolute value for gradient updates (default 0: no clipping).
+ */
+ export interface OptimizerOptions {
+ learningRate?: number;
+ weightDecay?: number;
+ gradientClip?: number;
+ }
+ /**
+ * Stochastic Gradient Descent (SGD) optimizer. Accepts weightDecay and gradientClip for API consistency (ignored).
+ */
+ export declare class SGD extends Optimizer {
+ private weightDecay;
+ private gradientClip;
+ /**
+ * Constructs an SGD optimizer.
+ * @param trainables Array of Value parameters to optimize.
+ * @param opts Optional parameters (learningRate, weightDecay, gradientClip).
+ */
+ constructor(trainables: Value[], opts?: OptimizerOptions);
+ /**
+ * Performs a parameter update using standard SGD.
+ */
+ step(): void;
+ }
+ /**
+ * Adam and AdamW optimizer parameters.
+ * Extends OptimizerOptions.
+ * @property beta1: Exponential decay rate for 1st moment (default 0.9).
+ * @property beta2: Exponential decay rate for 2nd moment (default 0.999).
+ * @property epsilon: Numerical stability fudge factor (default 1e-8).
+ */
+ export interface AdamOptions extends OptimizerOptions {
+ beta1?: number;
+ beta2?: number;
+ epsilon?: number;
+ }
+ /**
+ * Adam optimizer, supports decoupled weight decay and gradient clipping.
+ */
+ export declare class Adam extends Optimizer {
+ private beta1;
+ private beta2;
+ private epsilon;
+ private weightDecay;
+ private gradientClip;
+ private m;
+ private v;
+ private stepCount;
+ /**
+ * Constructs an Adam optimizer.
+ * @param trainables Array of Value parameters to optimize.
+ * @param opts Optional parameters (learningRate, weightDecay, gradientClip, beta1, beta2, epsilon).
+ */
+ constructor(trainables: Value[], opts?: AdamOptions);
+ /**
+ * Performs a parameter update using Adam optimization.
+ */
+ step(): void;
+ }
+ /**
+ * AdamW optimizer, supports decoupled weight decay and gradient clipping (same options as Adam).
+ */
+ export declare class AdamW extends Optimizer {
+ private beta1;
+ private beta2;
+ private epsilon;
+ private weightDecay;
+ private gradientClip;
+ private m;
+ private v;
+ private stepCount;
+ /**
+ * Constructs an AdamW optimizer.
+ * @param trainables Array of Value parameters to optimize.
+ * @param opts Optional parameters (learningRate, weightDecay, gradientClip, beta1, beta2, epsilon).
+ */
+ constructor(trainables: Value[], opts?: AdamOptions);
+ /**
+ * Performs a parameter update using AdamW optimization (decoupled weight decay).
+ */
+ step(): void;
+ }
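All three optimizers share the lifecycle declared above: construct with the trainable Values, then repeat zeroGrad, backward on the loss, and step. A minimal training-loop sketch, assuming the package entry point re-exports Value and Adam (the import path and the toy loss are illustrative):

// Illustrative only; the entry-point import path is an assumption.
import { Value, Adam } from "scalar-autograd";

const w = new Value(0, "w", true);                // trainable parameter
const opt = new Adam([w], { learningRate: 0.1 });

for (let i = 0; i < 200; i++) {
  const diff = w.sub(new Value(5));
  const loss = diff.mul(diff);                    // (w - 5)^2
  opt.zeroGrad();                                 // clear accumulated grads
  loss.backward();                                // dLoss/dw -> w.grad
  opt.step();                                     // Adam update of w.data
}
console.log(w.data); // ≈ 5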
package/dist/Optimizers.edge-cases.spec.d.ts ADDED
@@ -0,0 +1 @@
+ export {};
package/dist/Optimizers.edge-cases.spec.js ADDED
@@ -0,0 +1,29 @@
+ "use strict";
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const Value_1 = require("./Value");
+ const Optimizers_1 = require("./Optimizers");
+ describe('Optimizer edge cases', () => {
+ it('handles empty parameter list', () => {
+ const opt = new Optimizers_1.SGD([], { learningRate: 0.1 });
+ expect(() => opt.step()).not.toThrow();
+ });
+ it('filters out non-trainable parameters', () => {
+ const x = new Value_1.Value(1, 'x', true);
+ const y = new Value_1.Value(2, 'y', false);
+ const opt = new Optimizers_1.SGD([x, y], { learningRate: 0.1 });
+ x.grad = 1;
+ y.grad = 1;
+ opt.step();
+ expect(x.data).toBe(0.9);
+ expect(y.data).toBe(2); // unchanged
+ });
+ it('Adam handles zero gradients correctly', () => {
+ const x = new Value_1.Value(1, 'x', true);
+ const opt = new Optimizers_1.Adam([x], { learningRate: 0.1 });
+ x.grad = 0;
+ for (let i = 0; i < 10; i++) {
+ opt.step();
+ }
+ expect(x.data).toBe(1); // unchanged
+ });
+ });
package/dist/Optimizers.js ADDED
@@ -0,0 +1,177 @@
+ "use strict";
+ // Optimizers.ts
+ Object.defineProperty(exports, "__esModule", { value: true });
+ exports.AdamW = exports.Adam = exports.SGD = exports.Optimizer = void 0;
+ /**
+ * Abstract base class for all optimizers.
+ * Ensures only requiresGrad parameters are optimized.
+ */
+ class Optimizer {
+ trainables;
+ learningRate;
+ /**
+ * Constructs an Optimizer.
+ * @param trainables Array of Value parameters to optimize.
+ * @param learningRate Learning rate for updates.
+ */
+ constructor(trainables, learningRate) {
+ this.trainables = trainables.filter(v => v.requiresGrad);
+ this.learningRate = learningRate;
+ }
+ /**
+ * Sets grads of all trainables to zero.
+ */
+ zeroGrad() {
+ for (const v of this.trainables)
+ v.grad = 0;
+ }
+ /**
+ * Clips global norm of gradients as regularization.
+ * @param maxNorm Maximum allowed norm for gradients.
+ */
+ clipGradients(maxNorm) {
+ const totalNorm = Math.sqrt(this.trainables.reduce((sum, v) => sum + v.grad * v.grad, 0));
+ if (totalNorm > maxNorm) {
+ const scale = maxNorm / (totalNorm + 1e-6);
+ for (const v of this.trainables)
+ v.grad *= scale;
+ }
+ }
+ }
+ exports.Optimizer = Optimizer;
+ /**
+ * Stochastic Gradient Descent (SGD) optimizer. Accepts weightDecay and gradientClip for API consistency (ignored).
+ */
+ class SGD extends Optimizer {
+ weightDecay;
+ gradientClip;
+ /**
+ * Constructs an SGD optimizer.
+ * @param trainables Array of Value parameters to optimize.
+ * @param opts Optional parameters (learningRate, weightDecay, gradientClip).
+ */
+ constructor(trainables, opts = {}) {
+ super(trainables, opts.learningRate ?? 1e-2);
+ this.weightDecay = opts.weightDecay ?? 0;
+ this.gradientClip = opts.gradientClip ?? 0;
+ }
+ /**
+ * Performs a parameter update using standard SGD.
+ */
+ step() {
+ // Intentionally ignoring weightDecay/gradientClip for SGD
+ for (const v of this.trainables) {
+ v.data -= this.learningRate * v.grad;
+ }
+ }
+ }
+ exports.SGD = SGD;
+ /**
+ * Adam optimizer, supports decoupled weight decay and gradient clipping.
+ */
+ class Adam extends Optimizer {
+ beta1;
+ beta2;
+ epsilon;
+ weightDecay;
+ gradientClip;
+ m = new Map();
+ v = new Map();
+ stepCount = 0;
+ /**
+ * Constructs an Adam optimizer.
+ * @param trainables Array of Value parameters to optimize.
+ * @param opts Optional parameters (learningRate, weightDecay, gradientClip, beta1, beta2, epsilon).
+ */
+ constructor(trainables, opts = {}) {
+ super(trainables, opts.learningRate ?? 0.001);
+ this.beta1 = opts.beta1 ?? 0.9;
+ this.beta2 = opts.beta2 ?? 0.999;
+ this.epsilon = opts.epsilon ?? 1e-8;
+ this.weightDecay = opts.weightDecay ?? 0;
+ this.gradientClip = opts.gradientClip ?? 0;
+ for (const v of this.trainables) {
+ this.m.set(v, 0);
+ this.v.set(v, 0);
+ }
+ }
+ /**
+ * Performs a parameter update using Adam optimization.
+ */
+ step() {
+ this.stepCount++;
+ for (const v of this.trainables) {
+ let grad = v.grad;
+ if (this.weightDecay > 0)
+ grad += this.weightDecay * v.data;
+ let m = this.m.get(v);
+ let vVal = this.v.get(v);
+ m = this.beta1 * m + (1 - this.beta1) * grad;
+ vVal = this.beta2 * vVal + (1 - this.beta2) * grad * grad;
+ const mHat = m / (1 - Math.pow(this.beta1, this.stepCount));
+ const vHat = vVal / (1 - Math.pow(this.beta2, this.stepCount));
+ let update = mHat / (Math.sqrt(vHat) + this.epsilon);
+ if (this.gradientClip > 0) {
+ update = Math.max(-this.gradientClip, Math.min(update, this.gradientClip));
+ }
+ v.data -= this.learningRate * update;
+ this.m.set(v, m);
+ this.v.set(v, vVal);
+ }
+ }
+ }
+ exports.Adam = Adam;
+ /**
+ * AdamW optimizer, supports decoupled weight decay and gradient clipping (same options as Adam).
+ */
+ class AdamW extends Optimizer {
+ beta1;
+ beta2;
+ epsilon;
+ weightDecay;
+ gradientClip;
+ m = new Map();
+ v = new Map();
+ stepCount = 0;
+ /**
+ * Constructs an AdamW optimizer.
+ * @param trainables Array of Value parameters to optimize.
+ * @param opts Optional parameters (learningRate, weightDecay, gradientClip, beta1, beta2, epsilon).
+ */
+ constructor(trainables, opts = {}) {
+ super(trainables, opts.learningRate ?? 0.001);
+ this.beta1 = opts.beta1 ?? 0.9;
+ this.beta2 = opts.beta2 ?? 0.999;
+ this.epsilon = opts.epsilon ?? 1e-8;
+ this.weightDecay = opts.weightDecay ?? 0.01;
+ this.gradientClip = opts.gradientClip ?? 0;
+ for (const v of this.trainables) {
+ this.m.set(v, 0);
+ this.v.set(v, 0);
+ }
+ }
+ /**
+ * Performs a parameter update using AdamW optimization (decoupled weight decay).
+ */
+ step() {
+ this.stepCount++;
+ for (const v of this.trainables) {
+ let grad = v.grad;
+ let m = this.m.get(v);
+ let vVal = this.v.get(v);
+ m = this.beta1 * m + (1 - this.beta1) * grad;
+ vVal = this.beta2 * vVal + (1 - this.beta2) * grad * grad;
+ const mHat = m / (1 - Math.pow(this.beta1, this.stepCount));
+ const vHat = vVal / (1 - Math.pow(this.beta2, this.stepCount));
+ let update = mHat / (Math.sqrt(vHat) + this.epsilon);
+ if (this.gradientClip > 0) {
+ update = Math.max(-this.gradientClip, Math.min(update, this.gradientClip));
+ }
+ // Weight decay is decoupled as in AdamW paper:
+ v.data -= this.learningRate * update + this.learningRate * this.weightDecay * v.data;
+ this.m.set(v, m);
+ this.v.set(v, vVal);
+ }
+ }
+ }
+ exports.AdamW = AdamW;
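The two adaptive step() implementations differ only in where weight decay enters. Adam adds it to the raw gradient before the moment updates (classic L2, so the decay is rescaled by the adaptive denominator), while AdamW applies it directly to the parameter after the adaptive update, as the in-code comment notes. A schematic single-parameter contrast (plain numbers; the bias correction and clipping present in the code above are omitted for brevity):

// Schematic contrast of the two weight-decay placements; not part of the package.
type Moments = { m: number; v: number };

function adamDirection(g: number, s: Moments, b1 = 0.9, b2 = 0.999, eps = 1e-8): number {
  s.m = b1 * s.m + (1 - b1) * g;
  s.v = b2 * s.v + (1 - b2) * g * g;
  return s.m / (Math.sqrt(s.v) + eps);
}

// Adam with L2: decay rides along with the gradient through m and v.
function adamL2Step(data: number, grad: number, s: Moments, lr: number, wd: number): number {
  return data - lr * adamDirection(grad + wd * data, s);
}

// AdamW: decay bypasses m and v and shrinks the parameter directly.
function adamWStep(data: number, grad: number, s: Moments, lr: number, wd: number): number {
  return data - lr * adamDirection(grad, s) - lr * wd * data;
}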
package/dist/Optimizers.spec.d.ts ADDED
@@ -0,0 +1 @@
+ export {};
package/dist/Optimizers.spec.js ADDED
@@ -0,0 +1,56 @@
+ "use strict";
+ // Optimizers.spec.ts
+ Object.defineProperty(exports, "__esModule", { value: true });
+ const Value_1 = require("./Value");
+ const Optimizers_1 = require("./Optimizers");
+ function createLoss(x) {
+ // Loss = (x - 5)^2
+ const target = new Value_1.Value(5);
+ const diff = x.sub(target);
+ return diff.mul(diff);
+ }
+ describe("Optimizers", () => {
+ it("SGD minimizes simple quadratic loss", () => {
+ const x = new Value_1.Value(0, "x", true);
+ const opt = new Optimizers_1.SGD([x], { learningRate: 0.1 });
+ let lossVal;
+ for (let i = 0; i < 100; i++) {
+ const loss = createLoss(x);
+ lossVal = loss.data;
+ if (lossVal < 1e-6)
+ break;
+ opt.zeroGrad();
+ loss.backward();
+ opt.step();
+ }
+ // AdamW does true weight decay, so the final x is slightly under target
+ expect(x.data).toBeLessThan(5.0);
+ expect(x.data).toBeGreaterThan(4.8);
+ });
+ it("Adam minimizes simple quadratic loss", () => {
+ const x = new Value_1.Value(0, "x", true);
+ const opt = new Optimizers_1.Adam([x], { learningRate: 0.1 });
+ let lossVal;
+ for (let i = 0; i < 100; i++) {
+ const loss = createLoss(x);
+ lossVal = loss.data;
+ if (lossVal < 1e-6)
+ break;
+ opt.zeroGrad();
+ loss.backward();
+ opt.step();
+ }
+ expect(x.data).toBeCloseTo(5.0, 1);
+ });
+ it("AdamW minimizes simple quadratic loss with weight decay", () => {
+ const x = new Value_1.Value(0, "x", true);
+ const opt = new Optimizers_1.AdamW([x], { learningRate: 0.1, beta1: 0.9, beta2: 0.999, weightDecay: 0.005 });
+ for (let i = 0; i < 100; i++) {
+ const loss = createLoss(x);
+ opt.zeroGrad();
+ loss.backward();
+ opt.step();
+ }
+ expect(x.data).toBeCloseTo(5.0, 1);
+ });
+ });
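The bounds in the SGD test follow directly from the update rule: with learning rate 0.1 on (x - 5)^2, each step is x_{k+1} = x_k - 0.1 * 2 * (x_k - 5) = 0.8 * x_k + 1, so x approaches 5 from below and the 1e-6 loss cutoff stops it just under the target, which is what the < 5.0 and > 4.8 assertions capture. A plain-number replay of that recurrence:

// Replays the SGD test's trajectory without the autograd machinery.
let x = 0;
let steps = 0;
while ((x - 5) ** 2 >= 1e-6) {
  x = 0.8 * x + 1;
  steps++;
}
console.log(steps, x); // ~39 steps, x just below 5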
package/dist/V.d.ts ADDED
Binary file
package/dist/V.js ADDED
Binary file