@dniskav/neuron 0.2.0 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -19,7 +19,7 @@ A minimal, dependency-free neural network library built from scratch in TypeScri
19
19
  | `MultiHeadAttention` | N parallel attention heads concatenated and projected to `d_model`. |
20
20
  | `AttentionHead` | Single scaled dot-product self-attention head (Q / K / V projections + backprop). |
21
21
  | `LayerNorm` | Layer normalization with learnable γ / β per feature. |
22
- | `WeightMatrix` | 2D weight matrix with per-scalar Adam optimizers. |
22
+ | `WeightMatrix` | 2D weight matrix with per-scalar Adam optimizers. Optional per-element gradient clipping via `update(dW, lr, clipValue)`. |
23
23
  | `EmbeddingMatrix` | Lookup-table embedding matrix with SGD updates. |
24
24
  | `sigmoid` `relu` `tanh` `linear` | Built-in activation functions. |
25
25
  | `SGD` `Momentum` `Adam` | Optimizers. Each instance tracks its own state per weight. |
@@ -258,6 +258,7 @@ const targets = [...]; // 81*9 one-hot values
258
258
  const mask = puzzle.map(v => v === 0); // only train on empty cells
259
259
 
260
260
  const loss = net.train(puzzle, targets, 0.001, mask);
261
+ // loss is cross-entropy (not MSE), averaged over the trained cells — starts near ln(9) ≈ 2.2 (uniform guess over 9 classes) and decreases toward 0 as training progresses
261
262
  const logits = net.predict(puzzle); // 729 logits (81 × 9)
262
263
 
263
264
  // Attention weights from all blocks for visualization
@@ -270,8 +271,10 @@ Each head in each block learns a different type of relationship (row, column,
270
271
 
271
272
  ## Possible improvements
272
273
 
273
- 1. **Support for batches** in training to improve efficiency.
274
- 2. **Improve documentation** with more advanced examples and use cases.
274
+ 1. **Support for batches** in training to improve efficiency and gradient stability.
275
+ 2. **Global gradient norm clipping** — `WeightMatrix.update` supports per-element clipping; a utility to clip across all matrices by total norm would be more principled.
276
+ 3. **Learning rate warmup** — standard practice for Transformers; ramp LR from 0 to target over the first N steps.
277
+ 4. **Pre-norm architecture** — applying LayerNorm to each sub-layer's input, inside the residual branch (instead of after the residual add), is more stable for deep stacks.
275
278
 
276
279
  ## License
277
280
 
package/dist/index.d.mts CHANGED
@@ -173,7 +173,7 @@ declare class WeightMatrix {
173
173
  W: number[][];
174
174
  private opts;
175
175
  constructor(rows: number, cols: number);
176
- update(dW: number[][], lr: number): void;
176
+ update(dW: number[][], lr: number, clipValue?: number): void;
177
177
  }
178
178
  declare class EmbeddingMatrix {
179
179
  W: number[][];
package/dist/index.d.ts CHANGED
@@ -173,7 +173,7 @@ declare class WeightMatrix {
173
173
  W: number[][];
174
174
  private opts;
175
175
  constructor(rows: number, cols: number);
176
- update(dW: number[][], lr: number): void;
176
+ update(dW: number[][], lr: number, clipValue?: number): void;
177
177
  }
178
178
  declare class EmbeddingMatrix {
179
179
  W: number[][];
package/dist/index.js CHANGED
@@ -579,10 +579,15 @@ var WeightMatrix = class {
579
579
  );
580
580
  }
581
581
  // Apply pre-computed gradient (same shape as W).
582
- update(dW, lr) {
582
+ // clipValue: optional per-element gradient clipping before the Adam step.
583
+ // Prevents gradient explosion in deep networks (e.g. Transformers without
584
+ // global norm clipping). Pass e.g. 1.0 to clip to [-1, 1].
585
+ update(dW, lr, clipValue = Infinity) {
583
586
  for (let i = 0; i < this.W.length; i++)
584
- for (let j = 0; j < this.W[0].length; j++)
585
- this.W[i][j] = this.opts[i][j].step(this.W[i][j], dW[i][j], lr);
587
+ for (let j = 0; j < this.W[0].length; j++) {
588
+ const g = isFinite(clipValue) ? Math.max(-clipValue, Math.min(clipValue, dW[i][j])) : dW[i][j];
589
+ this.W[i][j] = this.opts[i][j].step(this.W[i][j], g, lr);
590
+ }
586
591
  }
587
592
  };
588
593
  var EmbeddingMatrix = class {
@@ -1036,14 +1041,14 @@ var NetworkTransformer = class {
1036
1041
  const dLogits = Array.from({ length: this.seqLen }, (_, i) => {
1037
1042
  if (mask && !mask[i]) return new Array(this.nClasses).fill(0);
1038
1043
  count++;
1039
- return Array.from({ length: this.nClasses }, (_2, c) => {
1044
+ const probs = softmax(logits[i]);
1045
+ for (let c = 0; c < this.nClasses; c++) {
1040
1046
  const t = targets[i * this.nClasses + c];
1041
- const p = logits[i][c];
1042
- loss += (p - t) ** 2;
1043
- return 2 * (p - t);
1044
- });
1047
+ if (t > 0) loss -= Math.log(Math.max(probs[c], 1e-7));
1048
+ }
1049
+ return probs.map((p, c) => p - targets[i * this.nClasses + c]);
1045
1050
  });
1046
- if (count > 0) loss /= count * this.nClasses;
1051
+ if (count > 0) loss /= count;
1047
1052
  const dH = Array.from(
1048
1053
  { length: this.seqLen },
1049
1054
  (_, i) => Array.from(
package/dist/index.mjs CHANGED
@@ -520,10 +520,15 @@ var WeightMatrix = class {
520
520
  );
521
521
  }
522
522
  // Apply pre-computed gradient (same shape as W).
523
- update(dW, lr) {
523
+ // clipValue: optional per-element gradient clipping before the Adam step.
524
+ // Prevents gradient explosion in deep networks (e.g. Transformers without
525
+ // global norm clipping). Pass e.g. 1.0 to clip to [-1, 1].
526
+ update(dW, lr, clipValue = Infinity) {
524
527
  for (let i = 0; i < this.W.length; i++)
525
- for (let j = 0; j < this.W[0].length; j++)
526
- this.W[i][j] = this.opts[i][j].step(this.W[i][j], dW[i][j], lr);
528
+ for (let j = 0; j < this.W[0].length; j++) {
529
+ const g = isFinite(clipValue) ? Math.max(-clipValue, Math.min(clipValue, dW[i][j])) : dW[i][j];
530
+ this.W[i][j] = this.opts[i][j].step(this.W[i][j], g, lr);
531
+ }
527
532
  }
528
533
  };
529
534
  var EmbeddingMatrix = class {
@@ -977,14 +982,14 @@ var NetworkTransformer = class {
977
982
  const dLogits = Array.from({ length: this.seqLen }, (_, i) => {
978
983
  if (mask && !mask[i]) return new Array(this.nClasses).fill(0);
979
984
  count++;
980
- return Array.from({ length: this.nClasses }, (_2, c) => {
985
+ const probs = softmax(logits[i]);
986
+ for (let c = 0; c < this.nClasses; c++) {
981
987
  const t = targets[i * this.nClasses + c];
982
- const p = logits[i][c];
983
- loss += (p - t) ** 2;
984
- return 2 * (p - t);
985
- });
988
+ if (t > 0) loss -= Math.log(Math.max(probs[c], 1e-7));
989
+ }
990
+ return probs.map((p, c) => p - targets[i * this.nClasses + c]);
986
991
  });
987
- if (count > 0) loss /= count * this.nClasses;
992
+ if (count > 0) loss /= count;
988
993
  const dH = Array.from(
989
994
  { length: this.seqLen },
990
995
  (_, i) => Array.from(
package/package.json CHANGED
@@ -1,6 +1,6 @@
1
1
  {
2
2
  "name": "@dniskav/neuron",
3
- "version": "0.2.0",
3
+ "version": "0.2.1",
4
4
  "description": "Minimal neural network from scratch — neuron, layer, network, backpropagation. No dependencies.",
5
5
  "main": "dist/index.js",
6
6
  "module": "dist/index.mjs",