@dniskav/neuron 0.1.6 → 0.2.1

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -14,6 +14,13 @@ A minimal, dependency-free neural network library built from scratch in TypeScri
14
14
  | `NetworkN` | Deep network of arbitrary depth. Define your architecture as `[inputs, ...hidden, outputs]`. |
15
15
  | `LSTMLayer` | Recurrent layer with persistent hidden and cell state. Learns sequences via BPTT. |
16
16
  | `NetworkLSTM` | Wraps an `LSTMLayer` + dense layers. Maintains memory across steps within an episode. |
17
+ | `NetworkTransformer` | Full token-classification Transformer: embeddings → N blocks → per-token logits. |
18
+ | `TransformerBlock` | One Transformer block: multi-head attention + FFN + LayerNorm × 2 with residuals. |
19
+ | `MultiHeadAttention` | N parallel attention heads concatenated and projected to `d_model`. |
20
+ | `AttentionHead` | Single scaled dot-product self-attention head (Q / K / V projections + backprop). |
21
+ | `LayerNorm` | Layer normalization with learnable γ / β per feature. |
22
+ | `WeightMatrix` | 2D weight matrix with per-scalar Adam optimizers. Optional per-element gradient clipping via `update(dW, lr, clipValue)`. |
23
+ | `EmbeddingMatrix` | Lookup-table embedding matrix with SGD updates. |
17
24
  | `sigmoid` `relu` `tanh` `linear` | Built-in activation functions. |
18
25
  | `SGD` `Momentum` `Adam` | Optimizers. Each instance tracks its own state per weight. |
19
26
  | `mse` `crossEntropy` | Loss functions for evaluation and logging. |
@@ -230,10 +237,44 @@ npm run dev # watch mode
230
237
 
231
238
  If you are an AI agent or LLM working with this codebase, read [AGENTS.md](AGENTS.md) first. It contains the full class hierarchy, design constraints, and what this library does not do.
232
239
 
240
+ ### NetworkTransformer — self-attention over sequences
241
+
242
+ ```ts
243
+ import { NetworkTransformer } from "@dniskav/neuron";
244
+
245
+ // Sudoku solver: 81 cells (tokens), values 0–9, predict digit 1–9 per cell
246
+ const net = new NetworkTransformer(81, {
247
+ vocabSize: 10, // digits 0–9
248
+ d_model: 64, // embedding / hidden dimension
249
+ nHeads: 4, // attention heads (d_k = d_model / nHeads = 16)
250
+ d_ff: 128, // FFN hidden size
251
+ nBlocks: 4, // number of transformer blocks
252
+ nClasses: 9, // output classes per token (digits 1–9)
253
+ });
254
+
255
+ // tokens: 81 cell values (0 = empty)
256
+ const puzzle = [5,3,0, 0,7,0, 0,0,0, ...];
257
+ const targets = [...]; // 81*9 one-hot values
258
+ const mask = puzzle.map(v => v === 0); // only train on empty cells
259
+
260
+ const loss = net.train(puzzle, targets, 0.001, mask);
261
+ // loss is cross-entropy (not MSE) — decreases from ~2.2 toward 0 as training progresses
262
+ const logits = net.predict(puzzle); // 729 logits (81 × 9)
263
+
264
+ // Attention weights from all blocks for visualization
265
+ const weights = net.getAttentionWeights();
266
+ // weights[blockIdx][headIdx] → seqLen × seqLen matrix
267
+ ```
268
+
269
+ Each head in each block can learn a different type of relationship (row, column,
270
+ 3×3 box). The network figures this out by itself through training.
271
+
233
272
  ## Possible improvements
234
273
 
235
- 1. **Support for batches** in training to improve efficiency.
236
- 2. **Improve documentation** with more advanced examples and use cases.
274
+ 1. **Support for batches** in training to improve efficiency and gradient stability.
275
+ 2. **Global gradient norm clipping** — `WeightMatrix.update` supports per-element clipping; a utility to clip across all matrices by total norm would be more principled.
276
+ 3. **Learning rate warmup** — standard practice for Transformers; ramp LR from 0 to target over the first N steps.
277
+ 4. **Pre-norm architecture** — applying LayerNorm to each sub-layer's input (before attention / FFN) instead of after the residual add is more stable for deep stacks.
237
278
 
238
279
  ## License
239
280
 
package/dist/index.d.mts CHANGED
@@ -165,10 +165,119 @@ declare class NetworkLSTM {
165
165
  setWeights(data: ReturnType<NetworkLSTM["getWeights"]>): void;
166
166
  }
167
167
 
168
+ declare function matMul(A: number[][], B: number[][]): number[][];
169
+ declare function transpose(A: number[][]): number[][];
170
+ declare function softmax(row: number[]): number[];
171
+ declare function softmaxBackward(dS: number[], s: number[]): number[];
172
+ declare class WeightMatrix {
173
+ W: number[][];
174
+ private opts;
175
+ constructor(rows: number, cols: number);
176
+ update(dW: number[][], lr: number, clipValue?: number): void;
177
+ }
178
+ declare class EmbeddingMatrix {
179
+ W: number[][];
180
+ constructor(vocabSize: number, d_model: number);
181
+ get(idx: number): number[];
182
+ update(idx: number, grad: number[], lr: number): void;
183
+ }
184
+
185
+ declare class AttentionHead {
186
+ readonly d_k: number;
187
+ readonly d_v: number;
188
+ Wq: WeightMatrix;
189
+ Wk: WeightMatrix;
190
+ Wv: WeightMatrix;
191
+ private cache;
192
+ constructor(d_model: number, d_k: number, d_v: number);
193
+ predict(X: number[][]): number[][];
194
+ backward(dOut: number[][], lr: number): number[][];
195
+ getAttentionWeights(): number[][] | null;
196
+ }
197
+
198
+ declare class MultiHeadAttention {
199
+ readonly nHeads: number;
200
+ readonly d_model: number;
201
+ readonly d_k: number;
202
+ heads: AttentionHead[];
203
+ Wo: WeightMatrix;
204
+ private _concat;
205
+ constructor(d_model: number, nHeads: number);
206
+ predict(X: number[][]): number[][];
207
+ backward(dOut: number[][], lr: number): number[][];
208
+ getAttentionWeights(): (number[][] | null)[];
209
+ }
210
+
211
+ declare class LayerNorm {
212
+ gamma: number[];
213
+ beta: number[];
214
+ private readonly eps;
215
+ private _cache;
216
+ constructor(dim: number);
217
+ resetCache(seqLen: number): void;
218
+ predictOne(x: number[], pos: number): number[];
219
+ backwardOne(dOut: number[], pos: number, lr: number): number[];
220
+ }
221
+
222
+ interface TransformerBlockOptions {
223
+ d_model: number;
224
+ nHeads: number;
225
+ d_ff: number;
226
+ }
227
+ declare class TransformerBlock {
228
+ readonly d_model: number;
229
+ readonly d_ff: number;
230
+ attn: MultiHeadAttention;
231
+ norm1: LayerNorm;
232
+ norm2: LayerNorm;
233
+ ff1: WeightMatrix;
234
+ ff2: WeightMatrix;
235
+ b1: number[];
236
+ b2: number[];
237
+ private b1Opts;
238
+ private b2Opts;
239
+ private _X;
240
+ private _attnOut;
241
+ private _h1;
242
+ private _ff1Pre;
243
+ private _ff1Out;
244
+ private _ff2Out;
245
+ constructor({ d_model, nHeads, d_ff }: TransformerBlockOptions);
246
+ predict(X: number[][]): number[][];
247
+ backward(dOut: number[][], lr: number): number[][];
248
+ getAttentionWeights(): (number[][] | null)[];
249
+ }
250
+
251
+ interface NetworkTransformerOptions {
252
+ vocabSize?: number;
253
+ d_model?: number;
254
+ nHeads?: number;
255
+ d_ff?: number;
256
+ nBlocks?: number;
257
+ nClasses?: number;
258
+ }
259
+ declare class NetworkTransformer {
260
+ readonly seqLen: number;
261
+ readonly vocabSize: number;
262
+ readonly d_model: number;
263
+ readonly nClasses: number;
264
+ tokenEmb: EmbeddingMatrix;
265
+ posEmb: EmbeddingMatrix;
266
+ blocks: TransformerBlock[];
267
+ outputProj: WeightMatrix;
268
+ outputBias: number[];
269
+ private outBiasOpts;
270
+ constructor(seqLen: number, options?: NetworkTransformerOptions);
271
+ predict(tokens: number[]): number[];
272
+ train(tokens: number[], targets: number[], lr: number, mask?: boolean[]): number;
273
+ getAttentionWeights(): (number[][] | null)[][];
274
+ private _forward;
275
+ }
276
+
168
277
  declare function mse(predicted: number[], actual: number[]): number;
169
278
  declare function crossEntropy(predicted: number[], actual: number[]): number;
170
279
  declare function mseDelta(predicted: number, actual: number): number;
171
280
  declare function crossEntropyDelta(predicted: number, actual: number): number;
172
281
  declare function crossEntropyDeltaRaw(predicted: number, actual: number): number;
173
282
 
174
- export { type Activation, Adam, LSTMLayer, Layer, Momentum, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, SGD, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, elu, leakyRelu, linear, makeElu, makeLeakyRelu, mse, mseDelta, relu, sigmoid, tanh };
283
+ export { type Activation, Adam, AttentionHead, EmbeddingMatrix, LSTMLayer, Layer, LayerNorm, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, SGD, TransformerBlock, type TransformerBlockOptions, WeightMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, elu, leakyRelu, linear, makeElu, makeLeakyRelu, matMul, mse, mseDelta, relu, sigmoid, softmax, softmaxBackward, tanh, transpose };
package/dist/index.d.ts CHANGED
@@ -165,10 +165,119 @@ declare class NetworkLSTM {
165
165
  setWeights(data: ReturnType<NetworkLSTM["getWeights"]>): void;
166
166
  }
167
167
 
168
+ declare function matMul(A: number[][], B: number[][]): number[][];
169
+ declare function transpose(A: number[][]): number[][];
170
+ declare function softmax(row: number[]): number[];
171
+ declare function softmaxBackward(dS: number[], s: number[]): number[];
172
+ declare class WeightMatrix {
173
+ W: number[][];
174
+ private opts;
175
+ constructor(rows: number, cols: number);
176
+ update(dW: number[][], lr: number, clipValue?: number): void;
177
+ }
178
+ declare class EmbeddingMatrix {
179
+ W: number[][];
180
+ constructor(vocabSize: number, d_model: number);
181
+ get(idx: number): number[];
182
+ update(idx: number, grad: number[], lr: number): void;
183
+ }
184
+
185
+ declare class AttentionHead {
186
+ readonly d_k: number;
187
+ readonly d_v: number;
188
+ Wq: WeightMatrix;
189
+ Wk: WeightMatrix;
190
+ Wv: WeightMatrix;
191
+ private cache;
192
+ constructor(d_model: number, d_k: number, d_v: number);
193
+ predict(X: number[][]): number[][];
194
+ backward(dOut: number[][], lr: number): number[][];
195
+ getAttentionWeights(): number[][] | null;
196
+ }
197
+
198
+ declare class MultiHeadAttention {
199
+ readonly nHeads: number;
200
+ readonly d_model: number;
201
+ readonly d_k: number;
202
+ heads: AttentionHead[];
203
+ Wo: WeightMatrix;
204
+ private _concat;
205
+ constructor(d_model: number, nHeads: number);
206
+ predict(X: number[][]): number[][];
207
+ backward(dOut: number[][], lr: number): number[][];
208
+ getAttentionWeights(): (number[][] | null)[];
209
+ }
210
+
211
+ declare class LayerNorm {
212
+ gamma: number[];
213
+ beta: number[];
214
+ private readonly eps;
215
+ private _cache;
216
+ constructor(dim: number);
217
+ resetCache(seqLen: number): void;
218
+ predictOne(x: number[], pos: number): number[];
219
+ backwardOne(dOut: number[], pos: number, lr: number): number[];
220
+ }
221
+
222
+ interface TransformerBlockOptions {
223
+ d_model: number;
224
+ nHeads: number;
225
+ d_ff: number;
226
+ }
227
+ declare class TransformerBlock {
228
+ readonly d_model: number;
229
+ readonly d_ff: number;
230
+ attn: MultiHeadAttention;
231
+ norm1: LayerNorm;
232
+ norm2: LayerNorm;
233
+ ff1: WeightMatrix;
234
+ ff2: WeightMatrix;
235
+ b1: number[];
236
+ b2: number[];
237
+ private b1Opts;
238
+ private b2Opts;
239
+ private _X;
240
+ private _attnOut;
241
+ private _h1;
242
+ private _ff1Pre;
243
+ private _ff1Out;
244
+ private _ff2Out;
245
+ constructor({ d_model, nHeads, d_ff }: TransformerBlockOptions);
246
+ predict(X: number[][]): number[][];
247
+ backward(dOut: number[][], lr: number): number[][];
248
+ getAttentionWeights(): (number[][] | null)[];
249
+ }
250
+
251
+ interface NetworkTransformerOptions {
252
+ vocabSize?: number;
253
+ d_model?: number;
254
+ nHeads?: number;
255
+ d_ff?: number;
256
+ nBlocks?: number;
257
+ nClasses?: number;
258
+ }
259
+ declare class NetworkTransformer {
260
+ readonly seqLen: number;
261
+ readonly vocabSize: number;
262
+ readonly d_model: number;
263
+ readonly nClasses: number;
264
+ tokenEmb: EmbeddingMatrix;
265
+ posEmb: EmbeddingMatrix;
266
+ blocks: TransformerBlock[];
267
+ outputProj: WeightMatrix;
268
+ outputBias: number[];
269
+ private outBiasOpts;
270
+ constructor(seqLen: number, options?: NetworkTransformerOptions);
271
+ predict(tokens: number[]): number[];
272
+ train(tokens: number[], targets: number[], lr: number, mask?: boolean[]): number;
273
+ getAttentionWeights(): (number[][] | null)[][];
274
+ private _forward;
275
+ }
276
+
168
277
  declare function mse(predicted: number[], actual: number[]): number;
169
278
  declare function crossEntropy(predicted: number[], actual: number[]): number;
170
279
  declare function mseDelta(predicted: number, actual: number): number;
171
280
  declare function crossEntropyDelta(predicted: number, actual: number): number;
172
281
  declare function crossEntropyDeltaRaw(predicted: number, actual: number): number;
173
282
 
174
- export { type Activation, Adam, LSTMLayer, Layer, Momentum, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, SGD, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, elu, leakyRelu, linear, makeElu, makeLeakyRelu, mse, mseDelta, relu, sigmoid, tanh };
283
+ export { type Activation, Adam, AttentionHead, EmbeddingMatrix, LSTMLayer, Layer, LayerNorm, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, SGD, TransformerBlock, type TransformerBlockOptions, WeightMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, elu, leakyRelu, linear, makeElu, makeLeakyRelu, matMul, mse, mseDelta, relu, sigmoid, softmax, softmaxBackward, tanh, transpose };