@dniskav/neuron 0.1.6 → 0.2.0

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
package/README.md CHANGED
@@ -14,6 +14,13 @@ A minimal, dependency-free neural network library built from scratch in TypeScri
14
14
  | `NetworkN` | Deep network of arbitrary depth. Define your architecture as `[inputs, ...hidden, outputs]`. |
15
15
  | `LSTMLayer` | Recurrent layer with persistent hidden and cell state. Learns sequences via BPTT. |
16
16
  | `NetworkLSTM` | Wraps an `LSTMLayer` + dense layers. Maintains memory across steps within an episode. |
17
+ | `NetworkTransformer` | Full token-classification Transformer: embeddings → N blocks → per-token logits. |
18
+ | `TransformerBlock` | One Transformer block: multi-head attention + FFN + LayerNorm × 2 with residuals. |
19
+ | `MultiHeadAttention` | N parallel attention heads concatenated and projected to `d_model`. |
20
+ | `AttentionHead` | Single scaled dot-product self-attention head (Q / K / V projections + backprop). |
21
+ | `LayerNorm` | Layer normalization with learnable γ / β per feature. |
22
+ | `WeightMatrix` | 2D weight matrix with per-scalar Adam optimizers. |
23
+ | `EmbeddingMatrix` | Lookup-table embedding matrix with SGD updates. |
17
24
  | `sigmoid` `relu` `tanh` `linear` | Built-in activation functions. |
18
25
  | `SGD` `Momentum` `Adam` | Optimizers. Each instance tracks its own state per weight. |
19
26
  | `mse` `crossEntropy` | Loss functions for evaluation and logging. |
@@ -230,6 +237,37 @@ npm run dev # watch mode
230
237
 
231
238
  If you are an AI agent or LLM working with this codebase, read [AGENTS.md](AGENTS.md) first. It contains the full class hierarchy, design constraints, and what this library does not do.
232
239
 
240
+ ### NetworkTransformer — self-attention over sequences
241
+
242
+ ```ts
243
+ import { NetworkTransformer } from "@dniskav/neuron";
244
+
245
+ // Sudoku solver: 81 cells (tokens), values 0–9, predict digit 1–9 per cell
246
+ const net = new NetworkTransformer(81, {
247
+ vocabSize: 10, // digits 0–9
248
+ d_model: 64, // embedding / hidden dimension
249
+ nHeads: 4, // attention heads (d_k = d_model / nHeads = 16)
250
+ d_ff: 128, // FFN hidden size
251
+ nBlocks: 4, // number of transformer blocks
252
+ nClasses: 9, // output classes per token (digits 1–9)
253
+ });
254
+
255
+ // tokens: 81 cell values (0 = empty)
256
+ const puzzle = [5,3,0, 0,7,0, 0,0,0, ...];
257
+ const targets = [...]; // flat array of 729 one-hot values (81 cells × 9 classes)
258
+ const mask = puzzle.map(v => v === 0); // only train on empty cells
259
+
260
+ const loss = net.train(puzzle, targets, 0.001, mask);
261
+ const logits = net.predict(puzzle); // 729 logits (81 × 9)
262
+
263
+ // Attention weights from all blocks for visualization
264
+ const weights = net.getAttentionWeights();
265
+ // weights[blockIdx][headIdx] → seqLen × seqLen matrix
266
+ ```
267
+
268
+ Each head in each block can learn a different type of relationship (for example row, column,
269
+ or 3×3 box constraints). The network discovers these patterns on its own during training.
270
+
233
271
  ## Possible improvements
234
272
 
235
273
  1. **Support for batches** in training to improve efficiency.
package/dist/index.d.mts CHANGED
@@ -165,10 +165,119 @@ declare class NetworkLSTM {
165
165
  setWeights(data: ReturnType<NetworkLSTM["getWeights"]>): void;
166
166
  }
167
167
 
168
+ declare function matMul(A: number[][], B: number[][]): number[][];
169
+ declare function transpose(A: number[][]): number[][];
170
+ declare function softmax(row: number[]): number[];
171
+ declare function softmaxBackward(dS: number[], s: number[]): number[];
172
+ declare class WeightMatrix {
173
+ W: number[][];
174
+ private opts;
175
+ constructor(rows: number, cols: number);
176
+ update(dW: number[][], lr: number): void;
177
+ }
178
+ declare class EmbeddingMatrix {
179
+ W: number[][];
180
+ constructor(vocabSize: number, d_model: number);
181
+ get(idx: number): number[];
182
+ update(idx: number, grad: number[], lr: number): void;
183
+ }
184
+
185
+ declare class AttentionHead {
186
+ readonly d_k: number;
187
+ readonly d_v: number;
188
+ Wq: WeightMatrix;
189
+ Wk: WeightMatrix;
190
+ Wv: WeightMatrix;
191
+ private cache;
192
+ constructor(d_model: number, d_k: number, d_v: number);
193
+ predict(X: number[][]): number[][];
194
+ backward(dOut: number[][], lr: number): number[][];
195
+ getAttentionWeights(): number[][] | null;
196
+ }
197
+
198
+ declare class MultiHeadAttention {
199
+ readonly nHeads: number;
200
+ readonly d_model: number;
201
+ readonly d_k: number;
202
+ heads: AttentionHead[];
203
+ Wo: WeightMatrix;
204
+ private _concat;
205
+ constructor(d_model: number, nHeads: number);
206
+ predict(X: number[][]): number[][];
207
+ backward(dOut: number[][], lr: number): number[][];
208
+ getAttentionWeights(): (number[][] | null)[];
209
+ }
210
+
211
+ declare class LayerNorm {
212
+ gamma: number[];
213
+ beta: number[];
214
+ private readonly eps;
215
+ private _cache;
216
+ constructor(dim: number);
217
+ resetCache(seqLen: number): void;
218
+ predictOne(x: number[], pos: number): number[];
219
+ backwardOne(dOut: number[], pos: number, lr: number): number[];
220
+ }
221
+
222
+ interface TransformerBlockOptions {
223
+ d_model: number;
224
+ nHeads: number;
225
+ d_ff: number;
226
+ }
227
+ declare class TransformerBlock {
228
+ readonly d_model: number;
229
+ readonly d_ff: number;
230
+ attn: MultiHeadAttention;
231
+ norm1: LayerNorm;
232
+ norm2: LayerNorm;
233
+ ff1: WeightMatrix;
234
+ ff2: WeightMatrix;
235
+ b1: number[];
236
+ b2: number[];
237
+ private b1Opts;
238
+ private b2Opts;
239
+ private _X;
240
+ private _attnOut;
241
+ private _h1;
242
+ private _ff1Pre;
243
+ private _ff1Out;
244
+ private _ff2Out;
245
+ constructor({ d_model, nHeads, d_ff }: TransformerBlockOptions);
246
+ predict(X: number[][]): number[][];
247
+ backward(dOut: number[][], lr: number): number[][];
248
+ getAttentionWeights(): (number[][] | null)[];
249
+ }
250
+
251
+ interface NetworkTransformerOptions {
252
+ vocabSize?: number;
253
+ d_model?: number;
254
+ nHeads?: number;
255
+ d_ff?: number;
256
+ nBlocks?: number;
257
+ nClasses?: number;
258
+ }
259
+ declare class NetworkTransformer {
260
+ readonly seqLen: number;
261
+ readonly vocabSize: number;
262
+ readonly d_model: number;
263
+ readonly nClasses: number;
264
+ tokenEmb: EmbeddingMatrix;
265
+ posEmb: EmbeddingMatrix;
266
+ blocks: TransformerBlock[];
267
+ outputProj: WeightMatrix;
268
+ outputBias: number[];
269
+ private outBiasOpts;
270
+ constructor(seqLen: number, options?: NetworkTransformerOptions);
271
+ predict(tokens: number[]): number[];
272
+ train(tokens: number[], targets: number[], lr: number, mask?: boolean[]): number;
273
+ getAttentionWeights(): (number[][] | null)[][];
274
+ private _forward;
275
+ }
276
+
168
277
  declare function mse(predicted: number[], actual: number[]): number;
169
278
  declare function crossEntropy(predicted: number[], actual: number[]): number;
170
279
  declare function mseDelta(predicted: number, actual: number): number;
171
280
  declare function crossEntropyDelta(predicted: number, actual: number): number;
172
281
  declare function crossEntropyDeltaRaw(predicted: number, actual: number): number;
173
282
 
174
- export { type Activation, Adam, LSTMLayer, Layer, Momentum, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, SGD, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, elu, leakyRelu, linear, makeElu, makeLeakyRelu, mse, mseDelta, relu, sigmoid, tanh };
283
+ export { type Activation, Adam, AttentionHead, EmbeddingMatrix, LSTMLayer, Layer, LayerNorm, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, SGD, TransformerBlock, type TransformerBlockOptions, WeightMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, elu, leakyRelu, linear, makeElu, makeLeakyRelu, matMul, mse, mseDelta, relu, sigmoid, softmax, softmaxBackward, tanh, transpose };
package/dist/index.d.ts CHANGED
@@ -165,10 +165,119 @@ declare class NetworkLSTM {
165
165
  setWeights(data: ReturnType<NetworkLSTM["getWeights"]>): void;
166
166
  }
167
167
 
168
+ declare function matMul(A: number[][], B: number[][]): number[][];
169
+ declare function transpose(A: number[][]): number[][];
170
+ declare function softmax(row: number[]): number[];
171
+ declare function softmaxBackward(dS: number[], s: number[]): number[];
172
+ declare class WeightMatrix {
173
+ W: number[][];
174
+ private opts;
175
+ constructor(rows: number, cols: number);
176
+ update(dW: number[][], lr: number): void;
177
+ }
178
+ declare class EmbeddingMatrix {
179
+ W: number[][];
180
+ constructor(vocabSize: number, d_model: number);
181
+ get(idx: number): number[];
182
+ update(idx: number, grad: number[], lr: number): void;
183
+ }
184
+
185
+ declare class AttentionHead {
186
+ readonly d_k: number;
187
+ readonly d_v: number;
188
+ Wq: WeightMatrix;
189
+ Wk: WeightMatrix;
190
+ Wv: WeightMatrix;
191
+ private cache;
192
+ constructor(d_model: number, d_k: number, d_v: number);
193
+ predict(X: number[][]): number[][];
194
+ backward(dOut: number[][], lr: number): number[][];
195
+ getAttentionWeights(): number[][] | null;
196
+ }
197
+
198
+ declare class MultiHeadAttention {
199
+ readonly nHeads: number;
200
+ readonly d_model: number;
201
+ readonly d_k: number;
202
+ heads: AttentionHead[];
203
+ Wo: WeightMatrix;
204
+ private _concat;
205
+ constructor(d_model: number, nHeads: number);
206
+ predict(X: number[][]): number[][];
207
+ backward(dOut: number[][], lr: number): number[][];
208
+ getAttentionWeights(): (number[][] | null)[];
209
+ }
210
+
211
+ declare class LayerNorm {
212
+ gamma: number[];
213
+ beta: number[];
214
+ private readonly eps;
215
+ private _cache;
216
+ constructor(dim: number);
217
+ resetCache(seqLen: number): void;
218
+ predictOne(x: number[], pos: number): number[];
219
+ backwardOne(dOut: number[], pos: number, lr: number): number[];
220
+ }
221
+
222
+ interface TransformerBlockOptions {
223
+ d_model: number;
224
+ nHeads: number;
225
+ d_ff: number;
226
+ }
227
+ declare class TransformerBlock {
228
+ readonly d_model: number;
229
+ readonly d_ff: number;
230
+ attn: MultiHeadAttention;
231
+ norm1: LayerNorm;
232
+ norm2: LayerNorm;
233
+ ff1: WeightMatrix;
234
+ ff2: WeightMatrix;
235
+ b1: number[];
236
+ b2: number[];
237
+ private b1Opts;
238
+ private b2Opts;
239
+ private _X;
240
+ private _attnOut;
241
+ private _h1;
242
+ private _ff1Pre;
243
+ private _ff1Out;
244
+ private _ff2Out;
245
+ constructor({ d_model, nHeads, d_ff }: TransformerBlockOptions);
246
+ predict(X: number[][]): number[][];
247
+ backward(dOut: number[][], lr: number): number[][];
248
+ getAttentionWeights(): (number[][] | null)[];
249
+ }
250
+
251
+ interface NetworkTransformerOptions {
252
+ vocabSize?: number;
253
+ d_model?: number;
254
+ nHeads?: number;
255
+ d_ff?: number;
256
+ nBlocks?: number;
257
+ nClasses?: number;
258
+ }
259
+ declare class NetworkTransformer {
260
+ readonly seqLen: number;
261
+ readonly vocabSize: number;
262
+ readonly d_model: number;
263
+ readonly nClasses: number;
264
+ tokenEmb: EmbeddingMatrix;
265
+ posEmb: EmbeddingMatrix;
266
+ blocks: TransformerBlock[];
267
+ outputProj: WeightMatrix;
268
+ outputBias: number[];
269
+ private outBiasOpts;
270
+ constructor(seqLen: number, options?: NetworkTransformerOptions);
271
+ predict(tokens: number[]): number[];
272
+ train(tokens: number[], targets: number[], lr: number, mask?: boolean[]): number;
273
+ getAttentionWeights(): (number[][] | null)[][];
274
+ private _forward;
275
+ }
276
+
168
277
  declare function mse(predicted: number[], actual: number[]): number;
169
278
  declare function crossEntropy(predicted: number[], actual: number[]): number;
170
279
  declare function mseDelta(predicted: number, actual: number): number;
171
280
  declare function crossEntropyDelta(predicted: number, actual: number): number;
172
281
  declare function crossEntropyDeltaRaw(predicted: number, actual: number): number;
173
282
 
174
- export { type Activation, Adam, LSTMLayer, Layer, Momentum, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, SGD, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, elu, leakyRelu, linear, makeElu, makeLeakyRelu, mse, mseDelta, relu, sigmoid, tanh };
283
+ export { type Activation, Adam, AttentionHead, EmbeddingMatrix, LSTMLayer, Layer, LayerNorm, Momentum, MultiHeadAttention, Network, NetworkLSTM, type NetworkLSTMOptions, NetworkN, type NetworkNOptions, NetworkTransformer, type NetworkTransformerOptions, Neuron, NeuronN, type Optimizer, type OptimizerFactory, SGD, TransformerBlock, type TransformerBlockOptions, WeightMatrix, crossEntropy, crossEntropyDelta, crossEntropyDeltaRaw, elu, leakyRelu, linear, makeElu, makeLeakyRelu, matMul, mse, mseDelta, relu, sigmoid, softmax, softmaxBackward, tanh, transpose };