neuralnetworknumpy 0.2.0__tar.gz → 0.3.0__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neuralnetworknumpy-0.3.0/PKG-INFO +394 -0
- neuralnetworknumpy-0.3.0/README.md +375 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/__init__.py +46 -2
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/backend.py +1 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/Activation.py +16 -15
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/AveragePooling2D.py +12 -4
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/BatchNorm.py +14 -6
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/BatchNorm2D.py +13 -4
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/Conv2D.py +16 -5
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/Dense.py +38 -6
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/DepthwiseConv2D.py +3 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/DepthwiseSeparableConv2D.py +43 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/Dropout.py +9 -5
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/Embedding.py +92 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/FeedForwardNetwork.py +50 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/Flatten.py +5 -3
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/GRU.py +411 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/GlobalAveragePooling2D.py +6 -4
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/GroupConv2D.py +17 -6
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/LSTM.py +481 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/Layer.py +74 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/LayerNorm.py +127 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/MaxPooling2D.py +12 -4
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/MultiHeadAttention.py +60 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/PositionEmbedding.py +120 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/RNN.py +222 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/ResidualBlock.py +27 -9
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/Seq2Seq.py +182 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/SingleHeadAttention.py +218 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/SpatiallySeparableConv2D.py +15 -9
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/layers/TransformerBlock.py +85 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy/layers/__init__.py +33 -13
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/learning_rate.py +57 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/masks.py +16 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/model.py +722 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/tokenizer/Tokenizer.py +226 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/tokenizer/__init__.py +7 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/tokenizer/build_tokenizer.py +27 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/tokenizer/load_test.py +16 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/tokenizer/pretrained.py +21 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/utils/History.py +12 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/utils/Scaler.py +46 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/utils/__init__.py +10 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy/utils/split.py +45 -0
- neuralnetworknumpy-0.3.0/neuralnetworknumpy.egg-info/PKG-INFO +394 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy.egg-info/SOURCES.txt +23 -3
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/pyproject.toml +1 -1
- neuralnetworknumpy-0.2.0/PKG-INFO +0 -284
- neuralnetworknumpy-0.2.0/README.md +0 -265
- neuralnetworknumpy-0.2.0/neuralnetworknumpy/layers/DepthwiseSeparableConv2D.py +0 -33
- neuralnetworknumpy-0.2.0/neuralnetworknumpy/layers/Layer.py +0 -47
- neuralnetworknumpy-0.2.0/neuralnetworknumpy/model.py +0 -676
- neuralnetworknumpy-0.2.0/neuralnetworknumpy/utils.py +0 -87
- neuralnetworknumpy-0.2.0/neuralnetworknumpy.egg-info/PKG-INFO +0 -284
- neuralnetworknumpy-0.2.0/tests/test_load.py +0 -66
- neuralnetworknumpy-0.2.0/tests/test_model.py +0 -66
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy.egg-info/dependency_links.txt +0 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy.egg-info/requires.txt +0 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/neuralnetworknumpy.egg-info/top_level.txt +0 -0
- {neuralnetworknumpy-0.2.0 → neuralnetworknumpy-0.3.0}/setup.cfg +0 -0
|
@@ -0,0 +1,394 @@
|
|
|
1
|
+
Metadata-Version: 2.4
|
|
2
|
+
Name: neuralnetworknumpy
|
|
3
|
+
Version: 0.3.0
|
|
4
|
+
Summary: A neural network framework built completely from scratch using NumPy
|
|
5
|
+
Author: Itamar Senderovitz
|
|
6
|
+
License: MIT
|
|
7
|
+
Project-URL: Homepage, https://github.com/Sendy45/NeuralNetworkFromScratch
|
|
8
|
+
Keywords: neural network,deep learning,machine learning,numpy
|
|
9
|
+
Classifier: Programming Language :: Python :: 3
|
|
10
|
+
Classifier: Programming Language :: Python :: 3.8
|
|
11
|
+
Classifier: Programming Language :: Python :: 3.9
|
|
12
|
+
Classifier: Programming Language :: Python :: 3.10
|
|
13
|
+
Classifier: License :: OSI Approved :: MIT License
|
|
14
|
+
Classifier: Operating System :: OS Independent
|
|
15
|
+
Requires-Python: >=3.8
|
|
16
|
+
Description-Content-Type: text/markdown
|
|
17
|
+
Requires-Dist: numpy>=1.21
|
|
18
|
+
Requires-Dist: tqdm>=4.60
|
|
19
|
+
|
|
20
|
+
# neuralnetworknumpy
|
|
21
|
+
|
|
22
|
+
A deep learning framework built from scratch using NumPy. Implements forward propagation, backpropagation, convolutional layers, recurrent layers, transformer blocks, residual connections, and common optimizers — no PyTorch or TensorFlow required.
|
|
23
|
+
|
|
24
|
+
---
|
|
25
|
+
|
|
26
|
+
## Installation
|
|
27
|
+
|
|
28
|
+
```bash
|
|
29
|
+
pip install neuralnetworknumpy
|
|
30
|
+
```
|
|
31
|
+
|
|
32
|
+
Or from source:
|
|
33
|
+
|
|
34
|
+
```bash
|
|
35
|
+
git clone https://github.com/Sendy45/NeuralNetworkFromScratch.git
|
|
36
|
+
cd NeuralNetworkFromScratch
|
|
37
|
+
pip install .
|
|
38
|
+
```
|
|
39
|
+
|
|
40
|
+
**Dependencies:** `numpy`, `tqdm`
|
|
41
|
+
**Optional (for examples):** `keras` (datasets only)
|
|
42
|
+
|
|
43
|
+
---
|
|
44
|
+
|
|
45
|
+
## Quick Start
|
|
46
|
+
|
|
47
|
+
### Dense network (MNIST)
|
|
48
|
+
|
|
49
|
+
```python
|
|
50
|
+
import numpy as np
|
|
51
|
+
from keras.datasets import mnist
|
|
52
|
+
from neuralnetworknumpy import NeuralNetwork
|
|
53
|
+
from neuralnetworknumpy.layers import Dense, ReLu, BatchNorm, Dropout, Softmax
|
|
54
|
+
|
|
55
|
+
(X_train, y_train), (X_test, y_test) = mnist.load_data()
|
|
56
|
+
X_train = X_train.reshape(-1, 784).astype(np.float32) / 255.0
|
|
57
|
+
X_test = X_test.reshape(-1, 784).astype(np.float32) / 255.0
|
|
58
|
+
|
|
59
|
+
model = NeuralNetwork([
|
|
60
|
+
Dense(256), ReLu(), BatchNorm(), Dropout(0.2),
|
|
61
|
+
Dense(128), ReLu(), BatchNorm(),
|
|
62
|
+
Dense(10), Softmax()
|
|
63
|
+
])
|
|
64
|
+
|
|
65
|
+
model.compile(optimizer="adam", loss_type="cross_entropy", lr=0.001)
|
|
66
|
+
history = model.fit(X_train, y_train, X_val=X_test, y_val=y_test, epochs=10, batch_size=64)
|
|
67
|
+
print(f"Val accuracy: {model.evaluate(X_test, y_test):.4f}")
|
|
68
|
+
```
|
|
69
|
+
|
|
70
|
+
### Convolutional network with residual blocks (Fashion-MNIST)
|
|
71
|
+
|
|
72
|
+
```python
|
|
73
|
+
import numpy as np
|
|
74
|
+
from keras.datasets import fashion_mnist
|
|
75
|
+
from neuralnetworknumpy import NeuralNetwork
|
|
76
|
+
from neuralnetworknumpy.layers import (
|
|
77
|
+
Conv2D, DepthwiseSeparableConv2D, MaxPooling2D, AveragePooling2D,
|
|
78
|
+
Flatten, Dense, ReLu, Softmax, BatchNorm2D, ResidualBlock
|
|
79
|
+
)
|
|
80
|
+
|
|
81
|
+
(train_X, train_y), (test_X, test_y) = fashion_mnist.load_data()
|
|
82
|
+
train_X = train_X.astype(np.float32) / 255.0
|
|
83
|
+
test_X = test_X.astype(np.float32) / 255.0
|
|
84
|
+
train_X = train_X[..., np.newaxis] # (N, 28, 28, 1)
|
|
85
|
+
test_X = test_X[..., np.newaxis]
|
|
86
|
+
|
|
87
|
+
def res_block(in_ch, out_ch, stride=(1, 1)):
|
|
88
|
+
projection = None
|
|
89
|
+
if in_ch != out_ch or stride != (1, 1):
|
|
90
|
+
projection = Conv2D(out_ch, 1, strides=stride, padding="same")
|
|
91
|
+
return ResidualBlock([
|
|
92
|
+
Conv2D(out_ch, (3, 3), strides=stride, padding="same"),
|
|
93
|
+
BatchNorm2D(), ReLu(),
|
|
94
|
+
DepthwiseSeparableConv2D(out_ch, (3, 3), padding="same"),
|
|
95
|
+
BatchNorm2D(), ReLu(),
|
|
96
|
+
], projection=projection)
|
|
97
|
+
|
|
98
|
+
model = NeuralNetwork([
|
|
99
|
+
Conv2D(16, (3, 3), padding="same"), BatchNorm2D(), ReLu(),
|
|
100
|
+
MaxPooling2D((2, 2)), # → 14×14×16
|
|
101
|
+
res_block(16, 32),
|
|
102
|
+
MaxPooling2D((2, 2)), # → 7×7×32
|
|
103
|
+
res_block(32, 64),
|
|
104
|
+
AveragePooling2D((7, 7)), # global avg pool → 1×1×64
|
|
105
|
+
Flatten(),
|
|
106
|
+
Dense(128), ReLu(),
|
|
107
|
+
Dense(10), Softmax()
|
|
108
|
+
])
|
|
109
|
+
|
|
110
|
+
model.compile(optimizer="adam", loss_type="cross_entropy", lr=0.001, lambda_=0.0001)
|
|
111
|
+
history = model.fit(train_X, train_y, X_val=test_X, y_val=test_y, epochs=20, batch_size=64)
|
|
112
|
+
model.save("fashion_model")
|
|
113
|
+
```
|
|
114
|
+
|
|
115
|
+
### Transformer language model (WikiText)
|
|
116
|
+
|
|
117
|
+
```python
|
|
118
|
+
import numpy as np
|
|
119
|
+
from neuralnetworknumpy import NeuralNetwork
|
|
120
|
+
from neuralnetworknumpy.layers import (
|
|
121
|
+
Embedding, PositionEmbedding, TransformerBlock, Dense, Softmax
|
|
122
|
+
)
|
|
123
|
+
from neuralnetworknumpy.tokenizer import Tokenizer
|
|
124
|
+
from neuralnetworknumpy.learning_rate import LinearWarmup, CosineDecay, SequentialLR
|
|
125
|
+
|
|
126
|
+
tokenizer = Tokenizer()
|
|
127
|
+
tokenizer.load("tokenizer.json")
|
|
128
|
+
vocab_size = len(tokenizer.vocab)
|
|
129
|
+
|
|
130
|
+
SEQ_LEN = 64
|
|
131
|
+
EMBED_DIM = 256
|
|
132
|
+
N_HEADS = 8
|
|
133
|
+
N_BLOCKS = 4
|
|
134
|
+
D_FFN = EMBED_DIM * 4
|
|
135
|
+
|
|
136
|
+
model = NeuralNetwork([
|
|
137
|
+
Embedding(vocab_size, EMBED_DIM),
|
|
138
|
+
PositionEmbedding(SEQ_LEN),
|
|
139
|
+
*[TransformerBlock(EMBED_DIM, N_HEADS, D_FFN) for _ in range(N_BLOCKS)],
|
|
140
|
+
Dense(vocab_size, EMBED_DIM),
|
|
141
|
+
Softmax()
|
|
142
|
+
])
|
|
143
|
+
|
|
144
|
+
total_steps = 40 * (500_000 // 64)
|
|
145
|
+
warmup_steps = 4000
|
|
146
|
+
|
|
147
|
+
schedule = SequentialLR(
|
|
148
|
+
schedules=[
|
|
149
|
+
LinearWarmup(warmup_steps=warmup_steps, max_lr=0.001),
|
|
150
|
+
CosineDecay(max_steps=total_steps - warmup_steps, base_lr=0.001),
|
|
151
|
+
],
|
|
152
|
+
boundaries=[warmup_steps]
|
|
153
|
+
)
|
|
154
|
+
|
|
155
|
+
model.compile(optimizer="adam", loss_type="cross_entropy", lr=schedule, task="language_model")
|
|
156
|
+
model.fit(X_train, y_train, X_val=X_val, y_val=y_val, epochs=40, batch_size=64)
|
|
157
|
+
|
|
158
|
+
# Generate text
|
|
159
|
+
print(model.generate(tokenizer.encode("the researchers discovered"), tokenizer, max_new_tokens=50, seq_len=SEQ_LEN))
|
|
160
|
+
```
|
|
161
|
+
|
|
162
|
+
### Seq2Seq with LSTM
|
|
163
|
+
|
|
164
|
+
```python
|
|
165
|
+
from neuralnetworknumpy import NeuralNetwork
|
|
166
|
+
from neuralnetworknumpy.layers import Seq2Seq
|
|
167
|
+
from neuralnetworknumpy.tokenizer import Tokenizer
|
|
168
|
+
|
|
169
|
+
tokenizer = Tokenizer()
|
|
170
|
+
tokenizer.load("tokenizer.json")
|
|
171
|
+
|
|
172
|
+
model = NeuralNetwork([
|
|
173
|
+
Seq2Seq(vocab_size=len(tokenizer.vocab), embed_dim=128, hidden_size=256, layer_type="LSTM")
|
|
174
|
+
])
|
|
175
|
+
|
|
176
|
+
model.compile(optimizer="adam", loss_type="cross_entropy", lr=0.001, task="language_model")
|
|
177
|
+
model.fit((X_src, X_trg), y, epochs=20, batch_size=64)
|
|
178
|
+
|
|
179
|
+
print(model.generate(tokenizer.encode("how are you"), tokenizer, max_new_tokens=30, mode="seq2seq"))
|
|
180
|
+
```
|
|
181
|
+
|
|
182
|
+
---
|
|
183
|
+
|
|
184
|
+
## API Reference
|
|
185
|
+
|
|
186
|
+
### `NeuralNetwork`
|
|
187
|
+
|
|
188
|
+
```python
|
|
189
|
+
model = NeuralNetwork(layers)
|
|
190
|
+
```
|
|
191
|
+
|
|
192
|
+
| Method | Description |
|
|
193
|
+
|---|---|
|
|
194
|
+
| `compile(loss_type, optimizer, lr, lambda_, task)` | Set training hyperparameters |
|
|
195
|
+
| `fit(X, y, X_val, y_val, epochs, batch_size)` | Train the model, returns `History` |
|
|
196
|
+
| `predict(X)` | Returns class label predictions |
|
|
197
|
+
| `predict_proba(X)` | Returns raw output activations |
|
|
198
|
+
| `evaluate(X, y)` | Returns accuracy |
|
|
199
|
+
| `save(path)` | Serialise model to `.pkl` |
|
|
200
|
+
| `NeuralNetwork.load(path)` | Load a saved model |
|
|
201
|
+
| `summary()` | Print layer descriptions and parameter counts |
|
|
202
|
+
| `generate(prompt_ids, tokenizer, ...)` | Autoregressive text generation |
|
|
203
|
+
| `check_gradient(X, y)` | Numerical gradient check for debugging |
|
|
204
|
+
|
|
205
|
+
**Optimizers:** `"adam"`, `"adamW"`, `"momentum"`, `"rmsprop"`, `"sgd"`
|
|
206
|
+
**Loss functions:** `"cross_entropy"`, `"mse"`
|
|
207
|
+
**Tasks:** `"classification"`, `"language_model"`
|
|
208
|
+
|
|
209
|
+
---
|
|
210
|
+
|
|
211
|
+
### Layers
|
|
212
|
+
|
|
213
|
+
#### Dense
|
|
214
|
+
|
|
215
|
+
| Layer | Constructor | Notes |
|
|
216
|
+
|---|---|---|
|
|
217
|
+
| `Dense` | `Dense(units)` | Fully connected |
|
|
218
|
+
| `BatchNorm` | `BatchNorm(momentum=0.9)` | For 1D feature vectors |
|
|
219
|
+
| `Dropout` | `Dropout(rate)` | Randomly drops units; applied during training only |
|
|
220
|
+
|
|
221
|
+
#### Activations
|
|
222
|
+
|
|
223
|
+
`ReLu()` · `Sigmoid()` · `Softmax()` · `Tanh()` · `Linear()`
|
|
224
|
+
|
|
225
|
+
#### Convolutional
|
|
226
|
+
|
|
227
|
+
All conv layers expect input shape `(batch, H, W, channels)`.
|
|
228
|
+
|
|
229
|
+
| Layer | Constructor | Notes |
|
|
230
|
+
|---|---|---|
|
|
231
|
+
| `Conv2D` | `Conv2D(filters, kernel_size, strides, padding)` | Standard 2D convolution |
|
|
232
|
+
| `GroupConv2D` | `GroupConv2D(filters, kernel_size, groups, strides, padding)` | `groups=1` → Conv2D, `groups=C_in` → depthwise |
|
|
233
|
+
| `DepthwiseConv2D` | `DepthwiseConv2D(kernel_size, strides, padding)` | One filter per input channel |
|
|
234
|
+
| `DepthwiseSeparableConv2D` | `DepthwiseSeparableConv2D(filters, kernel_size, strides, padding)` | Depthwise + pointwise |
|
|
235
|
+
| `SpatiallySeparableConv2D` | `SpatiallySeparableConv2D(filters, kernel_size)` | Row × column factored convolution |
|
|
236
|
+
| `BatchNorm2D` | `BatchNorm2D(momentum=0.9)` | Normalises over spatial + batch dims |
|
|
237
|
+
|
|
238
|
+
#### Pooling
|
|
239
|
+
|
|
240
|
+
| Layer | Constructor | Notes |
|
|
241
|
+
|---|---|---|
|
|
242
|
+
| `MaxPooling2D` | `MaxPooling2D(pool_size, strides, padding)` | Takes max in each window |
|
|
243
|
+
| `AveragePooling2D` | `AveragePooling2D(pool_size, strides, padding)` | Takes average in each window |
|
|
244
|
+
| `GlobalAveragePooling2D` | `GlobalAveragePooling2D()` | Collapses H×W → 1 per channel |
|
|
245
|
+
|
|
246
|
+
#### Structural
|
|
247
|
+
|
|
248
|
+
| Layer | Constructor | Notes |
|
|
249
|
+
|---|---|---|
|
|
250
|
+
| `Flatten` | `Flatten()` | `(m, H, W, C)` → `(m, H*W*C)` |
|
|
251
|
+
| `ResidualBlock` | `ResidualBlock(layers, projection=None)` | Skip connection |
|
|
252
|
+
|
|
253
|
+
#### Recurrent
|
|
254
|
+
|
|
255
|
+
| Layer | Constructor | Notes |
|
|
256
|
+
|---|---|---|
|
|
257
|
+
| `RNN` | `RNN(embed_dim, hidden_size)` | Simple recurrent, suffers from vanishing gradient |
|
|
258
|
+
| `GRU` | `GRU(embed_dim, hidden_size)` | Gated — update + reset gates |
|
|
259
|
+
| `LSTM` | `LSTM(embed_dim, hidden_size)` | Gated — forget, input, output, cell |
|
|
260
|
+
| `Seq2Seq` | `Seq2Seq(vocab_size, embed_dim, hidden_size, layer_type)` | Encoder-decoder with RNN/GRU/LSTM |
|
|
261
|
+
|
|
262
|
+
#### Transformer
|
|
263
|
+
|
|
264
|
+
| Layer | Constructor | Notes |
|
|
265
|
+
|---|---|---|
|
|
266
|
+
| `Embedding` | `Embedding(vocab_size, embed_dim)` | Token id → dense vector |
|
|
267
|
+
| `PositionEmbedding` | `PositionEmbedding(seq_len)` | Learnable positional encoding |
|
|
268
|
+
| `TransformerBlock` | `TransformerBlock(model_dim, n_heads, ffn_dim)` | Causal self-attention + FFN + LayerNorm |
|
|
269
|
+
| `MultiHeadAttention` | `MultiHeadAttention(embed_dim, heads_num)` | Multi-head scaled dot-product attention |
|
|
270
|
+
| `LayerNorm` | `LayerNorm(embed_dim)` | Normalises over feature dimension |
|
|
271
|
+
|
|
272
|
+
---
|
|
273
|
+
|
|
274
|
+
### Learning Rate Schedules
|
|
275
|
+
|
|
276
|
+
```python
|
|
277
|
+
from neuralnetworknumpy.learning_rate import (
|
|
278
|
+
LinearWarmup, CosineDecay, StepDecay, ExponentialDecay, SequentialLR
|
|
279
|
+
)
|
|
280
|
+
```
|
|
281
|
+
|
|
282
|
+
| Schedule | Constructor | Notes |
|
|
283
|
+
|---|---|---|
|
|
284
|
+
| `LinearWarmup` | `LinearWarmup(warmup_steps, max_lr)` | Ramps from 0 to `max_lr` |
|
|
285
|
+
| `CosineDecay` | `CosineDecay(max_steps, base_lr)` | Cosine annealing to 0 |
|
|
286
|
+
| `StepDecay` | `StepDecay(drop_rate, step_size, base_lr)` | Drops by factor every N steps |
|
|
287
|
+
| `ExponentialDecay` | `ExponentialDecay(drop_rate, base_lr)` | Continuous exponential decay |
|
|
288
|
+
| `SequentialLR` | `SequentialLR(schedules, boundaries)` | Chains schedules together |
|
|
289
|
+
|
|
290
|
+
**Warmup + cosine (recommended for transformers):**
|
|
291
|
+
|
|
292
|
+
```python
|
|
293
|
+
schedule = SequentialLR(
|
|
294
|
+
schedules=[
|
|
295
|
+
LinearWarmup(warmup_steps=4000, max_lr=0.001),
|
|
296
|
+
CosineDecay(max_steps=total_steps - 4000, base_lr=0.001),
|
|
297
|
+
],
|
|
298
|
+
boundaries=[4000]
|
|
299
|
+
)
|
|
300
|
+
model.compile(optimizer="adam", lr=schedule, ...)
|
|
301
|
+
```
|
|
302
|
+
|
|
303
|
+
---
|
|
304
|
+
|
|
305
|
+
### Masking
|
|
306
|
+
|
|
307
|
+
```python
|
|
308
|
+
from neuralnetworknumpy.masks import causal_mask, padding_mask, combined_mask
|
|
309
|
+
```
|
|
310
|
+
|
|
311
|
+
| Function | Description |
|
|
312
|
+
|---|---|
|
|
313
|
+
| `causal_mask(T)` | Upper-triangle mask — blocks future positions |
|
|
314
|
+
| `padding_mask(token_ids, pad_id)` | Blocks PAD tokens |
|
|
315
|
+
| `combined_mask(token_ids, pad_id)` | Causal + padding combined |
|
|
316
|
+
|
|
317
|
+
---
|
|
318
|
+
|
|
319
|
+
### Tokenizer
|
|
320
|
+
|
|
321
|
+
```python
|
|
322
|
+
from neuralnetworknumpy.tokenizer import Tokenizer
|
|
323
|
+
|
|
324
|
+
tokenizer = Tokenizer()
|
|
325
|
+
tokenizer.fit(text, vocab_size=8000) # train BPE tokenizer
|
|
326
|
+
tokenizer.save("tokenizer.json")
|
|
327
|
+
tokenizer.load("tokenizer.json")
|
|
328
|
+
|
|
329
|
+
ids = tokenizer.encode("hello world")
|
|
330
|
+
text = tokenizer.decode(ids)
|
|
331
|
+
```
|
|
332
|
+
|
|
333
|
+
**Download pretrained tokenizer (trained on WikiText-103):**
|
|
334
|
+
|
|
335
|
+
```python
|
|
336
|
+
from neuralnetworknumpy.tokenizer.pretrained import download_tokenizer
|
|
337
|
+
|
|
338
|
+
path = download_tokenizer() # downloads tokenizer.json if not present
|
|
339
|
+
tokenizer.load(path)
|
|
340
|
+
```
|
|
341
|
+
|
|
342
|
+
---
|
|
343
|
+
|
|
344
|
+
### Text Generation
|
|
345
|
+
|
|
346
|
+
```python
|
|
347
|
+
# Transformer LM
|
|
348
|
+
output = model.generate(
|
|
349
|
+
prompt_ids = tokenizer.encode("the scientists discovered"),
|
|
350
|
+
tokenizer = tokenizer,
|
|
351
|
+
max_new_tokens = 50,
|
|
352
|
+
temperature = 0.8,
|
|
353
|
+
seq_len = 64,
|
|
354
|
+
mode = "transformer",
|
|
355
|
+
top_k = 10
|
|
356
|
+
)
|
|
357
|
+
|
|
358
|
+
# Seq2Seq
|
|
359
|
+
output = model.generate(
|
|
360
|
+
prompt_ids = tokenizer.encode("how are you"),
|
|
361
|
+
tokenizer = tokenizer,
|
|
362
|
+
max_new_tokens = 30,
|
|
363
|
+
mode = "seq2seq"
|
|
364
|
+
)
|
|
365
|
+
```
|
|
366
|
+
|
|
367
|
+
---
|
|
368
|
+
|
|
369
|
+
### Utilities
|
|
370
|
+
|
|
371
|
+
```python
|
|
372
|
+
from neuralnetworknumpy.utils import History, Scaler, split_train_test, split_train_validation
|
|
373
|
+
```
|
|
374
|
+
|
|
375
|
+
| Utility | Description |
|
|
376
|
+
|---|---|
|
|
377
|
+
| `Scaler(mode)` | `"standard"` or `"minmax"` normalisation |
|
|
378
|
+
| `split_train_test(X, y, test_ratio)` | Random train/test split |
|
|
379
|
+
| `split_train_validation(X, y, val_ratio)` | Random train/val split |
|
|
380
|
+
| `History` | Returned by `model.fit()` — tracks loss and metrics per epoch |
|
|
381
|
+
|
|
382
|
+
---
|
|
383
|
+
|
|
384
|
+
## Save and Load
|
|
385
|
+
|
|
386
|
+
```python
|
|
387
|
+
model.save("my_model") # writes my_model.pkl
|
|
388
|
+
model2 = NeuralNetwork.load("my_model") # loads it back
|
|
389
|
+
print(model2.evaluate(X_test, y_test))
|
|
390
|
+
```
|
|
391
|
+
|
|
392
|
+
---
|
|
393
|
+
|
|
394
|
+
## Project Structure
|