neuralnetworknumpy-0.1.1-py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- NeuralNetworkFromScratch/__init__.py +56 -0
- NeuralNetworkFromScratch/layers.py +342 -0
- NeuralNetworkFromScratch/model.py +506 -0
- NeuralNetworkFromScratch/utils.py +87 -0
- neuralnetworknumpy-0.1.1.dist-info/METADATA +150 -0
- neuralnetworknumpy-0.1.1.dist-info/RECORD +8 -0
- neuralnetworknumpy-0.1.1.dist-info/WHEEL +5 -0
- neuralnetworknumpy-0.1.1.dist-info/top_level.txt +1 -0

NeuralNetworkFromScratch/__init__.py

@@ -0,0 +1,56 @@

"""
NeuralNetworkFromScratch

A minimal deep learning framework built using NumPy.
"""

__version__ = "0.1.0"

# Layers
from .layers import (
    Layer,
    Dense,
    Activation,
    ReLu,
    Sigmoid,
    Softmax,
    Linear,
    Tanh,
    BatchNorm,
    Dropout,
)

# Model
from .model import NeuralNetwork

# Utils
from .utils import (
    History,
    Scaler,
    split_train_test,
    split_train_validation,
)

__all__ = [
    # Core
    "NeuralNetwork",

    # Base
    "Layer",

    # Layers
    "Dense",
    "Activation",
    "ReLu",
    "Sigmoid",
    "Softmax",
    "Linear",
    "Tanh",
    "BatchNorm",
    "Dropout",

    # Utilities
    "History",
    "Scaler",
    "split_train_test",
    "split_train_validation",
]
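The package re-exports its whole public API at the root, so layers, the model class, and the utilities can all be imported from `NeuralNetworkFromScratch` directly. A minimal sketch, assuming the wheel is installed under that import name (note the `ReLu` spelling, and that `__version__` still reads `0.1.0` in this 0.1.1 wheel):

```python
# Quick check of the flat import surface defined in __init__.py.
import NeuralNetworkFromScratch as nnfs
from NeuralNetworkFromScratch import NeuralNetwork, Dense, ReLu, Softmax

print(nnfs.__version__)      # "0.1.0" (not bumped for this release)
print(sorted(nnfs.__all__))  # the exported names listed above
```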

NeuralNetworkFromScratch/layers.py

@@ -0,0 +1,342 @@

import numpy as np

class Layer:
    def __init__(self):
        # Trainable parameters (some layers won't use them)
        self.W = None
        self.b = None

        # Gradients
        self.dW = None
        self.db = None

        # Optimizer states
        self.vW = None
        self.vb = None
        self.mW = None
        self.mb = None

        # Forward pass values
        self.A = None
        self.A_prev = None  # Input to this layer

        # Backprop pass values
        self.Z = None

    # Forward and backward are abstract methods — override in subclasses
    def _forward(self, A_prev, training=None):
        raise NotImplementedError

    def _backward(self, dA):
        raise NotImplementedError

    def _update(self, lambda_, lr, beta1, beta2, _eps, optimizer, t):
        raise NotImplementedError


class Dense(Layer):
    def __init__(self, units: int, inputs: int = 0, kernel_initializer: str = None):
        super().__init__()

        self.units = units
        self.in_size = inputs
        self.out_size = units
        self.kernel_initializer = kernel_initializer

        if not kernel_initializer:
            self._set_default_initializers()
        else:
            # Use the explicitly requested initializer ("he" or "xavier")
            self.initializer = kernel_initializer

    def _set_default_initializers(self):
        self.initializer = "he"
        """if self.activation == "relu":
            self.initializer = "he"
        elif self.activation in ["sigmoid", "tanh", "softmax", "linear"]:
            self.initializer = "xavier"
        else:
            raise Exception("Invalid activation function")"""

    def build(self, input_size):
        self.in_size = input_size
        self._initialize_weights()

        self.b = np.zeros((self.out_size, 1), dtype=np.float32)

        # Optimizer state
        self.vW = np.zeros_like(self.W).astype(np.float32)
        self.vb = np.zeros_like(self.b).astype(np.float32)
        self.mW = np.zeros_like(self.W).astype(np.float32)
        self.mb = np.zeros_like(self.b).astype(np.float32)

    def _initialize_weights(self):

        if self.initializer == "he":
            # W ~ N(0, √(2/in))
            std = np.sqrt(2.0 / self.in_size)
            self.W = np.random.randn(self.out_size, self.in_size).astype(np.float32) * std

        elif self.initializer == "xavier":
            # U(-√(6/(in+out)), √(6/(in+out)))
            limit = np.sqrt(6.0 / (self.in_size + self.out_size))
            self.W = np.random.uniform(-limit, limit, (self.out_size, self.in_size)).astype(np.float32)

        else:
            self.W = np.random.randn(self.out_size, self.in_size).astype(np.float32) * 0.01

    def _forward(self, A_prev, training=None):
        if self.W is None:
            self.build(A_prev.shape[0])

        self.A_prev = A_prev
        self.Z = np.dot(self.W, A_prev) + self.b

        return self.Z

    def _backward(self, dA, skip_activation=False):

        # dW_i = dZ_i · A_{i-1}^T
        # Gradient of the loss w.r.t. weights of layer i
        self.dW = np.dot(dA, self.A_prev.T)

        # db_i = sum(dZ_i) over the batch
        # Gradient of the loss w.r.t. biases of layer i
        self.db = np.sum(dA, axis=1, keepdims=True)

        # Gradient to pass backward
        # dA_prev_i = W_i^T · dA_i
        return np.dot(self.W.T, dA)


    def _update(self, lambda_, lr, beta1, beta2, _eps, optimizer, t):

        if optimizer == "adamW":
            dw = self.dW  # pure gradient
        else:
            # Get the batch size from the last forward pass to scale regularization
            m = self.A_prev.shape[1]
            dw = self.dW + (lambda_ / m) * self.W  # L2 regularization

        if optimizer == "momentum":
            self.vW = beta1 * self.vW + dw
            self.vb = beta1 * self.vb + self.db

            update_w = self.vW
            update_b = self.vb

        elif optimizer == "adam" or optimizer == "adamW":

            self.mW = beta1 * self.mW + (1 - beta1) * dw
            self.mb = beta1 * self.mb + (1 - beta1) * self.db

            self.vW = beta2 * self.vW + (1 - beta2) * (dw ** 2)
            self.vb = beta2 * self.vb + (1 - beta2) * (self.db ** 2)

            m_w_hat = self.mW / (1 - beta1 ** t)
            m_b_hat = self.mb / (1 - beta1 ** t)

            v_w_hat = self.vW / (1 - beta2 ** t)
            v_b_hat = self.vb / (1 - beta2 ** t)

            update_w = m_w_hat / (np.sqrt(v_w_hat) + _eps)
            update_b = m_b_hat / (np.sqrt(v_b_hat) + _eps)

        elif optimizer == "rmsprop":
            self.vW = beta2 * self.vW + (1 - beta2) * (dw ** 2)
            self.vb = beta2 * self.vb + (1 - beta2) * (self.db ** 2)

            update_w = dw / (np.sqrt(self.vW) + _eps)
            update_b = self.db / (np.sqrt(self.vb) + _eps)

        else:
            # Classic SGD
            update_w = dw
            update_b = self.db


        # W = W - lr * (dW + λ * W)
        self.W -= lr * update_w
        self.b -= lr * update_b

        # Decoupled (adamW) weight decay
        if optimizer == "adamW":
            self.W *= (1 - lr * lambda_)  # Decoupled weight decay


class Activation(Layer):
    def __init__(self):
        super().__init__()

    def _update(self, *args, **kwargs):
        pass  # no parameters to update

class ReLu(Activation):

    def _forward(self, Z, training=None):
        self.Z = Z
        self.A = np.maximum(0, Z)
        return self.A

    def _backward(self, dA):
        return dA * (self.Z > 0)

class Sigmoid(Activation):

    def _forward(self, Z, training=None):
        self.Z = Z
        self.A = 1 / (1 + np.exp(-Z))
        return self.A

    def _backward(self, dA):
        return dA * self.A * (1 - self.A)

class Softmax(Activation):

    def _forward(self, Z, training=None):
        self.Z = Z
        shifted = Z - np.max(Z, axis=0, keepdims=True)
        exp_vals = np.exp(shifted)
        self.A = exp_vals / np.sum(exp_vals, axis=0, keepdims=True)
        return self.A

    def _backward(self, dA):
        s = np.sum(dA * self.A, axis=0, keepdims=True)
        return self.A * (dA - s)

class Linear(Activation):

    def _forward(self, Z, training=None):
        self.Z = Z
        self.A = Z
        return self.A

    def _backward(self, dA):
        return dA  # derivative is 1

class Tanh(Activation):

    def _forward(self, Z, training=None):
        self.Z = Z
        self.A = np.tanh(Z)
        return self.A

    def _backward(self, dA):
        return dA * (1 - self.A ** 2)


class BatchNorm(Layer):
    def __init__(self, momentum=0.9):
        super().__init__()
        self.momentum = momentum
        self._eps = 1e-08

        self.gamma = None
        self.beta = None

        self.running_mean = None
        self.running_var = None

    def build(self, input_size):
        self.gamma = np.ones((input_size, 1), dtype=np.float32)
        self.beta = np.zeros((input_size, 1), dtype=np.float32)

        self.running_mean = np.zeros((input_size, 1), dtype=np.float32)
        self.running_var = np.ones((input_size, 1), dtype=np.float32)

    def _forward(self, A_prev, training=True):
        if self.gamma is None:
            self.build(A_prev.shape[0])

        self.A_prev = A_prev

        if training:
            self.mean = np.mean(A_prev, axis=1, keepdims=True)
            self.var = np.var(A_prev, axis=1, keepdims=True)

            # X̂ = (X - μB) / √(σB^2 + ε)
            self.X_hat = (A_prev - self.mean) / np.sqrt(self.var + self._eps)
            # A = γ * X̂ + β
            self.A = self.gamma * self.X_hat + self.beta

            # Update running stats
            self.running_mean = (
                self.momentum * self.running_mean
                + (1 - self.momentum) * self.mean
            )

            self.running_var = (
                self.momentum * self.running_var
                + (1 - self.momentum) * self.var
            )

        else:
            self.X_hat = (A_prev - self.running_mean) / np.sqrt(self.running_var + self._eps)
            self.A = self.gamma * self.X_hat + self.beta

        return self.A

    def _backward(self, dA, skip_activation=False):

        m = dA.shape[1]

        dgamma = np.sum(dA * self.X_hat, axis=1, keepdims=True)
        dbeta = np.sum(dA, axis=1, keepdims=True)

        dX_hat = dA * self.gamma

        var_inv = 1. / np.sqrt(self.var + self._eps)

        dvar = np.sum(dX_hat * (self.A_prev - self.mean) * -0.5 * var_inv**3,
                      axis=1, keepdims=True)

        dmean = (
            np.sum(dX_hat * -var_inv, axis=1, keepdims=True)
            + dvar * np.mean(-2. * (self.A_prev - self.mean), axis=1, keepdims=True)
        )

        dX = (
            dX_hat * var_inv
            + dvar * 2 * (self.A_prev - self.mean) / m
            + dmean / m
        )

        self.dgamma = dgamma / m
        self.dbeta = dbeta / m

        return dX


    def _update(self, lambda_, lr, beta1, beta2, eps, optimizer, t):
        self.gamma -= lr * self.dgamma
        self.beta -= lr * self.dbeta

class Dropout(Layer):
    def __init__(self, rate):
        super().__init__()
        self.rate = rate  # probability of dropping a unit
        self.mask = None

    def _forward(self, A_prev, training=True):
        if not training:
            # No dropout during inference
            self.mask = np.ones_like(A_prev)
            self.A = A_prev
            return self.A

        # Create dropout mask
        self.mask = np.random.rand(A_prev.shape[0], A_prev.shape[1]) > self.rate
        # Apply mask AND scale (inverted dropout)
        self.A = (A_prev * self.mask) / (1 - self.rate)

        return self.A

    def _backward(self, dA, skip_activation=False):
        # Backprop only through active neurons
        dA_prev = (dA * self.mask) / (1 - self.rate)
        return dA_prev

    def _update(self, lambda_, lr, beta1, beta2, _eps, optimizer, t):
        # Dropout layer has no trainable parameters
        pass
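The `Dropout` layer above uses inverted dropout: units that survive the random mask are divided by `1 - rate`, so the expected activation scale is unchanged during training and nothing has to be rescaled at inference time. A small sketch that checks this numerically, assuming the wheel is installed and calling the layer's internal `_forward` directly, the same way the model does; data is laid out as `(features, samples)` to match the layers above:

```python
import numpy as np
from NeuralNetworkFromScratch import Dropout

np.random.seed(0)
A_prev = np.ones((100, 2000), dtype=np.float32)   # (features, samples)

drop = Dropout(rate=0.3)

# Training mode: ~30% of units are zeroed, survivors are scaled by 1/(1 - 0.3)
A_train = drop._forward(A_prev, training=True)
print(A_train.mean())                   # ≈ 1.0 on average, despite the dropped units

# Inference mode: the input passes through unchanged
A_eval = drop._forward(A_prev, training=False)
print(np.array_equal(A_eval, A_prev))   # True
```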

NeuralNetworkFromScratch/model.py

@@ -0,0 +1,506 @@

import numpy as np
from tqdm.auto import tqdm

from .layers import Dropout, Activation, BatchNorm, Dense
from .utils import History

class NeuralNetwork:
    def __init__(self, layers: list):

        self.num_classes = 0
        self.layers = layers

        self._eps = 1e-08  # Avoid dividing by zero
        self.lr = 0.001
        self.lambda_ = 0.0
        self.beta1 = 0.9
        self.beta2 = 0.999
        self.loss_type = "cross_entropy"
        self.optimizer = "adam"

    def save(self, path):
        layer_data = []

        for layer in self.layers:
            entry = {"type": type(layer).__name__}

            if isinstance(layer, Dense):
                entry.update({
                    "W": layer.W,
                    "b": layer.b,
                    "in_size": layer.in_size,
                    "out_size": layer.out_size,
                    "initializer": layer.initializer,
                })
            elif isinstance(layer, BatchNorm):
                entry.update({
                    "gamma": layer.gamma,
                    "beta": layer.beta,
                    "running_mean": layer.running_mean,
                    "running_var": layer.running_var,
                    "momentum": layer.momentum,
                })
            elif isinstance(layer, Dropout):
                entry["rate"] = layer.rate

            # Activation layers (ReLu, Sigmoid, etc.) need no extra data

            layer_data.append(entry)

        np.savez(
            path,
            layers=np.array(layer_data, dtype=object),
            lr=self.lr,
            lambda_=self.lambda_,
            beta1=self.beta1,
            beta2=self.beta2,
            loss_type=self.loss_type,
            optimizer=self.optimizer,
            num_classes=self.num_classes,
        )


    @staticmethod
    def load(path):
        from .layers import Dense, BatchNorm, Dropout, ReLu, Sigmoid, Softmax, Tanh, Linear

        ACTIVATION_MAP = {
            "ReLu": ReLu,
            "Sigmoid": Sigmoid,
            "Softmax": Softmax,
            "Tanh": Tanh,
            "Linear": Linear,
        }

        data = np.load(path, allow_pickle=True)
        layers = []

        for entry in data["layers"]:
            layer_type = entry["type"]

            if layer_type == "Dense":
                layer = Dense(units=entry["out_size"])
                layer.W = entry["W"]
                layer.b = entry["b"]
                layer.in_size = entry["in_size"]
                layer.out_size = entry["out_size"]
                layer.initializer = entry["initializer"]
                # Restore optimizer states as zeros (not serialized)
                layer.vW = np.zeros_like(layer.W)
                layer.vb = np.zeros_like(layer.b)
                layer.mW = np.zeros_like(layer.W)
                layer.mb = np.zeros_like(layer.b)

            elif layer_type == "BatchNorm":
                layer = BatchNorm(momentum=entry["momentum"])
                layer.gamma = entry["gamma"]
                layer.beta = entry["beta"]
                layer.running_mean = entry["running_mean"]
                layer.running_var = entry["running_var"]

            elif layer_type == "Dropout":
                layer = Dropout(rate=entry["rate"])

            elif layer_type in ACTIVATION_MAP:
                layer = ACTIVATION_MAP[layer_type]()

            else:
                raise ValueError(f"Unknown layer type: {layer_type}")

            layers.append(layer)

        model = NeuralNetwork(layers)
        model.lr = data["lr"].item()
        model.lambda_ = data["lambda_"].item()
        model.beta1 = data["beta1"].item()
        model.beta2 = data["beta2"].item()
        model.loss_type = data["loss_type"].item()
        model.optimizer = data["optimizer"].item()
        model.num_classes = data["num_classes"].item()

        return model


    def summary(self):
        print("=" * 55)
        print("Model Summary")
        print("=" * 55)

        total_params = 0

        for i, layer in enumerate(self.layers):
            layer_type = type(layer).__name__

            if isinstance(layer, Dense):
                params = layer.W.size + layer.b.size if layer.W is not None else 0
                total_params += params
                built = f"{layer.in_size} → {layer.out_size}"
                print(f"[{i+1}] Dense {built:<20} params: {params}")

            elif isinstance(layer, BatchNorm):
                params = layer.gamma.size + layer.beta.size if layer.gamma is not None else 0
                total_params += params
                print(f"[{i+1}] BatchNorm momentum={layer.momentum:<13} params: {params}")

            elif isinstance(layer, Dropout):
                print(f"[{i+1}] Dropout rate={layer.rate}")

            elif isinstance(layer, Activation):
                print(f"[{i+1}] {layer_type:<15}")

        print("-" * 55)

        print(f"Total trainable parameters: {total_params}")
        print("=" * 55)


    # One-hot encoding for y_true - converts integer labels into a matrix for calculations
    def _one_hot_encoding(self, y):
        one_hot_y = np.zeros((y.size, self.num_classes))
        one_hot_y[np.arange(y.size), y] = 1
        return one_hot_y.T

    """ **********************************************************
                            Model Algorithms
        ********************************************************** """

    # Forward function - feed input and get prediction
    def _forward(self, X, training=True):
        # Z = W * A + b
        # A - output (after activation function)
        # Also the next layer's input
        for layer in self.layers:
            X = layer._forward(X, training=training)
        return X


    # Backward function - propagates the loss gradient back through the layers
    def _backward(self, y_true):
        m = y_true.size

        # dA = ∂J/∂A_L, the derivative of the loss w.r.t. the network output
        dA = self._loss_derivative(self.layers[-1].A, y_true) / m
        dA = self.layers[-1]._backward(dA)

        # Remaining layers
        # Backpropagation: iterate layers in reverse order (from last to first)
        # Compute gradients dW and db for each layer
        for layer in reversed(self.layers[:-1]):
            dA = layer._backward(dA)


    # Loss derivative - for the last layer, based on the loss type
    def _loss_derivative(self, y_pred, y_true):
        one_hot = self._one_hot_encoding(y_true)  # y_true formatting
        # Division by m happens in the backward function
        if self.loss_type == "cross_entropy":
            # - y_true / y_pred
            return -(one_hot / (y_pred + self._eps))

        elif self.loss_type == "mse":
            # 2 * (y_pred - y_true)
            return 2 * (y_pred - one_hot)

        else:
            raise Exception("Invalid loss function")


    # Compute loss for logging
    def _compute_loss(self, y_pred, y_true):
        one_hot = self._one_hot_encoding(y_true)  # shape: (num_classes, N)
        m = y_true.size

        if self.loss_type == "cross_entropy":
            # -1/m * ∑ (y_true * log(y_pred))
            data_loss = -1 * np.sum(one_hot * np.log(y_pred + self._eps)) / m
        elif self.loss_type == "mse":
            # 1/m * ∑ ((y_pred - y_true)^2)
            data_loss = np.mean((y_pred - one_hot) ** 2)
        else:
            raise Exception("Invalid loss function")

        # L2 regularization term: (λ / 2m) * sum(||W||^2)
        reg_loss = 0.0
        for layer in self.layers:
            if not isinstance(layer, (Dropout, Activation, BatchNorm)):
                reg_loss += np.sum(layer.W ** 2)

        reg_loss = (self.lambda_ / (2 * m)) * reg_loss

        return data_loss + reg_loss


    # Update weights and biases
    # lr = learning rate
    #   high lr - impacts the model quickly, can overshoot
    #   low lr - learns slower, won't overshoot
    # Lambda λ = counters overfitting by penalizing big weights
    #   Forces weights to be small but not zero (w = 0 -> no impact on the model)
    # Beta1 = momentum factor
    # Beta2 = RMSprop factor
    def _update(self, optimizer_t):

        for layer in self.layers:
            layer._update(self.lambda_, self.lr, self.beta1, self.beta2, self._eps, self.optimizer, optimizer_t)

    """ **********************************************************
                               Metrics
        ********************************************************** """

    # Calculate model accuracy
    @staticmethod
    def accuracy(predictions, y):
        return np.sum(predictions == y) / y.size


    # Calculate model precision (macro-averaged over classes)
    @staticmethod
    def precision(predictions, y, num_classes):
        precisions = []

        for c in range(num_classes):
            tp = np.sum((predictions == c) & (y == c))
            fp = np.sum((predictions == c) & (y != c))

            precisions.append(tp / (tp + fp + 1e-8))

        return np.mean(precisions)


    # Calculate model recall (macro-averaged over classes)
    @staticmethod
    def recall(predictions, y, num_classes):
        recalls = []

        for c in range(num_classes):
            tp = np.sum((predictions == c) & (y == c))
            fn = np.sum((predictions != c) & (y == c))

            recalls.append(tp / (tp + fn + 1e-8))

        return np.mean(recalls)


    # Calculate model F1 score
    @staticmethod
    def f1(predictions, y, num_classes):
        precision = NeuralNetwork.precision(predictions, y, num_classes)
        recall = NeuralNetwork.recall(predictions, y, num_classes)
        return 2 * (precision * recall) / (precision + recall + 1e-8)


    def calc_metrics(self, history: History, y_pred, y_true, metrics=None):
        if metrics is None:
            metrics = []
        for metric in metrics:
            if metric == "accuracy":
                accuracy = NeuralNetwork.accuracy(y_pred, y_true)
                history.add("accuracy", accuracy)
            if metric == "precision":
                precision = NeuralNetwork.precision(y_pred, y_true, self.num_classes)
                history.add("precision", precision)
            if metric == "recall":
                recall = NeuralNetwork.recall(y_pred, y_true, self.num_classes)
                history.add("recall", recall)
            if metric == "f1":
                f1 = NeuralNetwork.f1(y_pred, y_true, self.num_classes)
                history.add("f1", f1)

        return history

    """ **********************************************************
                           Runtime functions
        ********************************************************** """

    # Converts final layer activation to predicted class labels
    @staticmethod
    def _decode_output(output):
        if output.shape[0] == 1:  # Binary classification
            return (output > 0.5).astype(int).flatten()
        else:  # Multi-class (softmax)
            return np.argmax(output, axis=0)


    @staticmethod
    def shuffle_data(x, y):
        perm = np.random.permutation(y.size)
        x = x[:, perm]
        y = y[perm]
        return x, y

    @staticmethod
    def set_seed(seed):
        np.random.seed(seed)


    def check_gradient(self, X, y):
        assert X.shape[1] == y.size, f"X has {X.shape[1]} samples but y has {y.size}"

        # Use a small batch to avoid numerical issues
        X = X[:, :8].astype(np.float64)  # <-- float64 is critical for the numerical gradient
        y = y[:8]

        rel_diff = []
        original_lambda = self.lambda_
        self.lambda_ = 0.0
        epsilon = 1e-5  # a smaller epsilon can help

        self._forward(X, training=False)
        self._backward(y)

        # Snapshot ALL analytical gradients before any weight perturbation
        analytical_grads = {}
        for idx, layer in enumerate(self.layers):
            if hasattr(layer, 'dW') and layer.dW is not None:
                analytical_grads[idx] = layer.dW.copy()  # <-- copy before perturbation

        for idx, layer in enumerate(self.layers):
            if hasattr(layer, 'W') and layer.W is not None:
                i = np.random.randint(0, layer.W.shape[0])
                j = np.random.randint(0, layer.W.shape[1])

                W_orig = layer.W[i, j]
                grad_analytical = analytical_grads[idx][i, j]

                layer.W[i, j] = W_orig + epsilon
                y_pred = self._forward(X, training=False)
                loss_plus = self._compute_loss(y_pred, y)

                layer.W[i, j] = W_orig - epsilon
                y_pred = self._forward(X, training=False)
                loss_minus = self._compute_loss(y_pred, y)

                grad_numerical = (loss_plus - loss_minus) / (2 * epsilon)
                layer.W[i, j] = W_orig

                numerator = abs(grad_numerical - grad_analytical)
                denominator = abs(grad_numerical) + abs(grad_analytical) + 1e-10
                rel_diff.append(numerator / denominator)

                print(f"Layer {idx} W[{i},{j}] Numerical: {grad_numerical:.10f} Analytical: {grad_analytical:.10f} Rel diff: {rel_diff[-1]:.2e}")

        self.lambda_ = original_lambda
        return rel_diff


    # Runs mini-batch gradient descent and trains the model
    # X - input features, shape (features, samples)
    # y - integer class labels
    # epochs - number of passes over the data
    # batch_size - number of samples processed before each model update
    def gradient_descent(self, X, y, X_val=None, y_val=None, epochs=10, batch_size=1):
        X = X.astype(np.float32)
        history = History()

        optimizer_t = 0

        stopping_patience = 5

        for ep in range(epochs):
            predictions = []
            epoch_loss = 0

            x_shuffled, y_shuffled = NeuralNetwork.shuffle_data(X, y)

            # Batches
            # tqdm - progress bar
            for i in tqdm(range(0, x_shuffled.shape[1], batch_size)):

                optimizer_t += 1

                # get batch
                x_batch = x_shuffled[:, i:i+batch_size]
                y_batch = y_shuffled[i:i+batch_size]
                # feed model
                self._forward(x_batch)
                self._backward(y_batch)
                self._update(optimizer_t)

                y_pred = self.layers[-1].A
                # Monitor loss - epoch_loss = weighted average of batch losses
                predictions.append(self._decode_output(y_pred))
                batch_loss = self._compute_loss(y_pred, y_batch)
                epoch_loss += batch_loss * x_batch.shape[1] / X.shape[1]

                # Check gradient - make sure backpropagation works well
                #self.check_gradient(x_batch, y_batch)

            predictions = np.concatenate(predictions)

            history.add("epoch", ep)
            history.add("loss", epoch_loss)
            history = self.calc_metrics(history, predictions, y_shuffled, metrics=["accuracy", "precision", "recall"])

            # Validation
            if X_val is not None and y_val is not None:
                val_pred = self.predict_proba(X_val)
                val_loss = self._compute_loss(val_pred, y_val)
                val_acc = NeuralNetwork.accuracy(self._decode_output(val_pred), y_val)
                history.add("val_loss", val_loss)
                history.add("val_accuracy", val_acc)

                history.progress()

                if len(history.history["val_loss"]) > 1:
                    if history.history["val_loss"][-1] > history.history["val_loss"][-2]:
                        stopping_patience -= 1
                        if stopping_patience == 0:
                            print("Early stopping")
                            break
            else:
                history.progress()

        return history


    """ **********************************************************
                          Model API functions
        ********************************************************** """

    # Configure training hyperparameters and optimization settings
    def compile(self, loss_type="cross_entropy", optimizer="adam", lr=0.001, lambda_=0.0, beta1=0.9, beta2=0.999):
        self.lr = lr
        self.lambda_ = lambda_
        self.beta1 = beta1
        self.beta2 = beta2
        self.loss_type = loss_type
        self.optimizer = optimizer


    # Train the model
    def fit(self, X, y, X_val=None, y_val=None, epochs=10, batch_size=1):
        if X.shape[1] != y.size:
            raise ValueError("Mismatch between samples and labels")

        # Find the last layer with an out_size - that is the number of classes
        for layer in reversed(self.layers):
            if hasattr(layer, "out_size"):
                self.num_classes = layer.out_size
                break
        return self.gradient_descent(X, y, X_val=X_val, y_val=y_val, epochs=epochs, batch_size=batch_size)


    # Add a layer to the model
    def add(self, layer):
        self.layers.append(layer)


    # Return predicted class labels for input data
    def predict(self, X):
        return self._decode_output(self.predict_proba(X))


    # Return raw output activations (probabilities or scores)
    def predict_proba(self, X):
        return self._forward(X, training=False)


    # Evaluate model performance (accuracy) on a given dataset
    def evaluate(self, X, y):
        predictions = self.predict(X)
        return NeuralNetwork.accuracy(predictions, y)
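As a usage sketch of the API above (random data, not a meaningful benchmark): inputs are expected in `(features, samples)` layout and labels as integer class indices, since `fit` checks `X.shape[1] == y.size` and `_one_hot_encoding` builds a `(num_classes, samples)` target matrix. The file name and layer sizes below are arbitrary, and the snippet assumes the wheel is installed:

```python
import numpy as np
from NeuralNetworkFromScratch import NeuralNetwork, Dense, ReLu, Softmax

NeuralNetwork.set_seed(42)

# Toy data: 20 features, 500 samples, 3 classes, laid out as (features, samples)
X = np.random.randn(20, 500).astype(np.float32)
y = np.random.randint(0, 3, size=500)

model = NeuralNetwork([
    Dense(16, inputs=20),
    ReLu(),
    Dense(3),
    Softmax(),
])

model.compile(loss_type="cross_entropy", optimizer="adam", lr=0.01)
history = model.fit(X, y, epochs=2, batch_size=32)

# Round-trip the weights through save/load and check predictions agree
model.save("toy_model.npz")
restored = NeuralNetwork.load("toy_model.npz")
print(np.array_equal(model.predict(X), restored.predict(X)))  # True
```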

NeuralNetworkFromScratch/utils.py

@@ -0,0 +1,87 @@

import numpy as np

class History:
    def __init__(self):
        self.history = {}

    def add(self, key, value):
        if key not in self.history:
            self.history[key] = []
        self.history[key].append(value)

    def progress(self):
        for key, value in self.history.items():
            print(f"{key}: {value[-1]}")



class Scaler:
    def __init__(self, mode="standard"):
        """
        mode: "standard" for Z-score (Mean=0, Std=1)
              "minmax" for range scaling (0 to 1)
        """
        self.mode = mode
        self.mean = None
        self.std = None
        self.min = None
        self.max = None

    def fit(self, X):
        """Calculates parameters from training data. X shape: (features, samples)"""
        if self.mode == "standard":
            self.mean = np.mean(X, axis=1, keepdims=True)
            self.std = np.std(X, axis=1, keepdims=True)
            self.std[self.std == 0] = 1e-8  # Avoid division by zero

        elif self.mode == "minmax":
            self.min = np.min(X, axis=1, keepdims=True)
            self.max = np.max(X, axis=1, keepdims=True)
            # Avoid division by zero if all values in a feature are the same
            self.diff = self.max - self.min
            self.diff[self.diff == 0] = 1e-8

    def transform(self, X):
        """Applies scaling to data using fitted parameters."""
        if self.mode == "standard":
            return (X - self.mean) / self.std
        elif self.mode == "minmax":
            return (X - self.min) / self.diff
        else:
            raise NotImplementedError

    def fit_transform(self, X):
        self.fit(X)
        return self.transform(X)

def split_train_test(X, y, test_ratio=0.2):
    m = X.shape[1]
    perm = np.random.permutation(m)

    X = X[:, perm]
    y = y[perm]

    test_size = int(m * test_ratio)

    X_test = X[:, :test_size]
    y_test = y[:test_size]

    X_train = X[:, test_size:]
    y_train = y[test_size:]

    return X_train, y_train, X_test, y_test


def split_train_validation(X, y, val_ratio=0.2):
    m = X.shape[0]
    perm = np.random.permutation(m)
    X = X[perm]
    y = y[perm]

    val_size = int(m * val_ratio)
    X_val = X[:val_size]
    y_val = y[:val_size]
    X_train = X[val_size:]
    y_train = y[val_size:]

    return X_train, y_train, X_val, y_val
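A short sketch of the helpers above, assuming the wheel is installed. Note that `Scaler` and `split_train_test` both index samples along axis 1, i.e. the `(features, samples)` layout used by the layers and the model, while `split_train_validation` indexes along axis 0 and therefore expects a `(samples, features)` layout:

```python
import numpy as np
from NeuralNetworkFromScratch import Scaler, split_train_test

np.random.seed(0)
X = np.random.rand(10, 200) * 50.0        # (features, samples)
y = np.random.randint(0, 2, size=200)

X_train, y_train, X_test, y_test = split_train_test(X, y, test_ratio=0.2)

# Fit the scaler on the training split only, then reuse it for the test split
scaler = Scaler(mode="standard")
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

print(X_train_s.mean(axis=1).round(3))    # ≈ 0 per feature
print(X_train_s.std(axis=1).round(3))     # ≈ 1 per feature
```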

neuralnetworknumpy-0.1.1.dist-info/METADATA

@@ -0,0 +1,150 @@

Metadata-Version: 2.4
Name: neuralnetworknumpy
Version: 0.1.1
Summary: A neural network framework built completely from scratch using NumPy
Author: Itamar Senderovitz
License: MIT
Project-URL: Homepage, https://github.com/Sendy45/NeuralNetworkFromScratch
Keywords: neural network,deep learning,machine learning,numpy
Classifier: Programming Language :: Python :: 3
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Classifier: License :: OSI Approved :: MIT License
Classifier: Operating System :: OS Independent
Requires-Python: >=3.8
Description-Content-Type: text/markdown
Requires-Dist: numpy>=1.21
Requires-Dist: tqdm>=4.60

# NeuralNetworkFromScratch

A lightweight Python library implementing a fully functional neural network **from scratch using NumPy**, without relying on machine learning frameworks such as TensorFlow or PyTorch.

The goal of this project is to provide a clear and educational implementation of neural networks, including forward propagation, backpropagation, normalization, and regularization techniques.

---

## Features

* Fully connected neural network implementation
* Modular layer system
* Forward and backward propagation
* Batch normalization
* Dropout regularization
* ReLU, Sigmoid, Tanh, Linear, and Softmax activation functions
* Dataset scaling utilities
* Train / validation split helpers

---

## Installation

Install from PyPI:

```bash
pip install neuralnetworknumpy
```

Or install from source:

```bash
git clone https://github.com/Sendy45/NeuralNetworkFromScratch.git
cd NeuralNetworkFromScratch
pip install .
```

---

## Example Usage

```python
import numpy as np
from keras.datasets import mnist

from NeuralNetworkFromScratch import (
    NeuralNetwork,
    Dense,
    ReLu,
    BatchNorm,
    Dropout,
    Softmax
)

# load dataset
(X_train, y_train), _ = mnist.load_data()

# flatten images and lay them out as (features, samples)
X_train = X_train.reshape(-1, 784).T / 255.0

model = NeuralNetwork([
    Dense(64, inputs=784),
    ReLu(),
    BatchNorm(),
    Dropout(0.1),
    Dense(10),
    Softmax()
])

model.compile(
    optimizer="adam",
    loss_type="cross_entropy"
)

model.fit(X_train, y_train, epochs=10, batch_size=32)
```

---

## Project Structure

```
NeuralNetworkFromScratch
│
├── NeuralNetworkFromScratch
│   ├── __init__.py
│   ├── layers.py
│   ├── model.py
│   └── utils.py
│
├── tests
├── README.md
└── pyproject.toml
```

---

## Goals of the Project

This project was designed to:

* Demonstrate **how neural networks work internally**
* Provide a **clean NumPy-based implementation**
* Serve as an **educational resource for learning deep learning fundamentals**

Unlike production ML frameworks, this project prioritizes **clarity and learning over performance**.

---

## Dependencies

* numpy
* tqdm

Optional dependencies used in examples:

* matplotlib
* keras (for datasets such as MNIST)

---

## License

This project is licensed under the MIT License.

---

## Author

Created by **Itamar Senderovitz**.
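Following on from the README's usage example above, a hedged sketch of inference and evaluation on the held-out MNIST test split. It assumes the `model` trained in that example is still in scope and that keras is available for the dataset:

```python
from keras.datasets import mnist

# Continues the README example: score the trained model on the MNIST test split.
(_, _), (X_test, y_test) = mnist.load_data()
X_test = X_test.reshape(-1, 784).T / 255.0   # (features, samples), matching training

test_accuracy = model.evaluate(X_test, y_test)
print(f"Test accuracy: {test_accuracy:.3f}")

# Per-sample class predictions and probabilities for a few samples
labels = model.predict(X_test[:, :5])
probs = model.predict_proba(X_test[:, :5])   # shape (10, 5), columns sum to 1
```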

neuralnetworknumpy-0.1.1.dist-info/RECORD

@@ -0,0 +1,8 @@

NeuralNetworkFromScratch/__init__.py,sha256=fDI9TXts-_WKxQLUfT_S99sU2bP3OBJOHssFq-jODTU,787
NeuralNetworkFromScratch/layers.py,sha256=QUJACrS62u_oyAX1xfo_92OGWnAxemXthJ-4vRFQBFU,9462
NeuralNetworkFromScratch/model.py,sha256=uTASLPdXtIFAaXbWGLWul9vHdX0Q8XpwJsxI_S5d0T0,17343
NeuralNetworkFromScratch/utils.py,sha256=ck9yNw3dymXqGEvXyaexMHaK2lqLF5M34_EkqIrR4n0,2372
neuralnetworknumpy-0.1.1.dist-info/METADATA,sha256=UGeqACpBhBVE41_KavxzisVy_6hRtmcoobPy_ESO5vc,3275
neuralnetworknumpy-0.1.1.dist-info/WHEEL,sha256=aeYiig01lYGDzBgS8HxWXOg3uV61G9ijOsup-k9o1sk,91
neuralnetworknumpy-0.1.1.dist-info/top_level.txt,sha256=lUKcwPPsuJ-pqHQfY-ukbA3K1X9wY1vzgjptuBVw8PI,25
neuralnetworknumpy-0.1.1.dist-info/RECORD,,

neuralnetworknumpy-0.1.1.dist-info/top_level.txt

@@ -0,0 +1 @@

NeuralNetworkFromScratch