neuralnetworknumpy 0.1.1__tar.gz
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- neuralnetworknumpy-0.1.1/NeuralNetworkFromScratch/__init__.py +56 -0
- neuralnetworknumpy-0.1.1/NeuralNetworkFromScratch/layers.py +342 -0
- neuralnetworknumpy-0.1.1/NeuralNetworkFromScratch/model.py +506 -0
- neuralnetworknumpy-0.1.1/NeuralNetworkFromScratch/utils.py +87 -0
- neuralnetworknumpy-0.1.1/PKG-INFO +150 -0
- neuralnetworknumpy-0.1.1/README.md +131 -0
- neuralnetworknumpy-0.1.1/neuralnetworknumpy.egg-info/PKG-INFO +150 -0
- neuralnetworknumpy-0.1.1/neuralnetworknumpy.egg-info/SOURCES.txt +13 -0
- neuralnetworknumpy-0.1.1/neuralnetworknumpy.egg-info/dependency_links.txt +1 -0
- neuralnetworknumpy-0.1.1/neuralnetworknumpy.egg-info/requires.txt +2 -0
- neuralnetworknumpy-0.1.1/neuralnetworknumpy.egg-info/top_level.txt +1 -0
- neuralnetworknumpy-0.1.1/pyproject.toml +34 -0
- neuralnetworknumpy-0.1.1/setup.cfg +4 -0
- neuralnetworknumpy-0.1.1/tests/test_load.py +66 -0
- neuralnetworknumpy-0.1.1/tests/test_model.py +66 -0
neuralnetworknumpy-0.1.1/NeuralNetworkFromScratch/__init__.py
@@ -0,0 +1,56 @@
"""
NeuralNetworkFromScratch

A minimal deep learning framework built using NumPy.
"""

__version__ = "0.1.0"

# Layers
from .layers import (
    Layer,
    Dense,
    Activation,
    ReLu,
    Sigmoid,
    Softmax,
    Linear,
    Tanh,
    BatchNorm,
    Dropout,
)

# Model
from .model import NeuralNetwork

# Utils
from .utils import (
    History,
    Scaler,
    split_train_test,
    split_train_validation,
)

__all__ = [
    # Core
    "NeuralNetwork",

    # Base
    "Layer",

    # Layers
    "Dense",
    "Activation",
    "ReLu",
    "Sigmoid",
    "Softmax",
    "Linear",
    "Tanh",
    "BatchNorm",
    "Dropout",

    # Utilities
    "History",
    "Scaler",
    "split_train_test",
    "split_train_validation",
]
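Everything listed in `__all__` is re-exported at the package top level, so user code imports layers and utilities directly from `NeuralNetworkFromScratch` rather than from the submodules. A minimal sketch, assuming the sdist above has been installed (presumably as the `neuralnetworknumpy` distribution):

```python
# Illustration only: names re-exported by __init__.py.
from NeuralNetworkFromScratch import (
    NeuralNetwork,
    Dense, ReLu, Softmax, BatchNorm, Dropout,
    Scaler, History, split_train_test,
)

print(NeuralNetwork, Dense, split_train_test)  # all resolved from the top-level package
```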
neuralnetworknumpy-0.1.1/NeuralNetworkFromScratch/layers.py
@@ -0,0 +1,342 @@
import numpy as np


class Layer:
    def __init__(self):
        # Trainable parameters (some layers won't use them)
        self.W = None
        self.b = None

        # Gradients
        self.dW = None
        self.db = None

        # Optimizer states
        self.vW = None
        self.vb = None
        self.mW = None
        self.mb = None

        # Forward pass values
        self.A = None
        self.A_prev = None  # Input to this layer

        # Backprop pass values
        self.Z = None

    # Forward and backward are abstract methods; override in subclasses
    def _forward(self, A_prev, training=None):
        raise NotImplementedError

    def _backward(self, dA):
        raise NotImplementedError

    def _update(self, lambda_, lr, beta1, beta2, _eps, optimizer, t):
        raise NotImplementedError

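The base class fixes the contract every layer follows: `_forward` caches what it needs and returns its output, `_backward` takes the upstream gradient `dA` and returns the gradient with respect to the layer's input, and `_update` applies an optimizer step (a no-op for parameter-free layers). As a hypothetical illustration, not part of the package, a custom layer only has to fill in those three methods:

```python
import numpy as np
from NeuralNetworkFromScratch import Layer

class Scale(Layer):
    """Hypothetical parameter-free layer: multiplies its input by a constant."""

    def __init__(self, factor=2.0):
        super().__init__()
        self.factor = factor

    def _forward(self, A_prev, training=None):
        self.A_prev = A_prev           # cache the input, as the built-in layers do
        self.A = self.factor * A_prev
        return self.A

    def _backward(self, dA):
        return self.factor * dA        # d(factor * x)/dx = factor

    def _update(self, lambda_, lr, beta1, beta2, _eps, optimizer, t):
        pass                           # no trainable parameters

layer = Scale(3.0)
X = np.random.randn(4, 8)              # (features, batch), matching the library's layout
print(layer._forward(X).shape)          # (4, 8)
```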
class Dense(Layer):
    def __init__(self, units: int, inputs: int = 0, kernel_initializer: str = None):
        super().__init__()

        self.units = units
        self.in_size = inputs
        self.out_size = units
        self.kernel_initializer = kernel_initializer

        if not kernel_initializer:
            self._set_default_initializers()

    def _set_default_initializers(self):
        self.initializer = "he"
        """if self.activation == "relu":
            self.initializer = "he"
        elif self.activation in ["sigmoid", "tanh", "softmax", "linear"]:
            self.initializer = "xavier"
        else:
            raise Exception("Invalid activation function")"""

    def build(self, input_size):
        self.in_size = input_size
        self._initialize_weights()

        self.b = np.zeros((self.out_size, 1), dtype=np.float32)

        # Optimizer state
        self.vW = np.zeros_like(self.W).astype(np.float32)
        self.vb = np.zeros_like(self.b).astype(np.float32)
        self.mW = np.zeros_like(self.W).astype(np.float32)
        self.mb = np.zeros_like(self.b).astype(np.float32)

    def _initialize_weights(self):
        if self.initializer == "he":
            # W ~ N(0, √(2/in))
            std = np.sqrt(2.0 / self.in_size)
            self.W = np.random.randn(self.out_size, self.in_size).astype(np.float32) * std

        elif self.initializer == "xavier":
            # U(-√(6/(in+out)), √(6/(in+out)))
            limit = np.sqrt(6.0 / (self.in_size + self.out_size))
            self.W = np.random.uniform(-limit, limit, (self.out_size, self.in_size)).astype(np.float32)

        else:
            self.W = np.random.randn(self.out_size, self.in_size).astype(np.float32) * 0.01

    def _forward(self, A_prev, training=None):
        if self.W is None:
            self.build(A_prev.shape[0])

        self.A_prev = A_prev
        self.Z = np.dot(self.W, A_prev) + self.b

        return self.Z

    def _backward(self, dA, skip_activation=False):
        # dW_i = dZ_i · A_{i-1}^T
        # Gradient of the loss w.r.t. weights of layer i
        self.dW = np.dot(dA, self.A_prev.T)

        # db_i = sum(dZ_i) over the batch
        # Gradient of the loss w.r.t. biases of layer i
        self.db = np.sum(dA, axis=1, keepdims=True)

        # Gradient to pass backward
        # dA_prev = W_i^T · dZ_i
        return np.dot(self.W.T, dA)

    def _update(self, lambda_, lr, beta1, beta2, _eps, optimizer, t):
        if optimizer == "adamW":
            dw = self.dW  # pure gradient
        else:
            # Get batch size from the last dZ calculation to scale regularization
            m = self.A.shape[1]
            dw = self.dW + (lambda_ / m) * self.W  # L2 regularization

        if optimizer == "momentum":
            self.vW = beta1 * self.vW + dw
            self.vb = beta1 * self.vb + self.db

            update_w = self.vW
            update_b = self.vb

        elif optimizer == "adam" or optimizer == "adamW":
            self.mW = beta1 * self.mW + (1 - beta1) * dw
            self.mb = beta1 * self.mb + (1 - beta1) * self.db

            self.vW = beta2 * self.vW + (1 - beta2) * (dw ** 2)
            self.vb = beta2 * self.vb + (1 - beta2) * (self.db ** 2)

            m_w_hat = self.mW / (1 - beta1 ** t)
            m_b_hat = self.mb / (1 - beta1 ** t)

            v_w_hat = self.vW / (1 - beta2 ** t)
            v_b_hat = self.vb / (1 - beta2 ** t)

            update_w = m_w_hat / (np.sqrt(v_w_hat) + _eps)
            update_b = m_b_hat / (np.sqrt(v_b_hat) + _eps)

        elif optimizer == "rmsprop":
            self.vW = beta2 * self.vW + (1 - beta2) * (dw ** 2)
            self.vb = beta2 * self.vb + (1 - beta2) * (self.db ** 2)

            update_w = dw / (np.sqrt(self.vW) + _eps)
            update_b = self.db / (np.sqrt(self.vb) + _eps)

        else:
            # Classic SGD
            update_w = dw
            update_b = self.db

        # W = W - lr * update (the L2 term is already folded into dw when applicable)
        self.W -= lr * update_w
        self.b -= lr * update_b

        # Decoupled (AdamW) weight decay
        if optimizer == "adamW":
            self.W *= (1 - lr * lambda_)  # Decoupled weight decay

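The backward pass computes `dW = dZ · A_prevᵀ` and `db = Σ dZ` over the batch, and returns `Wᵀ · dZ` for the layer below. A quick finite-difference check, offered only as a sketch (it calls the underscored internal methods directly and uses a toy loss `L = ½‖Z‖²` so that `dL/dZ = Z`), shows the analytic gradient agreeing with a numerical one:

```python
import numpy as np
from NeuralNetworkFromScratch import Dense

np.random.seed(0)
layer = Dense(units=3)
X = np.random.randn(4, 5).astype(np.float32)      # 4 input features, batch of 5

Z = layer._forward(X)                              # lazily builds W (3, 4) and b (3, 1)
layer._backward(Z)                                 # upstream gradient dL/dZ = Z for L = 0.5 * ||Z||^2

# Perturb one weight and recompute the toy loss numerically
i, j, eps = 0, 0, 1e-3
W0 = layer.W[i, j]
layer.W[i, j] = W0 + eps
loss_plus = 0.5 * np.sum(layer._forward(X) ** 2)
layer.W[i, j] = W0 - eps
loss_minus = 0.5 * np.sum(layer._forward(X) ** 2)
layer.W[i, j] = W0

numeric = (loss_plus - loss_minus) / (2 * eps)
print(numeric, layer.dW[i, j])                     # should agree closely (float32 noise aside)
```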
class Activation(Layer):
    def __init__(self):
        super().__init__()

    def _update(self, *args, **kwargs):
        pass  # no parameters to update


class ReLu(Activation):

    def _forward(self, Z, training=None):
        self.Z = Z
        self.A = np.maximum(0, Z)
        return self.A

    def _backward(self, dA):
        return dA * (self.Z > 0)


class Sigmoid(Activation):

    def _forward(self, Z, training=None):
        self.Z = Z
        self.A = 1 / (1 + np.exp(-Z))
        return self.A

    def _backward(self, dA):
        return dA * self.A * (1 - self.A)


class Softmax(Activation):

    def _forward(self, Z, training=None):
        self.Z = Z
        shifted = Z - np.max(Z, axis=0, keepdims=True)
        exp_vals = np.exp(shifted)
        self.A = exp_vals / np.sum(exp_vals, axis=0, keepdims=True)
        return self.A

    def _backward(self, dA):
        s = np.sum(dA * self.A, axis=0, keepdims=True)
        return self.A * (dA - s)


class Linear(Activation):

    def _forward(self, Z, training=None):
        self.Z = Z
        self.A = Z
        return self.A

    def _backward(self, dA):
        return dA  # derivative is 1


class Tanh(Activation):

    def _forward(self, Z, training=None):
        self.Z = Z
        self.A = np.tanh(Z)
        return self.A

    def _backward(self, dA):
        return dA * (1 - self.A ** 2)

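All activations receive the pre-activation `Z` column-wise (one column per sample). The `Softmax._backward` rule `A * (dA - Σ(dA · A))` is the vectorized form of multiplying the upstream gradient by the softmax Jacobian `diag(a) - a aᵀ`, applied column by column. A small pure-NumPy sketch confirms the two agree for a single column:

```python
import numpy as np

np.random.seed(0)
Z = np.random.randn(4, 1)                                   # logits for one sample
shifted = Z - np.max(Z, axis=0, keepdims=True)              # same shift-for-stability trick
A = np.exp(shifted) / np.sum(np.exp(shifted), axis=0, keepdims=True)

dA = np.random.randn(4, 1)                                  # arbitrary upstream gradient dL/dA

# Vectorized rule used by Softmax._backward
dZ_vec = A * (dA - np.sum(dA * A, axis=0, keepdims=True))

# Explicit Jacobian for this column: J = diag(a) - a a^T (symmetric)
a = A[:, 0]
J = np.diag(a) - np.outer(a, a)
dZ_jac = (J @ dA[:, 0]).reshape(-1, 1)

print(np.allclose(dZ_vec, dZ_jac))                          # True
```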
class BatchNorm(Layer):
    def __init__(self, momentum=0.9):
        super().__init__()
        self.momentum = momentum
        self._eps = 1e-08

        self.gamma = None
        self.beta = None

        self.running_mean = None
        self.running_var = None

    def build(self, input_size):
        self.gamma = np.ones((input_size, 1), dtype=np.float32)
        self.beta = np.zeros((input_size, 1), dtype=np.float32)

        self.running_mean = np.zeros((input_size, 1), dtype=np.float32)
        self.running_var = np.ones((input_size, 1), dtype=np.float32)

    def _forward(self, A_prev, training=True):
        if self.gamma is None:
            self.build(A_prev.shape[0])

        self.A_prev = A_prev

        if training:
            self.mean = np.mean(A_prev, axis=1, keepdims=True)
            self.var = np.var(A_prev, axis=1, keepdims=True)

            # X̂ = (X - μ_B) / √(σ_B² + ε)
            self.X_hat = (A_prev - self.mean) / np.sqrt(self.var + self._eps)
            # A = γ * X̂ + β
            self.A = self.gamma * self.X_hat + self.beta

            # Update running stats
            self.running_mean = (
                self.momentum * self.running_mean
                + (1 - self.momentum) * self.mean
            )

            self.running_var = (
                self.momentum * self.running_var
                + (1 - self.momentum) * self.var
            )

        else:
            self.X_hat = (A_prev - self.running_mean) / np.sqrt(self.running_var + self._eps)
            self.A = self.gamma * self.X_hat + self.beta

        return self.A

    def _backward(self, dA, skip_activation=False):
        m = dA.shape[1]

        dgamma = np.sum(dA * self.X_hat, axis=1, keepdims=True)
        dbeta = np.sum(dA, axis=1, keepdims=True)

        dX_hat = dA * self.gamma

        var_inv = 1. / np.sqrt(self.var + self._eps)

        dvar = np.sum(dX_hat * (self.A_prev - self.mean) * -0.5 * var_inv**3,
                      axis=1, keepdims=True)

        dmean = (
            np.sum(dX_hat * -var_inv, axis=1, keepdims=True)
            + dvar * np.mean(-2. * (self.A_prev - self.mean), axis=1, keepdims=True)
        )

        dX = (
            dX_hat * var_inv
            + dvar * 2 * (self.A_prev - self.mean) / m
            + dmean / m
        )

        self.dgamma = dgamma / m
        self.dbeta = dbeta / m

        return dX

    def _update(self, lambda_, lr, beta1, beta2, eps, optimizer, t):
        self.gamma -= lr * self.dgamma
        self.beta -= lr * self.dbeta

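In training mode each feature (row) is normalized over the batch dimension with the batch statistics and then scaled and shifted by the learned `γ` and `β`; at inference the exponentially averaged running statistics are used instead. A quick illustration, again only a sketch that calls `_forward` directly: with the freshly initialized `γ = 1`, `β = 0`, the training-mode output should have roughly zero mean and unit variance per feature.

```python
import numpy as np
from NeuralNetworkFromScratch import BatchNorm

np.random.seed(0)
bn = BatchNorm(momentum=0.9)
X = (np.random.randn(3, 256) * 5.0 + 2.0).astype(np.float32)  # 3 features, 256 samples

out = bn._forward(X, training=True)
print(out.mean(axis=1))   # ≈ [0, 0, 0]
print(out.var(axis=1))    # ≈ [1, 1, 1]

# Inference mode normalizes with the running mean/variance accumulated so far
out_eval = bn._forward(X, training=False)
print(out_eval.shape)     # (3, 256)
```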
class Dropout(Layer):
    def __init__(self, rate):
        super().__init__()
        self.rate = rate  # probability of dropping a unit
        self.mask = None

    def _forward(self, A_prev, training=True):
        if not training:
            # No dropout during inference
            self.mask = np.ones_like(A_prev)
            self.A = A_prev
            return self.A

        # Create dropout mask
        self.mask = np.random.rand(A_prev.shape[0], A_prev.shape[1]) > self.rate
        # Apply mask AND scale (inverted dropout)
        self.A = (A_prev * self.mask) / (1 - self.rate)

        return self.A

    def _backward(self, dA, skip_activation=False):
        # Backprop only through active neurons
        dA_prev = (dA * self.mask) / (1 - self.rate)
        return dA_prev

    def _update(self, lambda_, lr, beta1, beta2, _eps, optimizer, t):
        # Dropout layer has no trainable parameters
        pass
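Because the mask is applied with the inverted-dropout scaling `1 / (1 - rate)`, the expected activation magnitude is preserved during training and no extra rescaling is needed at inference. A small sketch (calling `_forward` directly on a constant input) makes this visible:

```python
import numpy as np
from NeuralNetworkFromScratch import Dropout

np.random.seed(0)
drop = Dropout(rate=0.3)
X = np.ones((10, 10000), dtype=np.float32)

out = drop._forward(X, training=True)
print(round(float(out.mean()), 3))               # ≈ 1.0: survivors are scaled by 1 / (1 - rate)
print(round(float((out == 0).mean()), 3))        # ≈ 0.3: fraction of units dropped
print(drop._forward(X, training=False).mean())   # exactly 1.0: dropout disabled at inference
```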