neuralnetworknumpy-0.1.1.tar.gz

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,56 @@
+ """
+ NeuralNetworkFromScratch
+ A minimal deep learning framework built using NumPy.
+ """
+
+ __version__ = "0.1.1"
+
+ # Layers
+ from .layers import (
+     Layer,
+     Dense,
+     Activation,
+     ReLu,
+     Sigmoid,
+     Softmax,
+     Linear,
+     Tanh,
+     BatchNorm,
+     Dropout,
+ )
+
+ # Model
+ from .model import NeuralNetwork
+
+ # Utils
+ from .utils import (
+     History,
+     Scaler,
+     split_train_test,
+     split_train_validation,
+ )
+
+ __all__ = [
+     # Core
+     "NeuralNetwork",
+
+     # Base
+     "Layer",
+
+     # Layers
+     "Dense",
+     "Activation",
+     "ReLu",
+     "Sigmoid",
+     "Softmax",
+     "Linear",
+     "Tanh",
+     "BatchNorm",
+     "Dropout",
+
+     # Utilities
+     "History",
+     "Scaler",
+     "split_train_test",
+     "split_train_validation",
+ ]
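
For orientation, a minimal usage sketch of the surface exported above. This is a sketch only: it assumes the distribution installs under the top-level name `neuralnetworknumpy`, and the `NeuralNetwork` training API lives in model.py, which is not part of this diff, so only layer construction is shown.

    from neuralnetworknumpy import Dense, ReLu, Dropout

    # Layers infer their input width on the first forward pass,
    # so only the output width is required up front.
    hidden = Dense(units=16)
    activation = ReLu()
    regularizer = Dropout(rate=0.2)
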
@@ -0,0 +1,342 @@
+ import numpy as np
+
+
+ class Layer:
+     def __init__(self):
+         # Trainable parameters (some layers won't use them)
+         self.W = None
+         self.b = None
+
+         # Gradients
+         self.dW = None
+         self.db = None
+
+         # Optimizer states
+         self.vW = None
+         self.vb = None
+         self.mW = None
+         self.mb = None
+
+         # Forward pass values
+         self.A = None
+         self.A_prev = None  # Input to this layer
+
+         # Cached pre-activation (set in forward, reused in backward)
+         self.Z = None
+
+     # Forward, backward and update are abstract; override in subclasses
+     def _forward(self, A_prev, training=None):
+         raise NotImplementedError
+
+     def _backward(self, dA):
+         raise NotImplementedError
+
+     def _update(self, lambda_, lr, beta1, beta2, _eps, optimizer, t):
+         raise NotImplementedError
+
+
+ class Dense(Layer):
+     def __init__(self, units: int, inputs: int = 0, kernel_initializer: str = None):
+         super().__init__()
+
+         self.units = units
+         self.in_size = inputs
+         self.out_size = units
+         self.kernel_initializer = kernel_initializer
+
+         # Use the requested initializer if given; otherwise pick a default
+         if kernel_initializer:
+             self.initializer = kernel_initializer
+         else:
+             self._set_default_initializers()
+
+     def _set_default_initializers(self):
+         # Default to He initialization. Activation-aware defaults (He for
+         # ReLU, Xavier for sigmoid/tanh/softmax/linear) would require the
+         # layer to know its activation, which it currently doesn't.
+         self.initializer = "he"
+
+     def build(self, input_size):
+         self.in_size = input_size
+         self._initialize_weights()
+
+         self.b = np.zeros((self.out_size, 1), dtype=np.float32)
+
+         # Optimizer state
+         self.vW = np.zeros_like(self.W).astype(np.float32)
+         self.vb = np.zeros_like(self.b).astype(np.float32)
+         self.mW = np.zeros_like(self.W).astype(np.float32)
+         self.mb = np.zeros_like(self.b).astype(np.float32)
+
+     def _initialize_weights(self):
+         if self.initializer == "he":
+             # He: W ~ N(0, 2/in), i.e. std = √(2/in)
+             std = np.sqrt(2.0 / self.in_size)
+             self.W = np.random.randn(self.out_size, self.in_size).astype(np.float32) * std
+
+         elif self.initializer == "xavier":
+             # Xavier: W ~ U(-√(6/(in+out)), √(6/(in+out)))
+             limit = np.sqrt(6.0 / (self.in_size + self.out_size))
+             self.W = np.random.uniform(-limit, limit, (self.out_size, self.in_size)).astype(np.float32)
+
+         else:
+             # Fallback: small random normal weights
+             self.W = np.random.randn(self.out_size, self.in_size).astype(np.float32) * 0.01
+
+     def _forward(self, A_prev, training=None):
+         # Lazily build parameters from the input width on first use
+         if self.W is None:
+             self.build(A_prev.shape[0])
+
+         self.A_prev = A_prev
+         self.Z = np.dot(self.W, A_prev) + self.b
+
+         return self.Z
+
+     def _backward(self, dA, skip_activation=False):
+         # dW = dZ · A_prev^T  (gradient of the loss w.r.t. the weights)
+         self.dW = np.dot(dA, self.A_prev.T)
+
+         # db = sum(dZ) over the batch  (gradient w.r.t. the biases)
+         self.db = np.sum(dA, axis=1, keepdims=True)
+
+         # Gradient to pass backward: dA_prev = W^T · dZ
+         return np.dot(self.W.T, dA)
+
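
A minimal sketch exercising the Dense layer as defined above (column convention: features along axis 0, samples along axis 1). The underscore-prefixed methods are normally driven by `NeuralNetwork`; calling them directly here is purely illustrative.

    import numpy as np

    np.random.seed(0)
    layer = Dense(units=3)                           # input size inferred lazily
    X = np.random.randn(4, 8).astype(np.float32)     # 4 features, batch of 8

    Z = layer._forward(X)                            # builds W (3, 4) and b (3, 1)
    assert Z.shape == (3, 8)

    dZ = np.ones_like(Z)                             # stand-in upstream gradient
    dX = layer._backward(dZ)
    assert dX.shape == X.shape and layer.dW.shape == layer.W.shape
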
+     def _update(self, lambda_, lr, beta1, beta2, _eps, optimizer, t):
+         if optimizer == "adamW":
+             dw = self.dW  # pure gradient; weight decay is applied separately below
+         else:
+             # Batch size from the cached forward input, to scale the L2 term
+             m = self.A_prev.shape[1]
+             dw = self.dW + (lambda_ / m) * self.W  # L2 regularization
+
+         if optimizer == "momentum":
+             self.vW = beta1 * self.vW + dw
+             self.vb = beta1 * self.vb + self.db
+
+             update_w = self.vW
+             update_b = self.vb
+
+         elif optimizer == "adam" or optimizer == "adamW":
+             self.mW = beta1 * self.mW + (1 - beta1) * dw
+             self.mb = beta1 * self.mb + (1 - beta1) * self.db
+
+             self.vW = beta2 * self.vW + (1 - beta2) * (dw ** 2)
+             self.vb = beta2 * self.vb + (1 - beta2) * (self.db ** 2)
+
+             # Bias correction (t is the 1-based step count)
+             m_w_hat = self.mW / (1 - beta1 ** t)
+             m_b_hat = self.mb / (1 - beta1 ** t)
+
+             v_w_hat = self.vW / (1 - beta2 ** t)
+             v_b_hat = self.vb / (1 - beta2 ** t)
+
+             update_w = m_w_hat / (np.sqrt(v_w_hat) + _eps)
+             update_b = m_b_hat / (np.sqrt(v_b_hat) + _eps)
+
+         elif optimizer == "rmsprop":
+             self.vW = beta2 * self.vW + (1 - beta2) * (dw ** 2)
+             self.vb = beta2 * self.vb + (1 - beta2) * (self.db ** 2)
+
+             update_w = dw / (np.sqrt(self.vW) + _eps)
+             update_b = self.db / (np.sqrt(self.vb) + _eps)
+
+         else:
+             # Classic SGD
+             update_w = dw
+             update_b = self.db
+
+         # Parameter step: W = W - lr * update
+         self.W -= lr * update_w
+         self.b -= lr * update_b
+
+         # Decoupled (adamW) weight decay
+         if optimizer == "adamW":
+             self.W *= (1 - lr * lambda_)
+
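
A sketch of a single optimizer step using the branches above. `t` is the 1-based step count used for Adam's bias correction; with `optimizer="adamW"` the L2 term is dropped from the gradient and applied as the decoupled `W *= 1 - lr * lambda_` decay instead.

    import numpy as np

    np.random.seed(0)
    layer = Dense(units=2)
    X = np.random.randn(3, 16).astype(np.float32)
    dZ = layer._forward(X) - 1.0                     # toy upstream gradient
    layer._backward(dZ)

    W_before = layer.W.copy()
    layer._update(lambda_=1e-4, lr=1e-3, beta1=0.9, beta2=0.999,
                  _eps=1e-8, optimizer="adamW", t=1)
    assert not np.allclose(layer.W, W_before)        # weights moved and decayed
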
+ class Activation(Layer):
+     def __init__(self):
+         super().__init__()
+
+     def _update(self, *args, **kwargs):
+         pass  # no parameters to update
+
+
+ class ReLu(Activation):
+
+     def _forward(self, Z, training=None):
+         self.Z = Z
+         self.A = np.maximum(0, Z)
+         return self.A
+
+     def _backward(self, dA):
+         return dA * (self.Z > 0)
+
+
+ class Sigmoid(Activation):
+
+     def _forward(self, Z, training=None):
+         self.Z = Z
+         self.A = 1 / (1 + np.exp(-Z))
+         return self.A
+
+     def _backward(self, dA):
+         return dA * self.A * (1 - self.A)
+
+
+ class Softmax(Activation):
+
+     def _forward(self, Z, training=None):
+         self.Z = Z
+         # Shift by the column max for numerical stability
+         shifted = Z - np.max(Z, axis=0, keepdims=True)
+         exp_vals = np.exp(shifted)
+         self.A = exp_vals / np.sum(exp_vals, axis=0, keepdims=True)
+         return self.A
+
+     def _backward(self, dA):
+         # Jacobian-vector product: A ⊙ (dA − Σ(dA ⊙ A)), column-wise
+         s = np.sum(dA * self.A, axis=0, keepdims=True)
+         return self.A * (dA - s)
+
+
+ class Linear(Activation):
+
+     def _forward(self, Z, training=None):
+         self.Z = Z
+         self.A = Z
+         return self.A
+
+     def _backward(self, dA):
+         return dA  # derivative is 1
+
+
+ class Tanh(Activation):
+
+     def _forward(self, Z, training=None):
+         self.Z = Z
+         self.A = np.tanh(Z)
+         return self.A
+
+     def _backward(self, dA):
+         return dA * (1 - self.A ** 2)
+
+
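
The Softmax backward above is the Jacobian-vector product A ⊙ (dA − Σ(dA ⊙ A)), applied column-wise. A quick finite-difference sketch to confirm it matches the numerical gradient of L(Z) = dA · softmax(Z):

    import numpy as np

    np.random.seed(0)
    sm = Softmax()
    Z = np.random.randn(5, 1)
    dA = np.random.randn(5, 1)                       # arbitrary upstream gradient

    sm._forward(Z)
    analytic = sm._backward(dA)

    numeric = np.zeros_like(Z)
    eps = 1e-6
    for i in range(Z.shape[0]):
        Zp, Zm = Z.copy(), Z.copy()
        Zp[i] += eps
        Zm[i] -= eps
        numeric[i] = (np.sum(dA * Softmax()._forward(Zp))
                      - np.sum(dA * Softmax()._forward(Zm))) / (2 * eps)

    assert np.allclose(analytic, numeric, atol=1e-5)
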
+ class BatchNorm(Layer):
+     def __init__(self, momentum=0.9):
+         super().__init__()
+         self.momentum = momentum
+         self._eps = 1e-08
+
+         self.gamma = None
+         self.beta = None
+
+         self.running_mean = None
+         self.running_var = None
+
+     def build(self, input_size):
+         self.gamma = np.ones((input_size, 1), dtype=np.float32)
+         self.beta = np.zeros((input_size, 1), dtype=np.float32)
+
+         self.running_mean = np.zeros((input_size, 1), dtype=np.float32)
+         self.running_var = np.ones((input_size, 1), dtype=np.float32)
+
+     def _forward(self, A_prev, training=True):
+         if self.gamma is None:
+             self.build(A_prev.shape[0])
+
+         self.A_prev = A_prev
+
+         if training:
+             self.mean = np.mean(A_prev, axis=1, keepdims=True)
+             self.var = np.var(A_prev, axis=1, keepdims=True)
+
+             # X̂ = (X - μ_B) / √(σ_B² + ε)
+             self.X_hat = (A_prev - self.mean) / np.sqrt(self.var + self._eps)
+             # A = γ · X̂ + β
+             self.A = self.gamma * self.X_hat + self.beta
+
+             # Update running stats
+             self.running_mean = (
+                 self.momentum * self.running_mean
+                 + (1 - self.momentum) * self.mean
+             )
+             self.running_var = (
+                 self.momentum * self.running_var
+                 + (1 - self.momentum) * self.var
+             )
+
+         else:
+             # Inference: normalize with the running statistics
+             self.X_hat = (A_prev - self.running_mean) / np.sqrt(self.running_var + self._eps)
+             self.A = self.gamma * self.X_hat + self.beta
+
+         return self.A
+
+     def _backward(self, dA, skip_activation=False):
+         m = dA.shape[1]
+
+         dgamma = np.sum(dA * self.X_hat, axis=1, keepdims=True)
+         dbeta = np.sum(dA, axis=1, keepdims=True)
+
+         dX_hat = dA * self.gamma
+
+         var_inv = 1. / np.sqrt(self.var + self._eps)
+
+         dvar = np.sum(dX_hat * (self.A_prev - self.mean) * -0.5 * var_inv**3,
+                       axis=1, keepdims=True)
+
+         dmean = (
+             np.sum(dX_hat * -var_inv, axis=1, keepdims=True)
+             + dvar * np.mean(-2. * (self.A_prev - self.mean), axis=1, keepdims=True)
+         )
+
+         dX = (
+             dX_hat * var_inv
+             + dvar * 2 * (self.A_prev - self.mean) / m
+             + dmean / m
+         )
+
+         self.dgamma = dgamma / m
+         self.dbeta = dbeta / m
+
+         return dX
+
+     def _update(self, lambda_, lr, beta1, beta2, eps, optimizer, t):
+         # Plain SGD step for γ and β (optimizer state is not used here)
+         self.gamma -= lr * self.dgamma
+         self.beta -= lr * self.dbeta
+
+
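
A short sketch of the train/inference split above: during training the layer normalizes with batch statistics and folds them into the running estimates; at inference it reuses the running estimates, so even a single sample normalizes deterministically.

    import numpy as np

    np.random.seed(0)
    bn = BatchNorm(momentum=0.9)
    X = (np.random.randn(4, 32) * 3 + 5).astype(np.float32)

    A = bn._forward(X, training=True)
    # Per-feature activations are standardized (gamma=1, beta=0 initially)
    assert np.allclose(A.mean(axis=1), 0, atol=1e-4)
    assert np.allclose(A.var(axis=1), 1, atol=1e-2)

    # Inference path uses the running statistics, not the batch's own
    out = bn._forward(np.random.randn(4, 1).astype(np.float32), training=False)
    assert out.shape == (4, 1)
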
+ class Dropout(Layer):
+     def __init__(self, rate):
+         super().__init__()
+         self.rate = rate  # probability of dropping a unit
+         self.mask = None
+
+     def _forward(self, A_prev, training=True):
+         if not training:
+             # No dropout during inference
+             self.mask = np.ones_like(A_prev)
+             self.A = A_prev
+             return self.A
+
+         # Create dropout mask
+         self.mask = np.random.rand(*A_prev.shape) > self.rate
+         # Apply mask AND scale (inverted dropout)
+         self.A = (A_prev * self.mask) / (1 - self.rate)
+
+         return self.A
+
+     def _backward(self, dA, skip_activation=False):
+         # Backprop only through surviving units, with the same scaling
+         dA_prev = (dA * self.mask) / (1 - self.rate)
+         return dA_prev
+
+     def _update(self, lambda_, lr, beta1, beta2, _eps, optimizer, t):
+         # Dropout has no trainable parameters
+         pass
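
Finally, a sketch of why the inverted-dropout scaling above keeps activations calibrated: each unit survives with probability 1 − rate, and dividing by 1 − rate makes the expected output equal to the input, so no extra rescaling is needed at inference.

    import numpy as np

    np.random.seed(0)
    drop = Dropout(rate=0.5)
    X = np.ones((1, 100_000), dtype=np.float32)

    A = drop._forward(X, training=True)
    assert abs(A.mean() - 1.0) < 0.02                # survivors scaled by 1/(1-rate)

    # At inference the layer is the identity
    assert np.array_equal(drop._forward(X, training=False), X)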