froog 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- froog/__init__.py +34 -1
- froog/{gradcheck.py → gradient.py} +4 -11
- froog/ops.py +354 -87
- froog/optim.py +104 -32
- froog/tensor.py +219 -219
- froog/utils.py +8 -7
- froog-0.5.0.dist-info/METADATA +205 -0
- froog-0.5.0.dist-info/RECORD +10 -0
- {froog-0.4.0.dist-info → froog-0.5.0.dist-info}/WHEEL +1 -1
- froog/nn.py +0 -60
- froog/ops_gpu.py +0 -598
- froog-0.4.0.dist-info/LICENSE +0 -1
- froog-0.4.0.dist-info/METADATA +0 -293
- froog-0.4.0.dist-info/RECORD +0 -13
- {froog-0.4.0.dist-info → froog-0.5.0.dist-info}/top_level.txt +0 -0
froog/ops.py
CHANGED
@@ -7,7 +7,8 @@
 # |___| |___| |_||_______||_______||_______|
 
 import numpy as np
-from froog.tensor import Function, register
+from typing import Tuple, Union, Optional, Any, Callable
+from froog.tensor import Function, register, Tensor
 from froog.utils import im2col, col2im
 
 # *****************************************************
@@ -22,32 +23,74 @@ from froog.utils import im2col, col2im
 
 class Add(Function):# x.add(y)
   @staticmethod # @staticmethod doesn't require an instance of Add to work, so you can do x.add(y)
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    # Check if we have GPU buffers
+    is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+    if is_metal_buffer(x) or is_metal_buffer(y):
+      # Import get_buffer_data helper for Metal buffers
+      try:
+        from froog.gpu.buffer_utils import get_buffer_data
+        x_data = get_buffer_data(x)
+        y_data = get_buffer_data(y)
+        ctx.save_for_backward(x_data, y_data)
+        return x_data + y_data
+      except ImportError:
+        print("Warning: buffer_utils not available")
+        # Fall back to regular implementation
+        ctx.save_for_backward(x, y)
+        return x + y
+
+    # Regular implementation
+    ctx.save_for_backward(x, y)
     return x + y
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     return grad_output, grad_output
 register("add", Add)
 
 class Sub(Function): # x.sub(y)
   @staticmethod
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
     return x-y
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     return grad_output, -grad_output
 register('sub', Sub)
 
 class Mul(Function): # x.mul(y)
   @staticmethod
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    # Check if we have GPU buffers
+    is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+    if is_metal_buffer(x) or is_metal_buffer(y):
+      # Import get_buffer_data helper for Metal buffers
+      try:
+        from froog.gpu.buffer_utils import get_buffer_data, buffer_mul
+        x_data = get_buffer_data(x)
+        y_data = get_buffer_data(y)
+        ctx.save_for_backward(x_data, y_data)
+        return buffer_mul(x, y)
+      except Exception as e:
+        print(f"Error in Mul.forward with buffer: {e}")
+        # Fall back to CPU implementation if buffer handling fails
+        from froog.gpu import get_device
+        device = get_device()
+        if device:
+          x_cpu = device.download_tensor(x) if is_metal_buffer(x) else x
+          y_cpu = device.download_tensor(y) if is_metal_buffer(y) else y
+          ctx.save_for_backward(x_cpu, y_cpu)
+          result = x_cpu * y_cpu
+          return device.upload_tensor(result)
+        raise
+
+    # Standard CPU implementation
     ctx.save_for_backward(x, y)
     return x * y
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     x, y = ctx.saved_tensors
     return y * grad_output, x * grad_output
 register("mul", Mul)
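Note on the new GPU dispatch above: Add.forward and Mul.forward duck-type Metal buffers instead of importing Metal up front. A minimal sketch of how that check behaves, using a stand-in object (FakeMetalBuffer is hypothetical, not part of froog):

import numpy as np

class FakeMetalBuffer:  # stand-in for a pyobjc-backed Metal buffer
    __pyobjc_object__ = object()

is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0

print(is_metal_buffer(FakeMetalBuffer()))  # True: matched via the __pyobjc_object__ attribute
print(is_metal_buffer(np.ones(3)))         # False: plain ndarrays take the CPU path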
@@ -58,24 +101,49 @@ class Sum(Function): # x.sum()
   reduces its input tensor to a single value by summing all the elements
   """
   @staticmethod
-  def forward(ctx, input):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
+    # Check if we have a GPU buffer
+    is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+    if is_metal_buffer(input):
+      # Use buffer utilities
+      try:
+        from froog.gpu.buffer_utils import get_buffer_data, buffer_sum
+        input_data = get_buffer_data(input)
+        ctx.save_for_backward(input_data)
+        ctx.input_shape = input_data.shape
+        return buffer_sum(input)
+      except Exception as e:
+        print(f"Error in Sum.forward with buffer: {e}")
+        # Fall back to CPU implementation
+        from froog.gpu import get_device
+        device = get_device()
+        if device:
+          input_cpu = device.download_tensor(input)
+          ctx.save_for_backward(input_cpu)
+          ctx.input_shape = input_cpu.shape
+          result = np.array([np.sum(input_cpu)])
+          return device.upload_tensor(result)
+        raise
+
+    # Standard CPU implementation
     ctx.save_for_backward(input)
-    return np.array([input.sum()])
+    ctx.input_shape = input.shape
+    return np.array([np.sum(input)])
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     (input,) = ctx.saved_tensors
     return grad_output * np.ones_like(input)
 register("sum", Sum)
 
 class Pow(Function): # x.pow(y)
   @staticmethod
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(x, y)
     return x ** y
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     x, y = ctx.saved_tensors
     return y * (x**(y-1.0)) * grad_output, (x**y) * np.log(x) * grad_output # power rule, d/dx (y^x)
 register("pow", Pow)
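The Pow backward above applies the power rule for d/dx and the exponential rule for d/dy. A quick finite-difference check of those formulas, in plain numpy and independent of froog:

import numpy as np

x, y, grad_output = np.array([2.0]), np.array([3.0]), np.array([1.0])
dx = y * (x**(y-1.0)) * grad_output    # 3 * 2^2 = 12
dy = (x**y) * np.log(x) * grad_output  # 8 * ln(2) ~= 5.545

eps = 1e-6  # central differences against both formulas
assert np.allclose(dx, ((x+eps)**y - (x-eps)**y) / (2*eps), atol=1e-4)
assert np.allclose(dy, (x**(y+eps) - x**(y-eps)) / (2*eps), atol=1e-4)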
@@ -92,16 +160,65 @@ register("pow", Pow)
 
 class Dot(Function): # x.dot(y)
   @staticmethod
-  def forward(ctx, input, weight):
+  def forward(ctx: Any, input: np.ndarray, weight: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(input, weight)
-    return input.dot(weight)
+
+    # Check if we're working with GPU buffers
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      # Convert any GPU buffers to CPU for the operation
+      if is_buffer(input):
+        input_cpu = download_tensor(input)
+      else:
+        input_cpu = input
+
+      if is_buffer(weight):
+        weight_cpu = download_tensor(weight)
+      else:
+        weight_cpu = weight
+
+      return input_cpu.dot(weight_cpu)
+    except Exception as e:
+      import traceback
+      print(f"Error in dot operation: {str(e)}")
+      print(f"  Self: {input}")
+      print(f"  Arg 0: {weight}")
+      print(f"  Kwargs: {{}}")
+      traceback.print_exc()
+      # Try the original method as fallback
+      return input.dot(weight)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     input, weight = ctx.saved_tensors
-    grad_input = grad_output.dot(weight.T)
-    grad_weight = grad_output.T.dot(input).T
-    return grad_input, grad_weight
+
+    # Convert GPU buffers to CPU if needed
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      if is_buffer(input):
+        input_cpu = download_tensor(input)
+      else:
+        input_cpu = input
+
+      if is_buffer(weight):
+        weight_cpu = download_tensor(weight)
+      else:
+        weight_cpu = weight
+
+      if is_buffer(grad_output):
+        grad_output_cpu = download_tensor(grad_output)
+      else:
+        grad_output_cpu = grad_output
+
+      return grad_output_cpu.dot(weight_cpu.T), input_cpu.T.dot(grad_output_cpu)
+    except Exception as e:
+      print(f"Error in dot backward: {str(e)}")
+      # Fallback
+      return grad_output.dot(weight.T), input.T.dot(grad_output)
 register('dot', Dot)
 register('matmul', Dot)
 
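Whichever path Dot.backward takes, it returns grad_output @ W.T and x.T @ grad_output. A shape sanity check in plain numpy:

import numpy as np

bs, n_in, n_out = 4, 3, 2  # for y = x.dot(W)
x, W = np.random.randn(bs, n_in), np.random.randn(n_in, n_out)
grad_output = np.random.randn(bs, n_out)

grad_x = grad_output.dot(W.T)  # (bs, n_in), same shape as x
grad_W = x.T.dot(grad_output)  # (n_in, n_out), same shape as W
assert grad_x.shape == x.shape and grad_W.shape == W.shape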
@@ -117,12 +234,12 @@ register('matmul', Dot)
 
 class ReLU(Function):
   @staticmethod
-  def forward(ctx, input):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(input)
     return np.maximum(input, 0) # relu(x) = max(0,x)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     input, = ctx.saved_tensors
     grad_input = grad_output * (input >= 0)
     return grad_input
@@ -130,26 +247,61 @@ register("relu", ReLU)
 
 class Sigmoid(Function):
   @staticmethod
-  def forward(ctx, input):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(input)
     ret = 1/(1 + np.exp(-input)) # sigmoid(x) = 1 / (1 + exp(-x))
     return ret
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     ret, = ctx.saved_tensors
     grad_input = grad_output * (ret * (1 - ret)) # just take the derivative of sigmoid
     return grad_input
 register("sigmoid", Sigmoid)
 
+class DropoutLayer:
+  """
+  Dropout layer that randomly sets a fraction of input units to 0 during training time.
+  pytorch version: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
+  """
+  def __init__(self, p: float = 0.5) -> None:
+    self.p = p
+    self.training = True
+
+  def __call__(self, x):
+    # build a CPU-side random mask of the same shape as the tensor x
+    mask_np = (np.random.rand(*x.shape) >= self.p).astype(np.float32) / (1.0 - self.p)
+    from froog.tensor import Tensor
+    mask_t = Tensor(mask_np)
+    if getattr(x, "is_gpu", False): mask_t = mask_t.to_gpu()
+    return x.mul(mask_t)
+
+class Dropout(Function):
+  @staticmethod
+  def forward(ctx: Any, input: np.ndarray, p: float = 0.5, training: bool = True) -> np.ndarray:
+    if not training: return input
+    # create a binary mask with probability (1-p) of being 1
+    # scale by 1/(1-p) to keep expectation same
+    ctx.training = training
+    mask = (np.random.rand(*input.shape) >= p).astype(np.float32) / (1.0 - p if p < 1.0 else 1e-9) # avoid division by zero if p is 1.0
+    ctx.save_for_backward(mask)
+    return input * mask
+
+  @staticmethod
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
+    if not ctx.training: return grad_output
+    mask, = ctx.saved_tensors
+    return grad_output * mask
+register("dropout", Dropout)
+
 class Reshape(Function):
   @staticmethod
-  def forward(ctx, x, shape):
+  def forward(ctx: Any, x: np.ndarray, shape: Tuple[int, ...]) -> np.ndarray:
     ctx.save_for_backward(x.shape)
     return x.reshape(shape)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     in_shape, = ctx.saved_tensors
     return grad_output.reshape(in_shape)
 register('reshape', Reshape)
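Both DropoutLayer and the Dropout Function above build the same inverted-dropout mask: zero with probability p, scaled by 1/(1-p) so the expected activation is unchanged. Verifying that property with plain numpy:

import numpy as np

p = 0.3
x = np.ones((1000, 1000), dtype=np.float32)
mask = (np.random.rand(*x.shape) >= p).astype(np.float32) / (1.0 - p)
out = x * mask

assert abs(out.mean() - 1.0) < 1e-2       # expectation preserved by the 1/(1-p) scaling
assert abs((out == 0).mean() - p) < 1e-2  # about p of the units are dropped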
@@ -159,11 +311,13 @@ class Pad2D(Function):
   The first element (0,0) corresponds to padding along the batch dimension, which indicates no padding on both sides (0 elements added).
   """
   @staticmethod
-  def forward(ctx, x, padding=None):
+  def forward(ctx: Any, x: np.ndarray, padding: Optional[Tuple[int, int, int, int]] = None) -> np.ndarray:
+    if padding is None:
+      padding = (0, 0, 0, 0)
     return np.pad(x, ((0,0), (0,0), (padding[0], padding[1]), (padding[2], padding[3]))) # (top, bottom, left, right)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     raise Exception("write this")
 register('pad2d', Pad2D)
 
@@ -173,8 +327,8 @@ class LogSoftmax(Function):
   probabilities of each value are proportional to the scale of each value
   """
   @staticmethod
-  def forward(ctx, input):
-    def logsumexp(x):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
+    def logsumexp(x: np.ndarray) -> np.ndarray:
       c = x.max(axis=1)
       return c + np.log(np.exp(x - c.reshape((-1, 1))).sum(axis=1)) # axis=1 refers to the columns
 
@@ -183,7 +337,7 @@ class LogSoftmax(Function):
     return output
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     (output,) = ctx.saved_tensors
     return grad_output - np.exp(output)*(grad_output.sum(axis=1).reshape((-1, 1)))
 register("logsoftmax", LogSoftmax)
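The logsumexp helper in the hunk above subtracts the per-row max before exponentiating, which is what keeps LogSoftmax finite for large logits. The same trick in isolation:

import numpy as np

def logsumexp(x: np.ndarray) -> np.ndarray:
    c = x.max(axis=1)  # subtracting the row max keeps np.exp in range
    return c + np.log(np.exp(x - c.reshape((-1, 1))).sum(axis=1))

x = np.array([[1000.0, 999.0]])  # naive log(sum(exp(x))) overflows to inf here
print(logsumexp(x))              # [1000.31326169], finite
print(np.exp(x - logsumexp(x).reshape((-1, 1))).sum())  # softmax still sums to 1.0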
@@ -200,62 +354,123 @@ register("logsoftmax", LogSoftmax)
 
 class Conv2D(Function): # TODO: understand group splits
   @staticmethod
-  def forward(ctx, x, w, stride=1, groups=1):
+  def forward(ctx: Any, x: np.ndarray, w: np.ndarray, stride: Union[int, Tuple[int, int]] = 1, groups: int = 1) -> np.ndarray:
     """
     https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
     WARNING: doesn't handle padding or strides yet
     Args:
-      x.shape[0]
-      cout
-      x.shape[2]-(H-1)
-      x.shape[3]-(W-1)
+      x.shape[0] --> number of input examples (batch size)
+      cout --> number of output channels
+      x.shape[2]-(H-1) --> non-padded height of conv output, need to subtract because this is an unpadded conv
+      x.shape[3]-(W-1) --> width of output
     Shape:
       (a, b, c, d)(e, f, g, h) --> (a, e, c-(g-1), d-(h-1))
     in general, output x and y = [(W−K+2P)/S]+1
     """
-    if type(ctx.stride) == int: # ctx stores function params
+    ctx.stride = stride
+    ctx.groups = groups
+
+    if isinstance(ctx.stride, int): # ctx stores function params
       ctx.stride = (ctx.stride, ctx.stride)
 
-    cout, cin, H, W = w.shape
-
-    tw = w.reshape(cout, -1).T # slice of kernel
-    y_stride, x_stride = ctx.stride
-
-    bs,cin_,oy,ox = x.shape[0], x.shape[1], (x.shape[2]-(H-y_stride))//y_stride, (x.shape[3]-(W-x_stride))//x_stride
-    assert cin*ctx.groups == cin_ # ensures that the channel dimensions match appropriately for grouping
-    assert cout % ctx.groups == 0 # ensures that the number of output channels can be evenly divided among the groups
-    g_w_chans = cout//ctx.groups # number of output channels per group
-
-    ctx.save_for_backward(x, w)
-    ret = np.zeros((bs, cout, oy, ox), dtype=w.dtype)
-
-    for g in range(ctx.groups):
-      tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1).T # transformed kernel weights
-      for Y in range(oy):
-        for X in range(ox):
-          iY,iX = Y*y_stride, X*x_stride
-          tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(bs, -1)
-          ret[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] += tx.dot(tw)
-    return ret
+    # Check if we're working with GPU buffers and convert to CPU
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      # Convert input to CPU if it's a GPU buffer
+      if is_buffer(x):
+        x_cpu = download_tensor(x)
+      else:
+        x_cpu = x
+
+      # Convert weight to CPU if it's a GPU buffer
+      if is_buffer(w):
+        w_cpu = download_tensor(w)
+      else:
+        w_cpu = w
+
+      # Now use the CPU tensors for the rest of the computation
+      cout, cin, H, W = w_cpu.shape
+
+      tw = w_cpu.reshape(cout, -1).T # slice of kernel
+      y_stride, x_stride = ctx.stride
+
+      bs,cin_,oy,ox = x_cpu.shape[0], x_cpu.shape[1], (x_cpu.shape[2]-(H-y_stride))//y_stride, (x_cpu.shape[3]-(W-x_stride))//x_stride
+      assert cin*ctx.groups == cin_ # ensures that the channel dimensions match appropriately for grouping
+      assert cout % ctx.groups == 0 # ensures that the number of output channels can be evenly divided among the groups
+      g_w_chans = cout//ctx.groups # number of output channels per group
+
+      ctx.save_for_backward(x_cpu, w_cpu)
+      ret = np.zeros((bs, cout, oy, ox), dtype=w_cpu.dtype)
+
+      for g in range(ctx.groups):
+        tw = w_cpu[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1).T # transformed kernel weights
+        for Y in range(oy):
+          for X in range(ox):
+            iY,iX = Y*y_stride, X*x_stride
+            tx = x_cpu[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(bs, -1)
+            ret[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] += tx.dot(tw)
+      return ret
+    except Exception as e:
+      import traceback
+      print(f"Error in conv2d operation: {str(e)}")
+      print(f"  Self: {x}")
+      print(f"  Arg 0: {w}")
+      print(f"  Kwargs: {{stride: {stride}, groups: {groups}}}")
+      traceback.print_exc()
+      raise
 
   @staticmethod
-  def backward(ctx, grad_output):
-    x, w = ctx.saved_tensors
-    cout, cin, H, W = w.shape
-    dx, dw = np.zeros_like(x), np.zeros_like(w)
-    y_stride, x_stride = ctx.stride
-    g_w_chans = cout//ctx.groups
-
-    for g in range(ctx.groups):
-      tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1)
-      for Y in range(grad_output.shape[2]):
-        for X in range(grad_output.shape[3]):
-          iY,iX = Y*y_stride, X*x_stride
-          gg = grad_output[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] # current multiply element in chain rule
-          tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(x.shape[0], -1) # slice of tensor at current conv op
-          dw[g*g_w_chans:(g*g_w_chans+g_w_chans)] += gg.T.dot(tx).reshape((g_w_chans,cin,H,W)) # gradient with respect to input
-          dx[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W] += gg.dot(tw).reshape(dx.shape[0], cin, H, W) # accumulate gradient with respect to weights
-    return dx, dw
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      # Convert grad_output to CPU if it's a GPU buffer
+      if is_buffer(grad_output):
+        grad_output_cpu = download_tensor(grad_output)
+      else:
+        grad_output_cpu = grad_output
+
+      x, w = ctx.saved_tensors
+      cout, cin, H, W = w.shape
+      dx, dw = np.zeros_like(x), np.zeros_like(w)
+      y_stride, x_stride = ctx.stride
+      g_w_chans = cout//ctx.groups
+
+      for g in range(ctx.groups):
+        tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1)
+        for Y in range(grad_output_cpu.shape[2]):
+          for X in range(grad_output_cpu.shape[3]):
+            iY,iX = Y*y_stride, X*x_stride
+            gg = grad_output_cpu[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] # current multiply element in chain rule
+            tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(x.shape[0], -1) # slice of tensor at current conv op
+            dw[g*g_w_chans:(g*g_w_chans+g_w_chans)] += gg.T.dot(tx).reshape((g_w_chans,cin,H,W)) # gradient with respect to input
+            dx[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W] += gg.dot(tw).reshape(dx.shape[0], cin, H, W) # accumulate gradient with respect to weights
+      return dx, dw
+    except Exception as e:
+      import traceback
+      print(f"Error in conv2d backward: {str(e)}")
+      print(f"  Grad Output: {grad_output}")
+      traceback.print_exc()
+      # Fallback to original implementation
+      x, w = ctx.saved_tensors
+      cout, cin, H, W = w.shape
+      dx, dw = np.zeros_like(x), np.zeros_like(w)
+      y_stride, x_stride = ctx.stride
+      g_w_chans = cout//ctx.groups
+
+      for g in range(ctx.groups):
+        tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1)
+        for Y in range(grad_output.shape[2]):
+          for X in range(grad_output.shape[3]):
+            iY,iX = Y*y_stride, X*x_stride
+            gg = grad_output[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] # current multiply element in chain rule
+            tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(x.shape[0], -1) # slice of tensor at current conv op
+            dw[g*g_w_chans:(g*g_w_chans+g_w_chans)] += gg.T.dot(tx).reshape((g_w_chans,cin,H,W)) # gradient with respect to input
+            dx[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W] += gg.dot(tw).reshape(dx.shape[0], cin, H, W) # accumulate gradient with respect to weights
+      return dx, dw
 register('conv2d', Conv2D)
 
 
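The Conv2D docstring gives the general output size [(W−K+2P)/S]+1 and the shape rule (a, b, c, d)(e, f, g, h) --> (a, e, c-(g-1), d-(h-1)). A quick numeric check of both, numpy only:

import numpy as np

W_in, K, P, S = 32, 5, 0, 1             # unpadded, stride 1
assert (W_in - K + 2*P)//S + 1 == 28    # [(W-K+2P)/S]+1

x = np.zeros((2, 3, 32, 32))            # (a, b, c, d)
w = np.zeros((8, 3, 5, 5))              # (e, f, g, h)
oy, ox = x.shape[2] - (w.shape[2]-1), x.shape[3] - (w.shape[3]-1)
assert (x.shape[0], w.shape[0], oy, ox) == (2, 8, 28, 28)  # (a, e, c-(g-1), d-(h-1))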
@@ -266,7 +481,7 @@ class im2ColConv(Function):
   """
 
   @staticmethod
-  def forward(ctx, x, w):
+  def forward(ctx: Any, x: np.ndarray, w: np.ndarray) -> np.ndarray:
     cout, cin, k_h, k_x = w.shape
     bs, oy, ox = x.shape[0], x.shape[2]-(k_h-1), x.shape[3]-(k_x-1)
     tw = w.reshape(cout, -1).T # each filter flattened into a row
@@ -276,7 +491,7 @@ class im2ColConv(Function):
     return np.moveaxis(ret, [0,1,2,3], [0,2,3,1]) # reorders the axes (batch size, number of channels, height, width)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     bs,_,oy,ox = grad_output.shape
     tx, w = ctx.saved_tensors # transformed input, filter weights
     cout,cin,H,W = w.shape
@@ -298,7 +513,7 @@ register('im2col2dconv', im2ColConv)
 #
 # **************** pooling ops ***************
 
-def stack_for_pool(x, pool_y, pool_x):
+def stack_for_pool(x: np.ndarray, pool_y: int, pool_x: int) -> np.ndarray:
   my, mx = (x.shape[2]//pool_y)*pool_y, (x.shape[3]//pool_x)*pool_x # ensures input tensor can be evenly divided into 2x2 blocks for max pooling
   stack = []
   cropped_x = x[:, :, :my, :mx] # crop input so 2x2 max pool can be taken
@@ -308,47 +523,49 @@ def stack_for_pool(x, pool_y, pool_x):
   return np.concatenate(stack, axis=0) # put all into one row
 
 
-def unstack_for_pool(fxn, s, py, px):
+def unstack_for_pool(fxn: Callable[[int], np.ndarray], s: Tuple[int, ...], py: int, px: int) -> np.ndarray:
   max_y, max_x = (s[2]//py)*py, (s[3]//px)*px # get shape that allows (pool_size_y,pool_size_x) max pool
+  ret = None
   for Y in range(py):
     for X in range(px):
       level_w_new_grad = fxn(Y*px+X)
       if X == 0 and Y == 0: # pool of zero size
         ret = np.zeros(s, dtype=level_w_new_grad.dtype)
-      ret[:, :, Y:max_y:py, X:max_x:px] = level_w_new_grad
-  return ret
+      if ret is not None:
+        ret[:, :, Y:max_y:py, X:max_x:px] = level_w_new_grad
+  return ret if ret is not None else np.zeros(s)
 
 
 class MaxPool2D(Function):
   @staticmethod
-  def forward(ctx, x, kernel_size=(2,2)):
+  def forward(ctx: Any, x: np.ndarray, kernel_size: Tuple[int, int] = (2,2)) -> np.ndarray:
+    ctx.kernel_size = kernel_size
     stack = stack_for_pool(x, *kernel_size)
     idx_of_max = np.argmax(stack, axis=0)
     ctx.save_for_backward(idx_of_max, x.shape)
     return np.max(stack, axis=0)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     """
     Distributes the gradient from the output of the max pooling layer to its inputs
     The purpose of (idxs == idx) is to generate a boolean mask indicating the locations of the maximum values in each 2x2 block of the original input
     The expression (Y*2+X) is a way to iterate through the four possible positions within the kernel block: e.g. (0,0), (0,1), (1,0), and (1,1), which get mapped to the indices 0, 1, 2, and 3
     """
-    idxs, s = ctx.saved_tensors
-    return unstack_for_pool(lambda idx: grad_output * (idxs == idx),
-                            s,
-                            *ctx.kernel_size)
+    idxs, s = ctx.saved_tensors
+    return unstack_for_pool(lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size)
 register('max_pool2d', MaxPool2D)
 
 class AvgPool2D(Function):
   @staticmethod
-  def forward(ctx, x, kernel_size=(2,2)):
+  def forward(ctx: Any, x: np.ndarray, kernel_size: Tuple[int, int] = (2,2)) -> np.ndarray:
+    ctx.kernel_size = kernel_size
     stack = stack_for_pool(x, *kernel_size)
     ctx.save_for_backward(x.shape)
     return np.mean(stack, axis=0)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     s, = ctx.saved_tensors
     py, px = ctx.kernel_size # kernel_size passed from forward context
     my, mx = (s[2]//py)*py, (s[3]//px)*px
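MaxPool2D saves argmax indices in forward and turns them back into a one-hot mask per pooling window in backward. The idea on a single 2x2 window, in plain numpy:

import numpy as np

x = np.array([[1., 2.],
              [3., 4.]])  # one 2x2 pooling window
stack = np.stack([x[0::2, 0::2], x[0::2, 1::2], x[1::2, 0::2], x[1::2, 1::2]])
idx_of_max = np.argmax(stack, axis=0)  # 3 -> the bottom-right cell wins
assert np.max(stack, axis=0).item() == 4.0
mask = (np.arange(4).reshape(4, 1, 1) == idx_of_max)  # routes the whole gradient to that cell
assert mask[3].all() and mask[:3].sum() == 0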
@@ -358,3 +575,53 @@ class AvgPool2D(Function):
       ret[:, :, Y:my:py, X:mx:px] = grad_output / py / px # divide by avg of pool, e.g. for 2x2 pool /= 4
     return ret
 register('avg_pool2d', AvgPool2D)
+
+# *************************************
+#     _   ___   __   ____  ____  _____
+#    / | / / | / /  / __ \/ __ \/ ___/
+#   /  |/ /  |/ /  / / / / /_/ /\__ \
+#  / /|  / /|  /  / /_/ / ____/___/ /
+# /_/ |_/_/ |_/   \____/_/    /____/
+#
+# ************* nn ops ************
+
+def Linear(*x: int) -> np.ndarray:
+  # random Glorot initialization
+  ret = np.random.uniform(-1., 1., size=x)/np.sqrt(np.prod(x))
+  return ret.astype(np.float32)
+
+def swish(x: Tensor) -> Tensor:
+  return x.mul(x.sigmoid())
+
+class BatchNorm2D:
+  """
+  __call__ follows the formula from the link below
+  pytorch version: https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html
+
+  self.weight = γ
+  self.bias = β
+  self.running_mean = E[x]
+  self.running_var = Var[x]
+
+  the reshaping step ensures that each channel of the input has its
+  own separate set of parameters (mean, variance, weight, and bias)
+
+  self.running_mean has shape [num_channels].
+  self.running_mean.reshape(shape=[1, -1, 1, 1]) reshapes it to [1, num_channels, 1, 1]
+  """
+  def __init__(self, sz: int, eps: float = 0.001) -> None:
+    self.eps = eps
+    self.weight = Tensor.zeros(sz)
+    self.bias = Tensor.zeros(sz)
+
+    # TODO: need running_mean and running_var
+    self.running_mean = Tensor.zeros(sz)
+    self.running_var = Tensor.zeros(sz)
+    self.num_batches_tracked = Tensor.zeros(1)
+
+  def __call__(self, x: Tensor) -> Tensor:
+    x = x.sub(self.running_mean.reshape(shape=[1, -1, 1, 1]))
+    x = x.mul(self.weight.reshape(shape=[1, -1, 1, 1]))
+    x = x.div(self.running_var.add(Tensor([self.eps], gpu=x.gpu)).reshape(shape=[1, -1, 1, 1]).sqrt())
+    x = x.add(self.bias.reshape(shape=[1, -1, 1, 1]))
+    return x