froog 0.4.2__py3-none-any.whl → 0.5.1__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- froog/__init__.py +34 -1
- froog/{gradcheck.py → gradient.py} +4 -11
- froog/ops.py +307 -114
- froog/optim.py +104 -32
- froog/tensor.py +219 -219
- froog/utils.py +8 -7
- froog-0.5.1.dist-info/METADATA +205 -0
- froog-0.5.1.dist-info/RECORD +10 -0
- froog/nn.py +0 -60
- froog/ops_gpu.py +0 -598
- froog-0.4.2.dist-info/LICENSE +0 -1
- froog-0.4.2.dist-info/METADATA +0 -233
- froog-0.4.2.dist-info/RECORD +0 -13
- {froog-0.4.2.dist-info → froog-0.5.1.dist-info}/WHEEL +0 -0
- {froog-0.4.2.dist-info → froog-0.5.1.dist-info}/top_level.txt +0 -0
froog/ops.py
CHANGED
@@ -7,9 +7,9 @@
 # |___| |___| |_||_______||_______||_______|

 import numpy as np
-from
+from typing import Tuple, Union, Optional, Any, Callable
+from froog.tensor import Function, register, Tensor
 from froog.utils import im2col, col2im
-from froog.tensor import Tensor

 # *****************************************************
 # ____ ___ _____ __________ ____ ____ _____
@@ -23,32 +23,74 @@ from froog.tensor import Tensor

 class Add(Function):# x.add(y)
   @staticmethod # @staticmethod doesn't require an instance of Add to work, so you can do x.add(y)
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    # Check if we have GPU buffers
+    is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+    if is_metal_buffer(x) or is_metal_buffer(y):
+      # Import get_buffer_data helper for Metal buffers
+      try:
+        from froog.gpu.buffer_utils import get_buffer_data
+        x_data = get_buffer_data(x)
+        y_data = get_buffer_data(y)
+        ctx.save_for_backward(x_data, y_data)
+        return x_data + y_data
+      except ImportError:
+        print("Warning: buffer_utils not available")
+        # Fall back to regular implementation
+        ctx.save_for_backward(x, y)
+        return x + y
+
+    # Regular implementation
+    ctx.save_for_backward(x, y)
     return x + y

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     return grad_output, grad_output
 register("add", Add)

 class Sub(Function): # x.sub(y)
   @staticmethod
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
     return x-y

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     return grad_output, -grad_output
 register('sub', Sub)

 class Mul(Function): # x.mul(y)
   @staticmethod
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    # Check if we have GPU buffers
+    is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+    if is_metal_buffer(x) or is_metal_buffer(y):
+      # Import get_buffer_data helper for Metal buffers
+      try:
+        from froog.gpu.buffer_utils import get_buffer_data, buffer_mul
+        x_data = get_buffer_data(x)
+        y_data = get_buffer_data(y)
+        ctx.save_for_backward(x_data, y_data)
+        return buffer_mul(x, y)
+      except Exception as e:
+        print(f"Error in Mul.forward with buffer: {e}")
+        # Fall back to CPU implementation if buffer handling fails
+        from froog.gpu import get_device
+        device = get_device()
+        if device:
+          x_cpu = device.download_tensor(x) if is_metal_buffer(x) else x
+          y_cpu = device.download_tensor(y) if is_metal_buffer(y) else y
+          ctx.save_for_backward(x_cpu, y_cpu)
+          result = x_cpu * y_cpu
+          return device.upload_tensor(result)
+        raise
+
+    # Standard CPU implementation
     ctx.save_for_backward(x, y)
     return x * y

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     x, y = ctx.saved_tensors
     return y * grad_output, x * grad_output
 register("mul", Mul)
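The Metal-buffer handling added to Add and Mul above (and to Sum below) follows one pattern: detect a device buffer, try the froog.gpu helpers, otherwise fall back to NumPy. As an aside, and not part of the package itself, here is a minimal NumPy-only sketch of that pattern with the device path stubbed out (the froog.gpu helpers are assumed, not reproduced):

import numpy as np

def looks_like_gpu_buffer(obj) -> bool:
  # same heuristic the diff uses: PyObjC-backed objects, or a type name containing "Buffer"
  return hasattr(obj, "__pyobjc_object__") or "Buffer" in str(type(obj))

def add_with_fallback(x, y):
  # a real implementation would download device buffers first; this sketch only stubs that path
  if looks_like_gpu_buffer(x) or looks_like_gpu_buffer(y):
    raise NotImplementedError("device path omitted in this sketch")
  return np.asarray(x) + np.asarray(y)

print(add_with_fallback(np.ones(3), np.arange(3.0)))  # [1. 2. 3.]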
@@ -59,24 +101,49 @@ class Sum(Function): # x.sum()
   reduces its input tensor to a single value by summing all the elements
   """
   @staticmethod
-  def forward(ctx, input):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
+    # Check if we have a GPU buffer
+    is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+    if is_metal_buffer(input):
+      # Use buffer utilities
+      try:
+        from froog.gpu.buffer_utils import get_buffer_data, buffer_sum
+        input_data = get_buffer_data(input)
+        ctx.save_for_backward(input_data)
+        ctx.input_shape = input_data.shape
+        return buffer_sum(input)
+      except Exception as e:
+        print(f"Error in Sum.forward with buffer: {e}")
+        # Fall back to CPU implementation
+        from froog.gpu import get_device
+        device = get_device()
+        if device:
+          input_cpu = device.download_tensor(input)
+          ctx.save_for_backward(input_cpu)
+          ctx.input_shape = input_cpu.shape
+          result = np.array([np.sum(input_cpu)])
+          return device.upload_tensor(result)
+        raise
+
+    # Standard CPU implementation
     ctx.save_for_backward(input)
-
+    ctx.input_shape = input.shape
+    return np.array([np.sum(input)])

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     (input,) = ctx.saved_tensors
     return grad_output * np.ones_like(input)
 register("sum", Sum)

 class Pow(Function): # x.pow(y)
   @staticmethod
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(x, y)
     return x ** y

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     x, y = ctx.saved_tensors
     return y * (x**(y-1.0)) * grad_output, (x**y) * np.log(x) * grad_output # power rule, d/dx (y^x)
 register("pow", Pow)
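As a side note, the Pow backward rule above can be verified with a finite-difference check in plain NumPy (arbitrary values, independent of froog's own gradient-checking helpers):

import numpy as np

x = np.array([1.5, 2.0, 0.7])
y = np.array([2.0, 3.0, 1.5])
grad_output = np.ones_like(x)

# analytic gradients, exactly as Pow.backward computes them
gx = y * (x**(y-1.0)) * grad_output
gy = (x**y) * np.log(x) * grad_output

# central finite differences
eps = 1e-6
num_gx = ((x+eps)**y - (x-eps)**y) / (2*eps)
num_gy = (x**(y+eps) - x**(y-eps)) / (2*eps)

assert np.allclose(gx, num_gx, atol=1e-4)
assert np.allclose(gy, num_gy, atol=1e-4)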
@@ -93,16 +160,65 @@ register("pow", Pow)

 class Dot(Function): # x.dot(y)
   @staticmethod
-  def forward(ctx, input, weight):
+  def forward(ctx: Any, input: np.ndarray, weight: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(input, weight)
-
+
+    # Check if we're working with GPU buffers
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      # Convert any GPU buffers to CPU for the operation
+      if is_buffer(input):
+        input_cpu = download_tensor(input)
+      else:
+        input_cpu = input
+
+      if is_buffer(weight):
+        weight_cpu = download_tensor(weight)
+      else:
+        weight_cpu = weight
+
+      return input_cpu.dot(weight_cpu)
+    except Exception as e:
+      import traceback
+      print(f"Error in dot operation: {str(e)}")
+      print(f" Self: {input}")
+      print(f" Arg 0: {weight}")
+      print(f" Kwargs: {{}}")
+      traceback.print_exc()
+      # Try the original method as fallback
+      return input.dot(weight)

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     input, weight = ctx.saved_tensors
-
-
-
+
+    # Convert GPU buffers to CPU if needed
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      if is_buffer(input):
+        input_cpu = download_tensor(input)
+      else:
+        input_cpu = input
+
+      if is_buffer(weight):
+        weight_cpu = download_tensor(weight)
+      else:
+        weight_cpu = weight
+
+      if is_buffer(grad_output):
+        grad_output_cpu = download_tensor(grad_output)
+      else:
+        grad_output_cpu = grad_output
+
+      return grad_output_cpu.dot(weight_cpu.T), input_cpu.T.dot(grad_output_cpu)
+    except Exception as e:
+      print(f"Error in dot backward: {str(e)}")
+      # Fallback
+      return grad_output.dot(weight.T), input.T.dot(grad_output)
 register('dot', Dot)
 register('matmul', Dot)

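Dot.backward above uses the standard matmul gradients. A quick NumPy shape check with made-up sizes, outside the diff itself, confirms they match the forward operands:

import numpy as np

inp = np.random.randn(4, 3).astype(np.float32)   # (batch, in_features)
w = np.random.randn(3, 5).astype(np.float32)     # (in_features, out_features)
grad_out = np.ones((4, 5), dtype=np.float32)     # same shape as the forward output

d_inp = grad_out.dot(w.T)   # (4, 3), same shape as inp
d_w = inp.T.dot(grad_out)   # (3, 5), same shape as w
assert d_inp.shape == inp.shape and d_w.shape == w.shape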
@@ -118,12 +234,12 @@ register('matmul', Dot)

 class ReLU(Function):
   @staticmethod
-  def forward(ctx, input):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(input)
     return np.maximum(input, 0) # relu(x) = max(0,x)

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     input, = ctx.saved_tensors
     grad_input = grad_output * (input >= 0)
     return grad_input
@@ -131,49 +247,61 @@ register("relu", ReLU)

 class Sigmoid(Function):
   @staticmethod
-  def forward(ctx, input):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(input)
     ret = 1/(1 + np.exp(-input)) # sigmoid(x) = 1 / (1 + exp(-x))
     return ret

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     ret, = ctx.saved_tensors
     grad_input = grad_output * (ret * (1 - ret)) # just take the derivative of sigmoid
     return grad_input
 register("sigmoid", Sigmoid)

-
-
-
-
-
-
-
-
-
-
-#
-
-
-
-
-
-
-
-
-
-
-#
+class DropoutLayer:
+  """
+  Dropout layer that randomly sets a fraction of input units to 0 during training time.
+  pytorch version: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
+  """
+  def __init__(self, p: float = 0.5) -> None:
+    self.p = p
+    self.training = True
+
+  def __call__(self, x):
+    # build a CPU-side random mask of the same shape as the tensor x
+    mask_np = (np.random.rand(*x.shape) >= self.p).astype(np.float32) / (1.0 - self.p)
+    from froog.tensor import Tensor
+    mask_t = Tensor(mask_np)
+    if getattr(x, "is_gpu", False): mask_t = mask_t.to_gpu()
+    return x.mul(mask_t)
+
+class Dropout(Function):
+  @staticmethod
+  def forward(ctx: Any, input: np.ndarray, p: float = 0.5, training: bool = True) -> np.ndarray:
+    if not training: return input
+    # create a binary mask with probability (1-p) of being 1
+    # scale by 1/(1-p) to keep expectation same
+    ctx.training = training
+    mask = (np.random.rand(*input.shape) >= p).astype(np.float32) / (1.0 - p if p < 1.0 else 1e-9) # avoid division by zero if p is 1.0
+    ctx.save_for_backward(mask)
+    return input * mask
+
+  @staticmethod
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
+    if not ctx.training: return grad_output
+    mask, = ctx.saved_tensors
+    return grad_output * mask
+register("dropout", Dropout)

 class Reshape(Function):
   @staticmethod
-  def forward(ctx, x, shape):
+  def forward(ctx: Any, x: np.ndarray, shape: Tuple[int, ...]) -> np.ndarray:
     ctx.save_for_backward(x.shape)
     return x.reshape(shape)

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     in_shape, = ctx.saved_tensors
     return grad_output.reshape(in_shape)
 register('reshape', Reshape)
@@ -183,11 +311,13 @@ class Pad2D(Function):
   The first element (0,0) corresponds to padding along the batch dimension, which indicates no padding on both sides (0 elements added).
   """
   @staticmethod
-  def forward(ctx, x, padding=None):
+  def forward(ctx: Any, x: np.ndarray, padding: Optional[Tuple[int, int, int, int]] = None) -> np.ndarray:
+    if padding is None:
+      padding = (0, 0, 0, 0)
     return np.pad(x, ((0,0), (0,0), (padding[0], padding[1]), (padding[2], padding[3]))) # (top, bottom, left, right)

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     raise Exception("write this")
 register('pad2d', Pad2D)

@@ -197,8 +327,8 @@ class LogSoftmax(Function):
   probabilities of each value are proportional to the scale of each value
   """
   @staticmethod
-  def forward(ctx, input):
-    def logsumexp(x):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
+    def logsumexp(x: np.ndarray) -> np.ndarray:
       c = x.max(axis=1)
       return c + np.log(np.exp(x - c.reshape((-1, 1))).sum(axis=1)) # axis=1 refers to the columns

@@ -207,7 +337,7 @@ class LogSoftmax(Function):
     return output

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     (output,) = ctx.saved_tensors
     return grad_output - np.exp(output)*(grad_output.sum(axis=1).reshape((-1, 1)))
 register("logsoftmax", LogSoftmax)
@@ -224,62 +354,123 @@ register("logsoftmax", LogSoftmax)

 class Conv2D(Function): # TODO: understand group splits
   @staticmethod
-  def forward(ctx, x, w, stride=1, groups=1):
+  def forward(ctx: Any, x: np.ndarray, w: np.ndarray, stride: Union[int, Tuple[int, int]] = 1, groups: int = 1) -> np.ndarray:
     """
     https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
     WARNING: doesn't handle padding or strides yet
     Args:
-      x.shape[0]
-      cout
-      x.shape[2]-(H-1)
-      x.shape[3]-(W-1)
+      x.shape[0] --> number of input examples (batch size)
+      cout --> number of output channels
+      x.shape[2]-(H-1) --> non-padded height of conv output, need to subtract because this is an unpadded conv
+      x.shape[3]-(W-1) --> width of output
     Shape:
       (a, b, c, d)(e, f, g, h) --> (a, e, c-(g-1), d-(h-1))
       in general, output x and y = [(W−K+2P)/S]+1
     """
-
+    ctx.stride = stride
+    ctx.groups = groups
+
+    if isinstance(ctx.stride, int): # ctx stores function params
       ctx.stride = (ctx.stride, ctx.stride)

-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    # Check if we're working with GPU buffers and convert to CPU
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      # Convert input to CPU if it's a GPU buffer
+      if is_buffer(x):
+        x_cpu = download_tensor(x)
+      else:
+        x_cpu = x
+
+      # Convert weight to CPU if it's a GPU buffer
+      if is_buffer(w):
+        w_cpu = download_tensor(w)
+      else:
+        w_cpu = w
+
+      # Now use the CPU tensors for the rest of the computation
+      cout, cin, H, W = w_cpu.shape
+
+      tw = w_cpu.reshape(cout, -1).T # slice of kernel
+      y_stride, x_stride = ctx.stride
+
+      bs,cin_,oy,ox = x_cpu.shape[0], x_cpu.shape[1], (x_cpu.shape[2]-(H-y_stride))//y_stride, (x_cpu.shape[3]-(W-x_stride))//x_stride
+      assert cin*ctx.groups == cin_ # ensures that the channel dimensions match appropriately for grouping
+      assert cout % ctx.groups == 0 # ensures that the number of output channels can be evenly divided among the groups
+      g_w_chans = cout//ctx.groups # number of output channels per group
+
+      ctx.save_for_backward(x_cpu, w_cpu)
+      ret = np.zeros((bs, cout, oy, ox), dtype=w_cpu.dtype)
+
+      for g in range(ctx.groups):
+        tw = w_cpu[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1).T # transformed kernel weights
+        for Y in range(oy):
+          for X in range(ox):
+            iY,iX = Y*y_stride, X*x_stride
+            tx = x_cpu[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(bs, -1)
+            ret[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] += tx.dot(tw)
+      return ret
+    except Exception as e:
+      import traceback
+      print(f"Error in conv2d operation: {str(e)}")
+      print(f" Self: {x}")
+      print(f" Arg 0: {w}")
+      print(f" Kwargs: {{stride: {stride}, groups: {groups}}}")
+      traceback.print_exc()
+      raise

   @staticmethod
-  def backward(ctx, grad_output):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      # Convert grad_output to CPU if it's a GPU buffer
+      if is_buffer(grad_output):
+        grad_output_cpu = download_tensor(grad_output)
+      else:
+        grad_output_cpu = grad_output
+
+      x, w = ctx.saved_tensors
+      cout, cin, H, W = w.shape
+      dx, dw = np.zeros_like(x), np.zeros_like(w)
+      y_stride, x_stride = ctx.stride
+      g_w_chans = cout//ctx.groups
+
+      for g in range(ctx.groups):
+        tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1)
+        for Y in range(grad_output_cpu.shape[2]):
+          for X in range(grad_output_cpu.shape[3]):
+            iY,iX = Y*y_stride, X*x_stride
+            gg = grad_output_cpu[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] # current multiply element in chain rule
+            tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(x.shape[0], -1) # slice of tensor at current conv op
+            dw[g*g_w_chans:(g*g_w_chans+g_w_chans)] += gg.T.dot(tx).reshape((g_w_chans,cin,H,W)) # gradient with respect to input
+            dx[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W] += gg.dot(tw).reshape(dx.shape[0], cin, H, W) # accumulate gradient with respect to weights
+      return dx, dw
+    except Exception as e:
+      import traceback
+      print(f"Error in conv2d backward: {str(e)}")
+      print(f" Grad Output: {grad_output}")
+      traceback.print_exc()
+      # Fallback to original implementation
+      x, w = ctx.saved_tensors
+      cout, cin, H, W = w.shape
+      dx, dw = np.zeros_like(x), np.zeros_like(w)
+      y_stride, x_stride = ctx.stride
+      g_w_chans = cout//ctx.groups
+
+      for g in range(ctx.groups):
+        tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1)
+        for Y in range(grad_output.shape[2]):
+          for X in range(grad_output.shape[3]):
+            iY,iX = Y*y_stride, X*x_stride
+            gg = grad_output[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] # current multiply element in chain rule
+            tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(x.shape[0], -1) # slice of tensor at current conv op
+            dw[g*g_w_chans:(g*g_w_chans+g_w_chans)] += gg.T.dot(tx).reshape((g_w_chans,cin,H,W)) # gradient with respect to input
+            dx[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W] += gg.dot(tw).reshape(dx.shape[0], cin, H, W) # accumulate gradient with respect to weights
+      return dx, dw
 register('conv2d', Conv2D)


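The output-size rule quoted in the Conv2D docstring, out = (W - K + 2P)/S + 1, can be sanity-checked against the shapes the forward pass computes. A small helper with made-up sizes, outside the diff (the op itself only handles the unpadded case, P = 0):

def conv_out_size(w: int, k: int, p: int = 0, s: int = 1) -> int:
  return (w - k + 2*p) // s + 1

print(conv_out_size(28, 3))       # 26, i.e. x.shape[2] - (H-1) for a 3x3 kernel, stride 1
print(conv_out_size(28, 3, s=2))  # 13, matching (x.shape[2] - (H - stride)) // stride above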
@@ -290,7 +481,7 @@ class im2ColConv(Function):
   """

   @staticmethod
-  def forward(ctx, x, w):
+  def forward(ctx: Any, x: np.ndarray, w: np.ndarray) -> np.ndarray:
     cout, cin, k_h, k_x = w.shape
     bs, oy, ox = x.shape[0], x.shape[2]-(k_h-1), x.shape[3]-(k_x-1)
     tw = w.reshape(cout, -1).T # each filter flattened into a row
@@ -300,7 +491,7 @@ class im2ColConv(Function):
     return np.moveaxis(ret, [0,1,2,3], [0,2,3,1]) # reorders the axes (batch size, number of channels, height, width)

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     bs,_,oy,ox = grad_output.shape
     tx, w = ctx.saved_tensors # transformed input, filter weights
     cout,cin,H,W = w.shape
@@ -322,7 +513,7 @@ register('im2col2dconv', im2ColConv)
 #
 # **************** pooling ops ***************

-def stack_for_pool(x, pool_y, pool_x):
+def stack_for_pool(x: np.ndarray, pool_y: int, pool_x: int) -> np.ndarray:
   my, mx = (x.shape[2]//pool_y)*pool_y, (x.shape[3]//pool_x)*pool_x # ensures input tensor can be evenly divided into 2x2 blocks for max pooling
   stack = []
   cropped_x = x[:, :, :my, :mx] # crop input so 2x2 max pool can be taken
@@ -332,47 +523,49 @@ def stack_for_pool(x, pool_y, pool_x):
   return np.concatenate(stack, axis=0) # put all into one row


-def unstack_for_pool(fxn, s, py, px):
+def unstack_for_pool(fxn: Callable[[int], np.ndarray], s: Tuple[int, ...], py: int, px: int) -> np.ndarray:
   max_y, max_x = (s[2]//py)*py, (s[3]//px)*px # get shape that allows (pool_size_y,pool_size_x) max pool
+  ret = None
   for Y in range(py):
     for X in range(px):
       level_w_new_grad = fxn(Y*px+X)
       if X == 0 and Y == 0: # pool of zero size
         ret = np.zeros(s, dtype=level_w_new_grad.dtype)
-      ret
-
+      if ret is not None:
+        ret[:, :, Y:max_y:py, X:max_x:px] = level_w_new_grad
+  return ret if ret is not None else np.zeros(s)

 class MaxPool2D(Function):
   @staticmethod
-  def forward(ctx, x, kernel_size=(2,2)):
+  def forward(ctx: Any, x: np.ndarray, kernel_size: Tuple[int, int] = (2,2)) -> np.ndarray:
+    ctx.kernel_size = kernel_size
     stack = stack_for_pool(x, *kernel_size)
     idx_of_max = np.argmax(stack, axis=0)
     ctx.save_for_backward(idx_of_max, x.shape)
     return np.max(stack, axis=0)

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     """
     Distributes the gradient from the output of the max pooling layer to its inputs
     The purpose of (idxs == idx) is to generate a boolean mask indicating the locations of the maximum values in each 2x2 block of the original input
     The expression (Y*2+X) is a way to iterate through the four possible positions within the kernel block: e.g. (0,0), (0,1), (1,0), and (1,1), which get mapped to the indices 0, 1, 2, and 3
     """
-    idxs, s = ctx.saved_tensors
-    return unstack_for_pool(lambda idx: grad_output * (idxs == idx),
-                            s,
-                            *ctx.kernel_size)
+    idxs, s = ctx.saved_tensors
+    return unstack_for_pool(lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size)
 register('max_pool2d', MaxPool2D)

 class AvgPool2D(Function):
   @staticmethod
-  def forward(ctx, x, kernel_size=(2,2)):
+  def forward(ctx: Any, x: np.ndarray, kernel_size: Tuple[int, int] = (2,2)) -> np.ndarray:
+    ctx.kernel_size = kernel_size
     stack = stack_for_pool(x, *kernel_size)
     ctx.save_for_backward(x.shape)
     return np.mean(stack, axis=0)

   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     s, = ctx.saved_tensors
     py, px = ctx.kernel_size # kernel_size passed from forward context
     my, mx = (s[2]//py)*py, (s[3]//px)*px
@@ -392,12 +585,12 @@ register('avg_pool2d', AvgPool2D)
 #
 # ************* nn ops ************

-def Linear(*x):
+def Linear(*x: int) -> np.ndarray:
   # random Glorot initialization
   ret = np.random.uniform(-1., 1., size=x)/np.sqrt(np.prod(x))
   return ret.astype(np.float32)

-def swish(x):
+def swish(x: Tensor) -> Tensor:
   return x.mul(x.sigmoid())

 class BatchNorm2D:
@@ -416,7 +609,7 @@ class BatchNorm2D:
   self.running_mean has shape [num_channels].
   self.running_mean.reshape(shape=[1, -1, 1, 1]) reshapes it to [1, num_channels, 1, 1]
   """
-  def __init__(self, sz, eps=0.001):
+  def __init__(self, sz: int, eps: float = 0.001) -> None:
     self.eps = eps
     self.weight = Tensor.zeros(sz)
     self.bias = Tensor.zeros(sz)
@@ -426,7 +619,7 @@ class BatchNorm2D:
     self.running_var = Tensor.zeros(sz)
     self.num_batches_tracked = Tensor.zeros(1)

-  def __call__(self, x):
+  def __call__(self, x: Tensor) -> Tensor:
     x = x.sub(self.running_mean.reshape(shape=[1, -1, 1, 1]))
     x = x.mul(self.weight.reshape(shape=[1, -1, 1, 1]))
     x = x.div(self.running_var.add(Tensor([self.eps], gpu=x.gpu)).reshape(shape=[1, -1, 1, 1]).sqrt())