froog 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
froog/ops.py CHANGED
@@ -7,7 +7,8 @@
  # |___| |___| |_||_______||_______||_______|

  import numpy as np
- from froog.tensor import Function, register
+ from typing import Tuple, Union, Optional, Any, Callable
+ from froog.tensor import Function, register, Tensor
  from froog.utils import im2col, col2im

  # *****************************************************
@@ -22,32 +23,74 @@ from froog.utils import im2col, col2im

  class Add(Function):# x.add(y)
    @staticmethod # @staticmethod doesn't require an instance of Add to work, so you can do x.add(y)
-   def forward(ctx, x, y):
+   def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+     # Check if we have GPU buffers
+     is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+     if is_metal_buffer(x) or is_metal_buffer(y):
+       # Import get_buffer_data helper for Metal buffers
+       try:
+         from froog.gpu.buffer_utils import get_buffer_data
+         x_data = get_buffer_data(x)
+         y_data = get_buffer_data(y)
+         ctx.save_for_backward(x_data, y_data)
+         return x_data + y_data
+       except ImportError:
+         print("Warning: buffer_utils not available")
+         # Fall back to regular implementation
+         ctx.save_for_backward(x, y)
+         return x + y
+
+     # Regular implementation
+     ctx.save_for_backward(x, y)
      return x + y

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
      return grad_output, grad_output
  register("add", Add)

  class Sub(Function): # x.sub(y)
    @staticmethod
-   def forward(ctx, x, y):
+   def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
      return x-y

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
      return grad_output, -grad_output
  register('sub', Sub)

  class Mul(Function): # x.mul(y)
    @staticmethod
-   def forward(ctx, x, y):
+   def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+     # Check if we have GPU buffers
+     is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+     if is_metal_buffer(x) or is_metal_buffer(y):
+       # Import get_buffer_data helper for Metal buffers
+       try:
+         from froog.gpu.buffer_utils import get_buffer_data, buffer_mul
+         x_data = get_buffer_data(x)
+         y_data = get_buffer_data(y)
+         ctx.save_for_backward(x_data, y_data)
+         return buffer_mul(x, y)
+       except Exception as e:
+         print(f"Error in Mul.forward with buffer: {e}")
+         # Fall back to CPU implementation if buffer handling fails
+         from froog.gpu import get_device
+         device = get_device()
+         if device:
+           x_cpu = device.download_tensor(x) if is_metal_buffer(x) else x
+           y_cpu = device.download_tensor(y) if is_metal_buffer(y) else y
+           ctx.save_for_backward(x_cpu, y_cpu)
+           result = x_cpu * y_cpu
+           return device.upload_tensor(result)
+         raise
+
+     # Standard CPU implementation
      ctx.save_for_backward(x, y)
      return x * y

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
      x, y = ctx.saved_tensors
      return y * grad_output, x * grad_output
  register("mul", Mul)
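
The GPU dispatch added to Add and Mul above is a detect-then-fallback pattern: probe whether an argument is a Metal buffer, try the GPU helpers, and drop back to plain NumPy if they are unavailable. A minimal self-contained sketch of that pattern (`add_with_fallback` is an illustrative name, not froog API):

  import numpy as np

  def is_metal_buffer(obj) -> bool:
      # mirrors the check in ops.py: a PyObjC handle or a Buffer-ish type name
      return hasattr(obj, '__pyobjc_object__') or 'Buffer' in str(type(obj))

  def add_with_fallback(x, y):
      if is_metal_buffer(x) or is_metal_buffer(y):
          try:
              from froog.gpu.buffer_utils import get_buffer_data
              return get_buffer_data(x) + get_buffer_data(y)
          except ImportError:
              pass  # fall through to the CPU path
      return np.asarray(x) + np.asarray(y)
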
@@ -58,24 +101,49 @@ class Sum(Function): # x.sum()
    reduces its input tensor to a single value by summing all the elements
    """
    @staticmethod
-   def forward(ctx, input):
+   def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
+     # Check if we have a GPU buffer
+     is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+     if is_metal_buffer(input):
+       # Use buffer utilities
+       try:
+         from froog.gpu.buffer_utils import get_buffer_data, buffer_sum
+         input_data = get_buffer_data(input)
+         ctx.save_for_backward(input_data)
+         ctx.input_shape = input_data.shape
+         return buffer_sum(input)
+       except Exception as e:
+         print(f"Error in Sum.forward with buffer: {e}")
+         # Fall back to CPU implementation
+         from froog.gpu import get_device
+         device = get_device()
+         if device:
+           input_cpu = device.download_tensor(input)
+           ctx.save_for_backward(input_cpu)
+           ctx.input_shape = input_cpu.shape
+           result = np.array([np.sum(input_cpu)])
+           return device.upload_tensor(result)
+         raise
+
+     # Standard CPU implementation
      ctx.save_for_backward(input)
-     return np.array([input.sum()])
+     ctx.input_shape = input.shape
+     return np.array([np.sum(input)])

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
      (input,) = ctx.saved_tensors
      return grad_output * np.ones_like(input)
  register("sum", Sum)

  class Pow(Function): # x.pow(y)
    @staticmethod
-   def forward(ctx, x, y):
+   def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
      ctx.save_for_backward(x, y)
      return x ** y

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
      x, y = ctx.saved_tensors
      return y * (x**(y-1.0)) * grad_output, (x**y) * np.log(x) * grad_output # power rule: d/dx (x^y) and d/dy (x^y)
  register("pow", Pow)
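
Pow.backward returns the two gradients d/dx (x^y) = y*x^(y-1) and d/dy (x^y) = x^y*ln(x). A quick finite-difference check with plain NumPy:

  import numpy as np

  x, y, eps = 2.0, 3.0, 1e-6
  analytic_dx = y * x**(y - 1.0)               # 12.0
  analytic_dy = x**y * np.log(x)               # 8*ln(2) ~ 5.545
  numeric_dx = ((x + eps)**y - x**y) / eps
  numeric_dy = (x**(y + eps) - x**y) / eps
  assert abs(analytic_dx - numeric_dx) < 1e-4
  assert abs(analytic_dy - numeric_dy) < 1e-4
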
@@ -92,16 +160,65 @@ register("pow", Pow)

  class Dot(Function): # x.dot(y)
    @staticmethod
-   def forward(ctx, input, weight):
+   def forward(ctx: Any, input: np.ndarray, weight: np.ndarray) -> np.ndarray:
      ctx.save_for_backward(input, weight)
-     return input.dot(weight)
+
+     # Check if we're working with GPU buffers
+     try:
+       from froog.tensor import is_buffer
+       from froog.gpu import download_tensor
+
+       # Convert any GPU buffers to CPU for the operation
+       if is_buffer(input):
+         input_cpu = download_tensor(input)
+       else:
+         input_cpu = input
+
+       if is_buffer(weight):
+         weight_cpu = download_tensor(weight)
+       else:
+         weight_cpu = weight
+
+       return input_cpu.dot(weight_cpu)
+     except Exception as e:
+       import traceback
+       print(f"Error in dot operation: {str(e)}")
+       print(f"  Self: {input}")
+       print(f"  Arg 0: {weight}")
+       print(f"  Kwargs: {{}}")
+       traceback.print_exc()
+       # Try the original method as fallback
+       return input.dot(weight)

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
      input, weight = ctx.saved_tensors
-     grad_input = grad_output.dot(weight.T)
-     grad_weight = input.T.dot(grad_output)
-     return grad_input, grad_weight
+
+     # Convert GPU buffers to CPU if needed
+     try:
+       from froog.tensor import is_buffer
+       from froog.gpu import download_tensor
+
+       if is_buffer(input):
+         input_cpu = download_tensor(input)
+       else:
+         input_cpu = input
+
+       if is_buffer(weight):
+         weight_cpu = download_tensor(weight)
+       else:
+         weight_cpu = weight
+
+       if is_buffer(grad_output):
+         grad_output_cpu = download_tensor(grad_output)
+       else:
+         grad_output_cpu = grad_output
+
+       return grad_output_cpu.dot(weight_cpu.T), input_cpu.T.dot(grad_output_cpu)
+     except Exception as e:
+       print(f"Error in dot backward: {str(e)}")
+       # Fallback
+       return grad_output.dot(weight.T), input.T.dot(grad_output)
  register('dot', Dot)
  register('matmul', Dot)

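Independent of the buffer handling, Dot.backward still returns the standard matmul gradients: grad_output·Wᵀ for the input and Xᵀ·grad_output for the weight. A shape sanity check:

  import numpy as np

  n, d, m = 4, 3, 2
  X, W = np.random.randn(n, d), np.random.randn(d, m)
  grad_output = np.ones((n, m))            # dL/dY for Y = X @ W
  grad_input  = grad_output @ W.T          # (n, m)·(m, d) -> (n, d)
  grad_weight = X.T @ grad_output          # (d, n)·(n, m) -> (d, m)
  assert grad_input.shape == X.shape and grad_weight.shape == W.shape
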
@@ -117,12 +234,12 @@ register('matmul', Dot)

  class ReLU(Function):
    @staticmethod
-   def forward(ctx, input):
+   def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
      ctx.save_for_backward(input)
      return np.maximum(input, 0) # relu(x) = max(0,x)

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
      input, = ctx.saved_tensors
      grad_input = grad_output * (input >= 0)
      return grad_input
@@ -130,26 +247,61 @@ register("relu", ReLU)

  class Sigmoid(Function):
    @staticmethod
-   def forward(ctx, input):
+   def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
      ctx.save_for_backward(input)
      ret = 1/(1 + np.exp(-input)) # sigmoid(x) = 1 / (1 + exp(-x))
      return ret

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
      ret, = ctx.saved_tensors
      grad_input = grad_output * (ret * (1 - ret)) # just take the derivative of sigmoid
      return grad_input
  register("sigmoid", Sigmoid)

+ class DropoutLayer:
+   """
+   Dropout layer that randomly sets a fraction of input units to 0 during training time.
+   pytorch version: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
+   """
+   def __init__(self, p: float = 0.5) -> None:
+     self.p = p
+     self.training = True
+
+   def __call__(self, x):
+     # build a CPU-side random mask of the same shape as the tensor x
+     mask_np = (np.random.rand(*x.shape) >= self.p).astype(np.float32) / (1.0 - self.p)
+     from froog.tensor import Tensor
+     mask_t = Tensor(mask_np)
+     if getattr(x, "is_gpu", False): mask_t = mask_t.to_gpu()
+     return x.mul(mask_t)
+
+ class Dropout(Function):
+   @staticmethod
+   def forward(ctx: Any, input: np.ndarray, p: float = 0.5, training: bool = True) -> np.ndarray:
+     if not training: return input
+     # create a binary mask with probability (1-p) of being 1
+     # scale by 1/(1-p) to keep the expectation the same
+     ctx.training = training
+     mask = (np.random.rand(*input.shape) >= p).astype(np.float32) / (1.0 - p if p < 1.0 else 1e-9) # avoid division by zero if p is 1.0
+     ctx.save_for_backward(mask)
+     return input * mask
+
+   @staticmethod
+   def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
+     if not ctx.training: return grad_output
+     mask, = ctx.saved_tensors
+     return grad_output * mask
+ register("dropout", Dropout)
+
  class Reshape(Function):
    @staticmethod
-   def forward(ctx, x, shape):
+   def forward(ctx: Any, x: np.ndarray, shape: Tuple[int, ...]) -> np.ndarray:
      ctx.save_for_backward(x.shape)
      return x.reshape(shape)

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
      in_shape, = ctx.saved_tensors
      return grad_output.reshape(in_shape)
  register('reshape', Reshape)
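
Both DropoutLayer and Dropout above use inverted dropout: surviving activations are scaled by 1/(1-p) at train time so the expected activation matches eval time, where dropout is a no-op. A quick check of that expectation:

  import numpy as np

  p = 0.5
  x = np.ones((1000, 1000), dtype=np.float32)
  mask = (np.random.rand(*x.shape) >= p).astype(np.float32) / (1.0 - p)
  print((x * mask).mean())  # ~ 1.0, same expectation as without dropout
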
@@ -159,11 +311,13 @@ class Pad2D(Function):
    The first element (0,0) corresponds to padding along the batch dimension, which indicates no padding on both sides (0 elements added).
    """
    @staticmethod
-   def forward(ctx, x, padding=None):
+   def forward(ctx: Any, x: np.ndarray, padding: Optional[Tuple[int, int, int, int]] = None) -> np.ndarray:
+     if padding is None:
+       padding = (0, 0, 0, 0)
      return np.pad(x, ((0,0), (0,0), (padding[0], padding[1]), (padding[2], padding[3]))) # (top, bottom, left, right)

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
      raise Exception("write this")
  register('pad2d', Pad2D)

@@ -173,8 +327,8 @@ class LogSoftmax(Function):
    probabilities of each value are proportional to the scale of each value
    """
    @staticmethod
-   def forward(ctx, input):
-     def logsumexp(x):
+   def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
+     def logsumexp(x: np.ndarray) -> np.ndarray:
        c = x.max(axis=1)
        return c + np.log(np.exp(x - c.reshape((-1, 1))).sum(axis=1)) # axis=1 refers to the columns

@@ -183,7 +337,7 @@ class LogSoftmax(Function):
      return output

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
      (output,) = ctx.saved_tensors
      return grad_output - np.exp(output)*(grad_output.sum(axis=1).reshape((-1, 1)))
  register("logsoftmax", LogSoftmax)
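
The logsumexp helper subtracts the per-row max before exponentiating: since log sum_i exp(x_i) = c + log sum_i exp(x_i - c) for any constant c, choosing c as the row max keeps every exponent non-positive and avoids overflow. For example:

  import numpy as np

  x = np.array([[1000.0, 1001.0, 1002.0]])
  with np.errstate(over='ignore'):
      naive = np.log(np.exp(x).sum(axis=1))        # overflows to inf
  c = x.max(axis=1)
  stable = c + np.log(np.exp(x - c.reshape((-1, 1))).sum(axis=1))
  print(naive, stable)                             # [inf] vs ~ [1002.4076]
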
@@ -200,62 +354,123 @@ register("logsoftmax", LogSoftmax)

  class Conv2D(Function): # TODO: understand group splits
    @staticmethod
-   def forward(ctx, x, w, stride=1, groups=1):
+   def forward(ctx: Any, x: np.ndarray, w: np.ndarray, stride: Union[int, Tuple[int, int]] = 1, groups: int = 1) -> np.ndarray:
      """
      https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
      WARNING: doesn't handle padding or strides yet
      Args:
-       x.shape[0] --> number of input examples (batch size)
-       cout --> number of output channels
-       x.shape[2]-(H-1) --> non-padded height of conv output, need to subtract because this is an unpadded conv
-       x.shape[3]-(W-1) --> width of output
+         x.shape[0] --> number of input examples (batch size)
+         cout --> number of output channels
+         x.shape[2]-(H-1) --> non-padded height of conv output, need to subtract because this is an unpadded conv
+         x.shape[3]-(W-1) --> width of output
      Shape:
        (a, b, c, d)(e, f, g, h) --> (a, e, c-(g-1), d-(h-1))
        in general, output x and y = [(W-K+2P)/S]+1
      """
-     if type(ctx.stride) == int: # ctx stores function params
+     ctx.stride = stride
+     ctx.groups = groups
+
+     if isinstance(ctx.stride, int): # ctx stores function params
        ctx.stride = (ctx.stride, ctx.stride)

-     cout, cin, H, W = w.shape
-
-     tw = w.reshape(cout, -1).T # slice of kernel
-     y_stride, x_stride = ctx.stride
-
-     bs,cin_,oy,ox = x.shape[0], x.shape[1], (x.shape[2]-(H-y_stride))//y_stride, (x.shape[3]-(W-x_stride))//x_stride
-     assert cin*ctx.groups == cin_ # ensures that the channel dimensions match appropriately for grouping
-     assert cout % ctx.groups == 0 # ensures that the number of output channels can be evenly divided among the groups
-     g_w_chans = cout//ctx.groups # number of output channels per group
-
-     ctx.save_for_backward(x, w)
-     ret = np.zeros((bs, cout, oy, ox), dtype=w.dtype)
-
-     for g in range(ctx.groups):
-       tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1).T # transformed kernel weights
-       for Y in range(oy):
-         for X in range(ox):
-           iY,iX = Y*y_stride, X*x_stride
-           tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(bs, -1)
-           ret[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] += tx.dot(tw)
-     return ret
+     # Check if we're working with GPU buffers and convert to CPU
+     try:
+       from froog.tensor import is_buffer
+       from froog.gpu import download_tensor
+
+       # Convert input to CPU if it's a GPU buffer
+       if is_buffer(x):
+         x_cpu = download_tensor(x)
+       else:
+         x_cpu = x
+
+       # Convert weight to CPU if it's a GPU buffer
+       if is_buffer(w):
+         w_cpu = download_tensor(w)
+       else:
+         w_cpu = w
+
+       # Now use the CPU tensors for the rest of the computation
+       cout, cin, H, W = w_cpu.shape
+
+       tw = w_cpu.reshape(cout, -1).T # slice of kernel
+       y_stride, x_stride = ctx.stride
+
+       bs,cin_,oy,ox = x_cpu.shape[0], x_cpu.shape[1], (x_cpu.shape[2]-(H-y_stride))//y_stride, (x_cpu.shape[3]-(W-x_stride))//x_stride
+       assert cin*ctx.groups == cin_ # ensures that the channel dimensions match appropriately for grouping
+       assert cout % ctx.groups == 0 # ensures that the number of output channels can be evenly divided among the groups
+       g_w_chans = cout//ctx.groups # number of output channels per group
+
+       ctx.save_for_backward(x_cpu, w_cpu)
+       ret = np.zeros((bs, cout, oy, ox), dtype=w_cpu.dtype)
+
+       for g in range(ctx.groups):
+         tw = w_cpu[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1).T # transformed kernel weights
+         for Y in range(oy):
+           for X in range(ox):
+             iY,iX = Y*y_stride, X*x_stride
+             tx = x_cpu[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(bs, -1)
+             ret[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] += tx.dot(tw)
+       return ret
+     except Exception as e:
+       import traceback
+       print(f"Error in conv2d operation: {str(e)}")
+       print(f"  Self: {x}")
+       print(f"  Arg 0: {w}")
+       print(f"  Kwargs: {{stride: {stride}, groups: {groups}}}")
+       traceback.print_exc()
+       raise

    @staticmethod
-   def backward(ctx, grad_output):
-     x, w = ctx.saved_tensors
-     cout, cin, H, W = w.shape
-     dx, dw = np.zeros_like(x), np.zeros_like(w)
-     y_stride, x_stride = ctx.stride
-     g_w_chans = cout//ctx.groups
-
-     for g in range(ctx.groups):
-       tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1)
-       for Y in range(grad_output.shape[2]):
-         for X in range(grad_output.shape[3]):
-           iY,iX = Y*y_stride, X*x_stride
-           gg = grad_output[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] # current multiply element in chain rule
-           tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(x.shape[0], -1) # slice of tensor at current conv op
-           dw[g*g_w_chans:(g*g_w_chans+g_w_chans)] += gg.T.dot(tx).reshape((g_w_chans,cin,H,W)) # accumulate gradient with respect to weights
-           dx[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W] += gg.dot(tw).reshape(dx.shape[0], cin, H, W) # gradient with respect to input
-     return dx, dw
+   def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+     try:
+       from froog.tensor import is_buffer
+       from froog.gpu import download_tensor
+
+       # Convert grad_output to CPU if it's a GPU buffer
+       if is_buffer(grad_output):
+         grad_output_cpu = download_tensor(grad_output)
+       else:
+         grad_output_cpu = grad_output
+
+       x, w = ctx.saved_tensors
+       cout, cin, H, W = w.shape
+       dx, dw = np.zeros_like(x), np.zeros_like(w)
+       y_stride, x_stride = ctx.stride
+       g_w_chans = cout//ctx.groups
+
+       for g in range(ctx.groups):
+         tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1)
+         for Y in range(grad_output_cpu.shape[2]):
+           for X in range(grad_output_cpu.shape[3]):
+             iY,iX = Y*y_stride, X*x_stride
+             gg = grad_output_cpu[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] # current multiply element in chain rule
+             tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(x.shape[0], -1) # slice of tensor at current conv op
+             dw[g*g_w_chans:(g*g_w_chans+g_w_chans)] += gg.T.dot(tx).reshape((g_w_chans,cin,H,W)) # accumulate gradient with respect to weights
+             dx[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W] += gg.dot(tw).reshape(dx.shape[0], cin, H, W) # gradient with respect to input
+       return dx, dw
+     except Exception as e:
+       import traceback
+       print(f"Error in conv2d backward: {str(e)}")
+       print(f"  Grad Output: {grad_output}")
+       traceback.print_exc()
+       # Fallback to original implementation
+       x, w = ctx.saved_tensors
+       cout, cin, H, W = w.shape
+       dx, dw = np.zeros_like(x), np.zeros_like(w)
+       y_stride, x_stride = ctx.stride
+       g_w_chans = cout//ctx.groups
+
+       for g in range(ctx.groups):
+         tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1)
+         for Y in range(grad_output.shape[2]):
+           for X in range(grad_output.shape[3]):
+             iY,iX = Y*y_stride, X*x_stride
+             gg = grad_output[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] # current multiply element in chain rule
+             tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(x.shape[0], -1) # slice of tensor at current conv op
+             dw[g*g_w_chans:(g*g_w_chans+g_w_chans)] += gg.T.dot(tx).reshape((g_w_chans,cin,H,W)) # accumulate gradient with respect to weights
+             dx[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W] += gg.dot(tw).reshape(dx.shape[0], cin, H, W) # gradient with respect to input
+       return dx, dw
  register('conv2d', Conv2D)

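The docstring above gives the general output-size formula, output = (W - K + 2P)/S + 1, which matches what the forward loop computes with P = 0. A quick shape check for an unpadded, stride-1 case (values are illustrative):

  import numpy as np

  bs, cin, H_in, W_in = 4, 3, 32, 32        # batch, channels, height, width
  cout, K, stride, pad = 8, 3, 1, 0         # filters, kernel, stride, padding
  out = (W_in - K + 2 * pad) // stride + 1  # (32 - 3 + 0)/1 + 1 = 30
  print((bs, cout, out, out))               # expected forward shape: (4, 8, 30, 30)
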
@@ -266,7 +481,7 @@ class im2ColConv(Function):
    """

    @staticmethod
-   def forward(ctx, x, w):
+   def forward(ctx: Any, x: np.ndarray, w: np.ndarray) -> np.ndarray:
      cout, cin, k_h, k_x = w.shape
      bs, oy, ox = x.shape[0], x.shape[2]-(k_h-1), x.shape[3]-(k_x-1)
      tw = w.reshape(cout, -1).T # each filter flattened into a row
@@ -276,7 +491,7 @@ class im2ColConv(Function):
      return np.moveaxis(ret, [0,1,2,3], [0,2,3,1]) # reorders the axes (batch size, number of channels, height, width)

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
      bs,_,oy,ox = grad_output.shape
      tx, w = ctx.saved_tensors # transformed input, filter weights
      cout,cin,H,W = w.shape
@@ -298,7 +513,7 @@ register('im2col2dconv', im2ColConv)
  #
  # **************** pooling ops ***************

- def stack_for_pool(x, pool_y, pool_x):
+ def stack_for_pool(x: np.ndarray, pool_y: int, pool_x: int) -> np.ndarray:
    my, mx = (x.shape[2]//pool_y)*pool_y, (x.shape[3]//pool_x)*pool_x # ensures input tensor can be evenly divided into 2x2 blocks for max pooling
    stack = []
    cropped_x = x[:, :, :my, :mx] # crop input so 2x2 max pool can be taken
@@ -308,47 +523,49 @@ def stack_for_pool(x, pool_y, pool_x):
    return np.concatenate(stack, axis=0) # put all into one row


- def unstack_for_pool(fxn, s, py, px):
+ def unstack_for_pool(fxn: Callable[[int], np.ndarray], s: Tuple[int, ...], py: int, px: int) -> np.ndarray:
    max_y, max_x = (s[2]//py)*py, (s[3]//px)*px # get shape that allows (pool_size_y,pool_size_x) max pool
+   ret = None
    for Y in range(py):
      for X in range(px):
        level_w_new_grad = fxn(Y*px+X)
        if X == 0 and Y == 0: # pool of zero size
          ret = np.zeros(s, dtype=level_w_new_grad.dtype)
-       ret[:, :, Y:max_y:py, X:max_x:px] = level_w_new_grad
-   return ret
+       if ret is not None:
+         ret[:, :, Y:max_y:py, X:max_x:px] = level_w_new_grad
+   return ret if ret is not None else np.zeros(s)


  class MaxPool2D(Function):
    @staticmethod
-   def forward(ctx, x, kernel_size=(2,2)):
+   def forward(ctx: Any, x: np.ndarray, kernel_size: Tuple[int, int] = (2,2)) -> np.ndarray:
+     ctx.kernel_size = kernel_size
      stack = stack_for_pool(x, *kernel_size)
      idx_of_max = np.argmax(stack, axis=0)
      ctx.save_for_backward(idx_of_max, x.shape)
      return np.max(stack, axis=0)

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
      """
      Distributes the gradient from the output of the max pooling layer to its inputs
      The purpose of (idxs == idx) is to generate a boolean mask indicating the locations of the maximum values in each 2x2 block of the original input
      The expression (Y*2+X) is a way to iterate through the four possible positions within the kernel block: e.g. (0,0), (0,1), (1,0), and (1,1), which get mapped to the indices 0, 1, 2, and 3
      """
-     idxs, s = ctx.saved_tensors
-     return unstack_for_pool(lambda idx: grad_output * (idxs == idx),
-                             s,
-                             *ctx.kernel_size)
+     idxs, s = ctx.saved_tensors
+     return unstack_for_pool(lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size)
  register('max_pool2d', MaxPool2D)

  class AvgPool2D(Function):
    @staticmethod
-   def forward(ctx, x, kernel_size=(2,2)):
+   def forward(ctx: Any, x: np.ndarray, kernel_size: Tuple[int, int] = (2,2)) -> np.ndarray:
+     ctx.kernel_size = kernel_size
      stack = stack_for_pool(x, *kernel_size)
      ctx.save_for_backward(x.shape)
      return np.mean(stack, axis=0)

    @staticmethod
-   def backward(ctx, grad_output):
+   def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
      s, = ctx.saved_tensors
      py, px = ctx.kernel_size # kernel_size passed from forward context
      my, mx = (s[2]//py)*py, (s[3]//px)*px
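
As the MaxPool2D.backward docstring above describes, each output gradient is routed to the argmax position of its window, and every other input in the window receives zero. A single-window NumPy illustration:

  import numpy as np

  x = np.array([[1., 5.],
                [3., 2.]])               # one 2x2 pooling window
  grad_out = 7.0                         # gradient at the pooled output
  mask = (x == x.max())                  # True only at the max (5)
  grad_in = grad_out * mask
  print(grad_in)                         # [[0., 7.], [0., 0.]]
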
@@ -358,3 +575,53 @@ class AvgPool2D(Function):
      ret[:, :, Y:my:py, X:mx:px] = grad_output / py / px # divide by avg of pool, e.g. for 2x2 pool /= 4
      return ret
  register('avg_pool2d', AvgPool2D)
+
+ # *************************************
+ #     _   ___   __  ____  ____  _____
+ #    / | / / | / / / __ \/ __ \/ ___/
+ #   /  |/ / |/ / / / / / /_/ /\__ \
+ #  / /|  / /| / / /_/ / ____/___/ /
+ # /_/ |_/_/ |_/  \____/_/    /____/
+ #
+ # ************* nn ops ************
+
+ def Linear(*x: int) -> np.ndarray:
+   # random Glorot initialization
+   ret = np.random.uniform(-1., 1., size=x)/np.sqrt(np.prod(x))
+   return ret.astype(np.float32)
+
+ def swish(x: Tensor) -> Tensor:
+   return x.mul(x.sigmoid())
+
+ class BatchNorm2D:
+   """
+   __call__ follows the formula from the link below
+   pytorch version: https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html
+
+   self.weight = γ
+   self.bias = β
+   self.running_mean = E[x]
+   self.running_var = Var[x]
+
+   the reshaping step ensures that each channel of the input has its
+   own separate set of parameters (mean, variance, weight, and bias)
+
+   self.running_mean has shape [num_channels].
+   self.running_mean.reshape(shape=[1, -1, 1, 1]) reshapes it to [1, num_channels, 1, 1]
+   """
+   def __init__(self, sz: int, eps: float = 0.001) -> None:
+     self.eps = eps
+     self.weight = Tensor.zeros(sz)
+     self.bias = Tensor.zeros(sz)
+
+     # TODO: need running_mean and running_var
+     self.running_mean = Tensor.zeros(sz)
+     self.running_var = Tensor.zeros(sz)
+     self.num_batches_tracked = Tensor.zeros(1)
+
+   def __call__(self, x: Tensor) -> Tensor:
+     x = x.sub(self.running_mean.reshape(shape=[1, -1, 1, 1]))
+     x = x.mul(self.weight.reshape(shape=[1, -1, 1, 1]))
+     x = x.div(self.running_var.add(Tensor([self.eps], gpu=x.gpu)).reshape(shape=[1, -1, 1, 1]).sqrt())
+     x = x.add(self.bias.reshape(shape=[1, -1, 1, 1]))
+     return x
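
A usage sketch for the inference-style BatchNorm2D above, assuming froog's `Tensor` accepts a NumPy array; the γ and Var[x] values below are illustrative, since the constructor zero-initializes them and a real model would load trained statistics:

  import numpy as np
  from froog.tensor import Tensor
  from froog.ops import BatchNorm2D

  bn = BatchNorm2D(3)                                    # 3 channels
  bn.weight = Tensor(np.ones(3, dtype=np.float32))       # γ = 1
  bn.running_var = Tensor(np.ones(3, dtype=np.float32))  # Var[x] = 1
  x = Tensor(np.random.randn(2, 3, 8, 8).astype(np.float32))
  y = bn(x)  # per channel: (x - E[x]) * γ / sqrt(Var[x] + ε) + β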