froog 0.4.0__py3-none-any.whl → 0.5.0__py3-none-any.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- froog/__init__.py +34 -1
- froog/{gradcheck.py → gradient.py} +4 -11
- froog/ops.py +354 -87
- froog/optim.py +104 -32
- froog/tensor.py +219 -219
- froog/utils.py +8 -7
- froog-0.5.0.dist-info/METADATA +205 -0
- froog-0.5.0.dist-info/RECORD +10 -0
- {froog-0.4.0.dist-info → froog-0.5.0.dist-info}/WHEEL +1 -1
- froog/nn.py +0 -60
- froog/ops_gpu.py +0 -598
- froog-0.4.0.dist-info/LICENSE +0 -1
- froog-0.4.0.dist-info/METADATA +0 -293
- froog-0.4.0.dist-info/RECORD +0 -13
- {froog-0.4.0.dist-info → froog-0.5.0.dist-info}/top_level.txt +0 -0
froog/ops.py
CHANGED
@@ -7,7 +7,8 @@
 # |___| |___| |_||_______||_______||_______|
 
 import numpy as np
-from froog.tensor import Function, register
+from typing import Tuple, Union, Optional, Any, Callable
+from froog.tensor import Function, register, Tensor
 from froog.utils import im2col, col2im
 
 # *****************************************************
@@ -22,32 +23,74 @@ from froog.utils import im2col, col2im
 
 class Add(Function):# x.add(y)
   @staticmethod # @staticmethod doesn't require an instance of Add to work, so you can do x.add(y)
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    # Check if we have GPU buffers
+    is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+    if is_metal_buffer(x) or is_metal_buffer(y):
+      # Import get_buffer_data helper for Metal buffers
+      try:
+        from froog.gpu.buffer_utils import get_buffer_data
+        x_data = get_buffer_data(x)
+        y_data = get_buffer_data(y)
+        ctx.save_for_backward(x_data, y_data)
+        return x_data + y_data
+      except ImportError:
+        print("Warning: buffer_utils not available")
+        # Fall back to regular implementation
+        ctx.save_for_backward(x, y)
+        return x + y
+
+    # Regular implementation
+    ctx.save_for_backward(x, y)
     return x + y
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     return grad_output, grad_output
 register("add", Add)
 
 class Sub(Function): # x.sub(y)
   @staticmethod
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
     return x-y
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     return grad_output, -grad_output
 register('sub', Sub)
 
 class Mul(Function): # x.mul(y)
   @staticmethod
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
+    # Check if we have GPU buffers
+    is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+    if is_metal_buffer(x) or is_metal_buffer(y):
+      # Import get_buffer_data helper for Metal buffers
+      try:
+        from froog.gpu.buffer_utils import get_buffer_data, buffer_mul
+        x_data = get_buffer_data(x)
+        y_data = get_buffer_data(y)
+        ctx.save_for_backward(x_data, y_data)
+        return buffer_mul(x, y)
+      except Exception as e:
+        print(f"Error in Mul.forward with buffer: {e}")
+        # Fall back to CPU implementation if buffer handling fails
+        from froog.gpu import get_device
+        device = get_device()
+        if device:
+          x_cpu = device.download_tensor(x) if is_metal_buffer(x) else x
+          y_cpu = device.download_tensor(y) if is_metal_buffer(y) else y
+          ctx.save_for_backward(x_cpu, y_cpu)
+          result = x_cpu * y_cpu
+          return device.upload_tensor(result)
+        raise
+
+    # Standard CPU implementation
     ctx.save_for_backward(x, y)
     return x * y
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     x, y = ctx.saved_tensors
     return y * grad_output, x * grad_output
 register("mul", Mul)
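Note on the new GPU dispatch above: Add.forward and Mul.forward duck-type Metal buffers instead of importing Metal up front. A minimal sketch of how that check behaves, using a stand-in object (FakeMetalBuffer is hypothetical, not part of froog):

import numpy as np

class FakeMetalBuffer:  # stand-in for a pyobjc-backed Metal buffer
    __pyobjc_object__ = object()

is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0

print(is_metal_buffer(FakeMetalBuffer()))  # True: matched via the __pyobjc_object__ attribute
print(is_metal_buffer(np.ones(3)))         # False: plain ndarrays take the CPU path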
@@ -58,24 +101,49 @@ class Sum(Function): # x.sum()
   reduces its input tensor to a single value by summing all the elements
   """
   @staticmethod
-  def forward(ctx, input):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
+    # Check if we have a GPU buffer
+    is_metal_buffer = lambda x: hasattr(x, '__pyobjc_object__') or str(type(x)).find('Buffer') >= 0
+    if is_metal_buffer(input):
+      # Use buffer utilities
+      try:
+        from froog.gpu.buffer_utils import get_buffer_data, buffer_sum
+        input_data = get_buffer_data(input)
+        ctx.save_for_backward(input_data)
+        ctx.input_shape = input_data.shape
+        return buffer_sum(input)
+      except Exception as e:
+        print(f"Error in Sum.forward with buffer: {e}")
+        # Fall back to CPU implementation
+        from froog.gpu import get_device
+        device = get_device()
+        if device:
+          input_cpu = device.download_tensor(input)
+          ctx.save_for_backward(input_cpu)
+          ctx.input_shape = input_cpu.shape
+          result = np.array([np.sum(input_cpu)])
+          return device.upload_tensor(result)
+        raise
+
+    # Standard CPU implementation
     ctx.save_for_backward(input)
-    return np.array([input.sum()])
+    ctx.input_shape = input.shape
+    return np.array([np.sum(input)])
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     (input,) = ctx.saved_tensors
     return grad_output * np.ones_like(input)
 register("sum", Sum)
 
 class Pow(Function): # x.pow(y)
   @staticmethod
-  def forward(ctx, x, y):
+  def forward(ctx: Any, x: np.ndarray, y: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(x, y)
     return x ** y
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     x, y = ctx.saved_tensors
     return y * (x**(y-1.0)) * grad_output, (x**y) * np.log(x) * grad_output # power rule, d/dx (y^x)
 register("pow", Pow)
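The Pow backward above applies the power rule for d/dx and the exponential rule for d/dy. A quick finite-difference check of those formulas, in plain numpy and independent of froog:

import numpy as np

x, y, grad_output = np.array([2.0]), np.array([3.0]), np.array([1.0])
dx = y * (x**(y-1.0)) * grad_output    # 3 * 2^2 = 12
dy = (x**y) * np.log(x) * grad_output  # 8 * ln(2) ~= 5.545

eps = 1e-6  # central differences against both formulas
assert np.allclose(dx, ((x+eps)**y - (x-eps)**y) / (2*eps), atol=1e-4)
assert np.allclose(dy, (x**(y+eps) - x**(y-eps)) / (2*eps), atol=1e-4)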
@@ -92,16 +160,65 @@ register("pow", Pow)
 
 class Dot(Function): # x.dot(y)
   @staticmethod
-  def forward(ctx, input, weight):
+  def forward(ctx: Any, input: np.ndarray, weight: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(input, weight)
-    return input.dot(weight)
+
+    # Check if we're working with GPU buffers
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      # Convert any GPU buffers to CPU for the operation
+      if is_buffer(input):
+        input_cpu = download_tensor(input)
+      else:
+        input_cpu = input
+
+      if is_buffer(weight):
+        weight_cpu = download_tensor(weight)
+      else:
+        weight_cpu = weight
+
+      return input_cpu.dot(weight_cpu)
+    except Exception as e:
+      import traceback
+      print(f"Error in dot operation: {str(e)}")
+      print(f"  Self: {input}")
+      print(f"  Arg 0: {weight}")
+      print(f"  Kwargs: {{}}")
+      traceback.print_exc()
+      # Try the original method as fallback
+      return input.dot(weight)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     input, weight = ctx.saved_tensors
-    grad_input = grad_output.dot(weight.T)
-    grad_weight = grad_output.T.dot(input).T
-    return grad_input, grad_weight
+
+    # Convert GPU buffers to CPU if needed
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      if is_buffer(input):
+        input_cpu = download_tensor(input)
+      else:
+        input_cpu = input
+
+      if is_buffer(weight):
+        weight_cpu = download_tensor(weight)
+      else:
+        weight_cpu = weight
+
+      if is_buffer(grad_output):
+        grad_output_cpu = download_tensor(grad_output)
+      else:
+        grad_output_cpu = grad_output
+
+      return grad_output_cpu.dot(weight_cpu.T), input_cpu.T.dot(grad_output_cpu)
+    except Exception as e:
+      print(f"Error in dot backward: {str(e)}")
+      # Fallback
+      return grad_output.dot(weight.T), input.T.dot(grad_output)
 register('dot', Dot)
 register('matmul', Dot)
 
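Whichever path Dot.backward takes, it returns grad_output @ W.T and x.T @ grad_output. A shape sanity check in plain numpy:

import numpy as np

bs, n_in, n_out = 4, 3, 2  # for y = x.dot(W)
x, W = np.random.randn(bs, n_in), np.random.randn(n_in, n_out)
grad_output = np.random.randn(bs, n_out)

grad_x = grad_output.dot(W.T)  # (bs, n_in), same shape as x
grad_W = x.T.dot(grad_output)  # (n_in, n_out), same shape as W
assert grad_x.shape == x.shape and grad_W.shape == W.shape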
@@ -117,12 +234,12 @@ register('matmul', Dot)
 
 class ReLU(Function):
   @staticmethod
-  def forward(ctx, input):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(input)
     return np.maximum(input, 0) # relu(x) = max(0,x)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     input, = ctx.saved_tensors
     grad_input = grad_output * (input >= 0)
     return grad_input
@@ -130,26 +247,61 @@ register("relu", ReLU)
 
 class Sigmoid(Function):
   @staticmethod
-  def forward(ctx, input):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
     ctx.save_for_backward(input)
     ret = 1/(1 + np.exp(-input)) # sigmoid(x) = 1 / (1 + exp(-x))
     return ret
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     ret, = ctx.saved_tensors
     grad_input = grad_output * (ret * (1 - ret)) # just take the derivative of sigmoid
     return grad_input
 register("sigmoid", Sigmoid)
 
+class DropoutLayer:
+  """
+  Dropout layer that randomly sets a fraction of input units to 0 during training time.
+  pytorch version: https://pytorch.org/docs/stable/generated/torch.nn.Dropout.html
+  """
+  def __init__(self, p: float = 0.5) -> None:
+    self.p = p
+    self.training = True
+
+  def __call__(self, x):
+    # build a CPU-side random mask of the same shape as the tensor x
+    mask_np = (np.random.rand(*x.shape) >= self.p).astype(np.float32) / (1.0 - self.p)
+    from froog.tensor import Tensor
+    mask_t = Tensor(mask_np)
+    if getattr(x, "is_gpu", False): mask_t = mask_t.to_gpu()
+    return x.mul(mask_t)
+
+class Dropout(Function):
+  @staticmethod
+  def forward(ctx: Any, input: np.ndarray, p: float = 0.5, training: bool = True) -> np.ndarray:
+    if not training: return input
+    # create a binary mask with probability (1-p) of being 1
+    # scale by 1/(1-p) to keep expectation same
+    ctx.training = training
+    mask = (np.random.rand(*input.shape) >= p).astype(np.float32) / (1.0 - p if p < 1.0 else 1e-9) # avoid division by zero if p is 1.0
+    ctx.save_for_backward(mask)
+    return input * mask
+
+  @staticmethod
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
+    if not ctx.training: return grad_output
+    mask, = ctx.saved_tensors
+    return grad_output * mask
+register("dropout", Dropout)
+
 class Reshape(Function):
   @staticmethod
-  def forward(ctx, x, shape):
+  def forward(ctx: Any, x: np.ndarray, shape: Tuple[int, ...]) -> np.ndarray:
     ctx.save_for_backward(x.shape)
     return x.reshape(shape)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     in_shape, = ctx.saved_tensors
     return grad_output.reshape(in_shape)
 register('reshape', Reshape)
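Both DropoutLayer and the Dropout Function above build the same inverted-dropout mask: zero with probability p, scaled by 1/(1-p) so the expected activation is unchanged. Verifying that property with plain numpy:

import numpy as np

p = 0.3
x = np.ones((1000, 1000), dtype=np.float32)
mask = (np.random.rand(*x.shape) >= p).astype(np.float32) / (1.0 - p)
out = x * mask

assert abs(out.mean() - 1.0) < 1e-2       # expectation preserved by the 1/(1-p) scaling
assert abs((out == 0).mean() - p) < 1e-2  # about p of the units are dropped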
@@ -159,11 +311,13 @@ class Pad2D(Function):
   The first element (0,0) corresponds to padding along the batch dimension, which indicates no padding on both sides (0 elements added).
   """
   @staticmethod
-  def forward(ctx, x, padding=None):
+  def forward(ctx: Any, x: np.ndarray, padding: Optional[Tuple[int, int, int, int]] = None) -> np.ndarray:
+    if padding is None:
+      padding = (0, 0, 0, 0)
     return np.pad(x, ((0,0), (0,0), (padding[0], padding[1]), (padding[2], padding[3]))) # (top, bottom, left, right)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     raise Exception("write this")
 register('pad2d', Pad2D)
 
@@ -173,8 +327,8 @@ class LogSoftmax(Function):
   probabilities of each value are proportional to the scale of each value
   """
   @staticmethod
-  def forward(ctx, input):
-    def logsumexp(x):
+  def forward(ctx: Any, input: np.ndarray) -> np.ndarray:
+    def logsumexp(x: np.ndarray) -> np.ndarray:
       c = x.max(axis=1)
       return c + np.log(np.exp(x - c.reshape((-1, 1))).sum(axis=1)) # axis=1 refers to the columns
 
@@ -183,7 +337,7 @@ class LogSoftmax(Function):
     return output
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     (output,) = ctx.saved_tensors
     return grad_output - np.exp(output)*(grad_output.sum(axis=1).reshape((-1, 1)))
 register("logsoftmax", LogSoftmax)
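The logsumexp helper in the hunk above subtracts the per-row max before exponentiating, which is what keeps LogSoftmax finite for large logits. The same trick in isolation:

import numpy as np

def logsumexp(x: np.ndarray) -> np.ndarray:
    c = x.max(axis=1)  # subtracting the row max keeps np.exp in range
    return c + np.log(np.exp(x - c.reshape((-1, 1))).sum(axis=1))

x = np.array([[1000.0, 999.0]])  # naive log(sum(exp(x))) overflows to inf here
print(logsumexp(x))              # [1000.31326169], finite
print(np.exp(x - logsumexp(x).reshape((-1, 1))).sum())  # softmax still sums to 1.0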
@@ -200,62 +354,123 @@ register("logsoftmax", LogSoftmax)
 
 class Conv2D(Function): # TODO: understand group splits
   @staticmethod
-  def forward(ctx, x, w, stride=1, groups=1):
+  def forward(ctx: Any, x: np.ndarray, w: np.ndarray, stride: Union[int, Tuple[int, int]] = 1, groups: int = 1) -> np.ndarray:
     """
     https://github.com/vdumoulin/conv_arithmetic/blob/master/README.md
     WARNING: doesn't handle padding or strides yet
     Args:
-      x.shape[0]
-      cout
-      x.shape[2]-(H-1)
-      x.shape[3]-(W-1)
+      x.shape[0] --> number of input examples (batch size)
+      cout --> number of output channels
+      x.shape[2]-(H-1) --> non-padded height of conv output, need to subtract because this is an unpadded conv
+      x.shape[3]-(W-1) --> width of output
     Shape:
       (a, b, c, d)(e, f, g, h) --> (a, e, c-(g-1), d-(h-1))
     in general, output x and y = [(W−K+2P)/S]+1
     """
-    if type(ctx.stride) == int: # ctx stores function params
+    ctx.stride = stride
+    ctx.groups = groups
+
+    if isinstance(ctx.stride, int): # ctx stores function params
       ctx.stride = (ctx.stride, ctx.stride)
 
-    cout, cin, H, W = w.shape
-
-    tw = w.reshape(cout, -1).T # slice of kernel
-    y_stride, x_stride = ctx.stride
-
-    bs,cin_,oy,ox = x.shape[0], x.shape[1], (x.shape[2]-(H-y_stride))//y_stride, (x.shape[3]-(W-x_stride))//x_stride
-    assert cin*ctx.groups == cin_ # ensures that the channel dimensions match appropriately for grouping
-    assert cout % ctx.groups == 0 # ensures that the number of output channels can be evenly divided among the groups
-    g_w_chans = cout//ctx.groups # number of output channels per group
-
-    ctx.save_for_backward(x, w)
-    ret = np.zeros((bs, cout, oy, ox), dtype=w.dtype)
-
-    for g in range(ctx.groups):
-      tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1).T # transformed kernel weights
-      for Y in range(oy):
-        for X in range(ox):
-          iY,iX = Y*y_stride, X*x_stride
-          tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(bs, -1)
-          ret[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] += tx.dot(tw)
-    return ret
+    # Check if we're working with GPU buffers and convert to CPU
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      # Convert input to CPU if it's a GPU buffer
+      if is_buffer(x):
+        x_cpu = download_tensor(x)
+      else:
+        x_cpu = x
+
+      # Convert weight to CPU if it's a GPU buffer
+      if is_buffer(w):
+        w_cpu = download_tensor(w)
+      else:
+        w_cpu = w
+
+      # Now use the CPU tensors for the rest of the computation
+      cout, cin, H, W = w_cpu.shape
+
+      tw = w_cpu.reshape(cout, -1).T # slice of kernel
+      y_stride, x_stride = ctx.stride
+
+      bs,cin_,oy,ox = x_cpu.shape[0], x_cpu.shape[1], (x_cpu.shape[2]-(H-y_stride))//y_stride, (x_cpu.shape[3]-(W-x_stride))//x_stride
+      assert cin*ctx.groups == cin_ # ensures that the channel dimensions match appropriately for grouping
+      assert cout % ctx.groups == 0 # ensures that the number of output channels can be evenly divided among the groups
+      g_w_chans = cout//ctx.groups # number of output channels per group
+
+      ctx.save_for_backward(x_cpu, w_cpu)
+      ret = np.zeros((bs, cout, oy, ox), dtype=w_cpu.dtype)
+
+      for g in range(ctx.groups):
+        tw = w_cpu[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1).T # transformed kernel weights
+        for Y in range(oy):
+          for X in range(ox):
+            iY,iX = Y*y_stride, X*x_stride
+            tx = x_cpu[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(bs, -1)
+            ret[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] += tx.dot(tw)
+      return ret
+    except Exception as e:
+      import traceback
+      print(f"Error in conv2d operation: {str(e)}")
+      print(f"  Self: {x}")
+      print(f"  Arg 0: {w}")
+      print(f"  Kwargs: {{stride: {stride}, groups: {groups}}}")
+      traceback.print_exc()
+      raise
 
   @staticmethod
-  def backward(ctx, grad_output):
-    x, w = ctx.saved_tensors
-    cout, cin, H, W = w.shape
-    dx, dw = np.zeros_like(x), np.zeros_like(w)
-    y_stride, x_stride = ctx.stride
-    g_w_chans = cout//ctx.groups
-
-    for g in range(ctx.groups):
-      tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1)
-      for Y in range(grad_output.shape[2]):
-        for X in range(grad_output.shape[3]):
-          iY,iX = Y*y_stride, X*x_stride
-          gg = grad_output[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] # current multiply element in chain rule
-          tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(x.shape[0], -1) # slice of tensor at current conv op
-          dw[g*g_w_chans:(g*g_w_chans+g_w_chans)] += gg.T.dot(tx).reshape((g_w_chans,cin,H,W)) # gradient with respect to input
-          dx[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W] += gg.dot(tw).reshape(dx.shape[0], cin, H, W) # accumulate gradient with respect to weights
-    return dx, dw
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+    try:
+      from froog.tensor import is_buffer
+      from froog.gpu import download_tensor
+
+      # Convert grad_output to CPU if it's a GPU buffer
+      if is_buffer(grad_output):
+        grad_output_cpu = download_tensor(grad_output)
+      else:
+        grad_output_cpu = grad_output
+
+      x, w = ctx.saved_tensors
+      cout, cin, H, W = w.shape
+      dx, dw = np.zeros_like(x), np.zeros_like(w)
+      y_stride, x_stride = ctx.stride
+      g_w_chans = cout//ctx.groups
+
+      for g in range(ctx.groups):
+        tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1)
+        for Y in range(grad_output_cpu.shape[2]):
+          for X in range(grad_output_cpu.shape[3]):
+            iY,iX = Y*y_stride, X*x_stride
+            gg = grad_output_cpu[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] # current multiply element in chain rule
+            tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(x.shape[0], -1) # slice of tensor at current conv op
+            dw[g*g_w_chans:(g*g_w_chans+g_w_chans)] += gg.T.dot(tx).reshape((g_w_chans,cin,H,W)) # gradient with respect to input
+            dx[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W] += gg.dot(tw).reshape(dx.shape[0], cin, H, W) # accumulate gradient with respect to weights
+      return dx, dw
+    except Exception as e:
+      import traceback
+      print(f"Error in conv2d backward: {str(e)}")
+      print(f"  Grad Output: {grad_output}")
+      traceback.print_exc()
+      # Fallback to original implementation
+      x, w = ctx.saved_tensors
+      cout, cin, H, W = w.shape
+      dx, dw = np.zeros_like(x), np.zeros_like(w)
+      y_stride, x_stride = ctx.stride
+      g_w_chans = cout//ctx.groups
+
+      for g in range(ctx.groups):
+        tw = w[g*g_w_chans:(g*g_w_chans+g_w_chans)].reshape(g_w_chans, -1)
+        for Y in range(grad_output.shape[2]):
+          for X in range(grad_output.shape[3]):
+            iY,iX = Y*y_stride, X*x_stride
+            gg = grad_output[:, g*g_w_chans:(g*g_w_chans+g_w_chans), Y, X] # current multiply element in chain rule
+            tx = x[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W].reshape(x.shape[0], -1) # slice of tensor at current conv op
+            dw[g*g_w_chans:(g*g_w_chans+g_w_chans)] += gg.T.dot(tx).reshape((g_w_chans,cin,H,W)) # gradient with respect to input
+            dx[:, g*cin:(g*cin+cin), iY:iY+H, iX:iX+W] += gg.dot(tw).reshape(dx.shape[0], cin, H, W) # accumulate gradient with respect to weights
+      return dx, dw
 register('conv2d', Conv2D)
 
 
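The Conv2D docstring gives the general output size [(W−K+2P)/S]+1 and the shape rule (a, b, c, d)(e, f, g, h) --> (a, e, c-(g-1), d-(h-1)). A quick numeric check of both, numpy only:

import numpy as np

W_in, K, P, S = 32, 5, 0, 1             # unpadded, stride 1
assert (W_in - K + 2*P)//S + 1 == 28    # [(W-K+2P)/S]+1

x = np.zeros((2, 3, 32, 32))            # (a, b, c, d)
w = np.zeros((8, 3, 5, 5))              # (e, f, g, h)
oy, ox = x.shape[2] - (w.shape[2]-1), x.shape[3] - (w.shape[3]-1)
assert (x.shape[0], w.shape[0], oy, ox) == (2, 8, 28, 28)  # (a, e, c-(g-1), d-(h-1))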
@@ -266,7 +481,7 @@ class im2ColConv(Function):
   """
 
   @staticmethod
-  def forward(ctx, x, w):
+  def forward(ctx: Any, x: np.ndarray, w: np.ndarray) -> np.ndarray:
     cout, cin, k_h, k_x = w.shape
     bs, oy, ox = x.shape[0], x.shape[2]-(k_h-1), x.shape[3]-(k_x-1)
     tw = w.reshape(cout, -1).T # each filter flattened into a row
@@ -276,7 +491,7 @@ class im2ColConv(Function):
     return np.moveaxis(ret, [0,1,2,3], [0,2,3,1]) # reorders the axes (batch size, number of channels, height, width)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
     bs,_,oy,ox = grad_output.shape
     tx, w = ctx.saved_tensors # transformed input, filter weights
     cout,cin,H,W = w.shape
@@ -298,7 +513,7 @@ register('im2col2dconv', im2ColConv)
 #
 # **************** pooling ops ***************
 
-def stack_for_pool(x, pool_y, pool_x):
+def stack_for_pool(x: np.ndarray, pool_y: int, pool_x: int) -> np.ndarray:
   my, mx = (x.shape[2]//pool_y)*pool_y, (x.shape[3]//pool_x)*pool_x # ensures input tensor can be evenly divided into 2x2 blocks for max pooling
   stack = []
   cropped_x = x[:, :, :my, :mx] # crop input so 2x2 max pool can be taken
@@ -308,47 +523,49 @@ def stack_for_pool(x, pool_y, pool_x):
   return np.concatenate(stack, axis=0) # put all into one row
 
 
-def unstack_for_pool(fxn, s, py, px):
+def unstack_for_pool(fxn: Callable[[int], np.ndarray], s: Tuple[int, ...], py: int, px: int) -> np.ndarray:
   max_y, max_x = (s[2]//py)*py, (s[3]//px)*px # get shape that allows (pool_size_y,pool_size_x) max pool
+  ret = None
   for Y in range(py):
     for X in range(px):
       level_w_new_grad = fxn(Y*px+X)
       if X == 0 and Y == 0: # pool of zero size
         ret = np.zeros(s, dtype=level_w_new_grad.dtype)
-      ret[:, :, Y:max_y:py, X:max_x:px] = level_w_new_grad
-  return ret
+      if ret is not None:
+        ret[:, :, Y:max_y:py, X:max_x:px] = level_w_new_grad
+  return ret if ret is not None else np.zeros(s)
 
 
 class MaxPool2D(Function):
   @staticmethod
-  def forward(ctx, x, kernel_size=(2,2)):
+  def forward(ctx: Any, x: np.ndarray, kernel_size: Tuple[int, int] = (2,2)) -> np.ndarray:
+    ctx.kernel_size = kernel_size
     stack = stack_for_pool(x, *kernel_size)
     idx_of_max = np.argmax(stack, axis=0)
     ctx.save_for_backward(idx_of_max, x.shape)
     return np.max(stack, axis=0)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     """
     Distributes the gradient from the output of the max pooling layer to its inputs
     The purpose of (idxs == idx) is to generate a boolean mask indicating the locations of the maximum values in each 2x2 block of the original input
     The expression (Y*2+X) is a way to iterate through the four possible positions within the kernel block: e.g. (0,0), (0,1), (1,0), and (1,1), which get mapped to the indices 0, 1, 2, and 3
     """
-    idxs, s = ctx.saved_tensors
-    return unstack_for_pool(lambda idx: grad_output * (idxs == idx),
-                            s,
-                            *ctx.kernel_size)
+    idxs, s = ctx.saved_tensors
+    return unstack_for_pool(lambda idx: grad_output * (idxs == idx), s, *ctx.kernel_size)
 register('max_pool2d', MaxPool2D)
 
 class AvgPool2D(Function):
   @staticmethod
-  def forward(ctx, x, kernel_size=(2,2)):
+  def forward(ctx: Any, x: np.ndarray, kernel_size: Tuple[int, int] = (2,2)) -> np.ndarray:
+    ctx.kernel_size = kernel_size
     stack = stack_for_pool(x, *kernel_size)
     ctx.save_for_backward(x.shape)
     return np.mean(stack, axis=0)
 
   @staticmethod
-  def backward(ctx, grad_output):
+  def backward(ctx: Any, grad_output: np.ndarray) -> np.ndarray:
     s, = ctx.saved_tensors
     py, px = ctx.kernel_size # kernel_size passed from forward context
     my, mx = (s[2]//py)*py, (s[3]//px)*px
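MaxPool2D saves argmax indices in forward and turns them back into a one-hot mask per pooling window in backward. The idea on a single 2x2 window, in plain numpy:

import numpy as np

x = np.array([[1., 2.],
              [3., 4.]])  # one 2x2 pooling window
stack = np.stack([x[0::2, 0::2], x[0::2, 1::2], x[1::2, 0::2], x[1::2, 1::2]])
idx_of_max = np.argmax(stack, axis=0)  # 3 -> the bottom-right cell wins
assert np.max(stack, axis=0).item() == 4.0
mask = (np.arange(4).reshape(4, 1, 1) == idx_of_max)  # routes the whole gradient to that cell
assert mask[3].all() and mask[:3].sum() == 0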
@@ -358,3 +575,53 @@ class AvgPool2D(Function):
       ret[:, :, Y:my:py, X:mx:px] = grad_output / py / px # divide by avg of pool, e.g. for 2x2 pool /= 4
     return ret
 register('avg_pool2d', AvgPool2D)
+
+# *************************************
+#     _   ___   __   ____  ____  _____
+#    / | / / | / /  / __ \/ __ \/ ___/
+#   /  |/ /  |/ /  / / / / /_/ /\__ \
+#  / /|  / /|  /  / /_/ / ____/___/ /
+# /_/ |_/_/ |_/   \____/_/    /____/
+#
+# ************* nn ops ************
+
+def Linear(*x: int) -> np.ndarray:
+  # random Glorot initialization
+  ret = np.random.uniform(-1., 1., size=x)/np.sqrt(np.prod(x))
+  return ret.astype(np.float32)
+
+def swish(x: Tensor) -> Tensor:
+  return x.mul(x.sigmoid())
+
+class BatchNorm2D:
+  """
+  __call__ follows the formula from the link below
+  pytorch version: https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm2d.html
+
+  self.weight = γ
+  self.bias = β
+  self.running_mean = E[x]
+  self.running_var = Var[x]
+
+  the reshaping step ensures that each channel of the input has its
+  own separate set of parameters (mean, variance, weight, and bias)
+
+  self.running_mean has shape [num_channels].
+  self.running_mean.reshape(shape=[1, -1, 1, 1]) reshapes it to [1, num_channels, 1, 1]
+  """
+  def __init__(self, sz: int, eps: float = 0.001) -> None:
+    self.eps = eps
+    self.weight = Tensor.zeros(sz)
+    self.bias = Tensor.zeros(sz)
+
+    # TODO: need running_mean and running_var
+    self.running_mean = Tensor.zeros(sz)
+    self.running_var = Tensor.zeros(sz)
+    self.num_batches_tracked = Tensor.zeros(1)
+
+  def __call__(self, x: Tensor) -> Tensor:
+    x = x.sub(self.running_mean.reshape(shape=[1, -1, 1, 1]))
+    x = x.mul(self.weight.reshape(shape=[1, -1, 1, 1]))
+    x = x.div(self.running_var.add(Tensor([self.eps], gpu=x.gpu)).reshape(shape=[1, -1, 1, 1]).sqrt())
+    x = x.add(self.bias.reshape(shape=[1, -1, 1, 1]))
+    return x