ocnn 2.2.7__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
@@ -0,0 +1,150 @@
+ import math
+ import torch
+ import triton
+ import triton.language as tl
+ from .utils import get_num_sm
+ from .autotuner import triton_autotune, autotune
+ from . import config
+ from .conv_fwd_implicit_gemm import conv_fwd_implicit_gemm_kernel
+
+
+ @triton_autotune(
+     configs=config.autotune_config,
+     key=['LOGN', 'Ci', 'Co', 'V', 'SPLITK', 'allow_tf32'],
+ )
+ @triton.jit
+ def conv_fwd_implicit_gemm_splitk_kernel(
+     input,
+     weight,
+     bias,
+     neighbor,
+     output,
+     # Tensor dimensions
+     N, LOGN, Ci, Co, V: tl.constexpr,
+     # Meta-parameters
+     B1: tl.constexpr,          # Block size for the N dimension
+     B2: tl.constexpr,          # Block size for the Co dimension
+     BK: tl.constexpr,          # Block size for the K dimension (V * Ci)
+     SPLITK: tl.constexpr,      # Number of splits along the K dimension
+     allow_tf32: tl.constexpr,  # Allow TF32 precision for matmuls
+ ):
+   """
+   Sparse submanifold convolution forward kernel using implicit GEMM with a split K dimension.
+
+   Args:
+     input (pointer): A pointer to the input tensor of shape (N, Ci)
+     weight (pointer): A pointer to the weight tensor of shape (Co, V, Ci)
+     bias (pointer): A pointer to the bias tensor of shape (Co,)
+     neighbor (pointer): A pointer to the neighbor tensor of shape (N, V)
+     output (pointer): A pointer to the output tensor of shape (SPLITK, N, Co)
+   """
+   block_id_k = tl.program_id(axis=1)  # Split-K dimension
+   block_id = tl.program_id(axis=0)
+   block_dim_co = tl.cdiv(Co, B2)
+   block_id_co = block_id % block_dim_co
+   block_id_n = block_id // block_dim_co
+
+   # Create pointers for submatrices of A and B.
+   num_k = tl.cdiv(Ci, BK)  # Number of blocks in the K dimension
+   k_start = tl.cdiv(num_k * V * block_id_k, SPLITK)
+   k_end = tl.cdiv(num_k * V * (block_id_k + 1), SPLITK)
+   offset_n = (block_id_n * B1 + tl.arange(0, B1)) % N     # (B1,)
+   offset_co = (block_id_co * B2 + tl.arange(0, B2)) % Co  # (B2,)
+   offset_k = tl.arange(0, BK)                             # (BK,)
+
+   # Create a block of the output matrix C.
+   accumulator = tl.zeros((B1, B2), dtype=tl.float32)
+   curr_v = k_start // num_k
+   curr_bk = k_start % num_k
+   weight_offset_base = curr_v * Ci + curr_bk * BK
+
+   weight_ptr = weight + weight_offset_base + (offset_co[None, :] * V * Ci + offset_k[:, None])  # (BK, B2)
+
+   # Iterate along the V*Ci dimension.
+   for k in range(k_start, k_end):
+     v = k // num_k
+     bk = k % num_k
+     # Calculate pointers to the input matrix.
+     neighbor_offset_n = tl.load(neighbor + offset_n * V + v).to(tl.int64)  # (B1,)
+     input_ptr = input + bk * BK + (neighbor_offset_n[:, None].to(tl.int64) * Ci + offset_k[None, :])  # (B1, BK)
+     # Load the next block of input and weight.
+     neigh_mask = neighbor_offset_n != -1
+     k_mask = offset_k < Ci - bk * BK
+     input_block = tl.load(input_ptr, mask=neigh_mask[:, None] & k_mask[None, :], other=0.0)
+     weight_block = tl.load(weight_ptr, mask=k_mask[:, None], other=0.0)
+     # Accumulate along the K dimension.
+     accumulator = tl.dot(input_block, weight_block, accumulator,
+                          input_precision='tf32' if allow_tf32 else 'ieee')  # (B1, B2)
+     # Advance the pointers to the next Ci block.
+     weight_ptr += min(BK, Ci - bk * BK)
+
+   # Add the bias (only once, in the first split).
+   if bias is not None and block_id_k == 0:
+     bias_block = tl.load(bias + offset_co)
+     accumulator += bias_block[None, :]
+
+   # Write back the block of the output matrix with masks.
+   out_offset_n = block_id_n * B1 + tl.arange(0, B1)
+   out_offset_co = block_id_co * B2 + tl.arange(0, B2)
+   out_ptr = output + block_id_k * N * Co + (out_offset_n[:, None] * Co + out_offset_co[None, :])
+   out_mask = (out_offset_n[:, None] < N) & (out_offset_co[None, :] < Co)
+   tl.store(out_ptr, accumulator, mask=out_mask)
+
+
+ def conv_fwd_implicit_gemm_splitk_configs(input, weight, bias, neighbor):
+   N, Co = neighbor.shape[0], weight.shape[0]
+   MAX_NB1 = (N + 128 - 1) // 128
+   MAX_NB2 = (Co + 128 - 1) // 128
+   NUM_BLOCKS = MAX_NB1 * MAX_NB2
+   MIN_NUM_BLOCKS = get_num_sm()
+   MAX_NUM_BLOCKS = 32 * get_num_sm()
+   MIN_NUM_BLOCKS_LOG2 = max(0, int(math.log2(MIN_NUM_BLOCKS / NUM_BLOCKS)))
+   MAX_NUM_BLOCKS_LOG2 = max(1, int(math.log2(MAX_NUM_BLOCKS / NUM_BLOCKS) + 1))
+   configs = []
+   for i in range(MIN_NUM_BLOCKS_LOG2, MAX_NUM_BLOCKS_LOG2):
+     configs.append({'SPLITK': 2 ** i})
+   return configs
+
+
+ def conv_fwd_implicit_gemm_splitk_keys(input, weight, bias, neighbor):
+   N, Ci, Co, V = neighbor.shape[0], input.shape[1], weight.shape[0], weight.shape[1]
+   return f'(2^{int(math.log2(N))}, {Ci}, {Co}, {V})'
+
+
+ @autotune(
+     config_fn=conv_fwd_implicit_gemm_splitk_configs,
+     key_fn=conv_fwd_implicit_gemm_splitk_keys,
+ )
+ def conv_fwd_implicit_gemm_splitk(
+     input: torch.Tensor,
+     weight: torch.Tensor,
+     bias: torch.Tensor,
+     neighbor: torch.Tensor,
+     SPLITK: int = 1,
+ ) -> torch.Tensor:
+   assert input.shape[1] == weight.shape[2], "Incompatible dimensions"
+   assert input.is_contiguous(), "Matrix input must be contiguous"
+   assert weight.is_contiguous(), "Matrix weight must be contiguous"
+   assert neighbor.is_contiguous(), "Matrix neighbor must be contiguous"
+   N, Ci, Co, V = neighbor.shape[0], input.shape[1], weight.shape[0], weight.shape[1]
+   LOGN = int(math.log2(N))
+   # Launch the kernel.
+   if SPLITK == 1:
+     output = torch.empty((N, Co), device=input.device, dtype=input.dtype)
+     grid = lambda META: (triton.cdiv(Co, META['B2']) * triton.cdiv(N, META['B1']),)
+     conv_fwd_implicit_gemm_kernel[grid](
+         input, weight, bias, neighbor, output,
+         N, LOGN, Ci, Co, V,
+         allow_tf32=config.allow_tf32,
+     )
+     return output
+   else:
+     output = torch.empty((SPLITK, N, Co), device=input.device, dtype=torch.float32)
+     grid = lambda META: (triton.cdiv(Co, META['B2']) * triton.cdiv(N, META['B1']), SPLITK)
+     conv_fwd_implicit_gemm_splitk_kernel[grid](
+         input, weight, bias, neighbor, output,
+         N, LOGN, Ci, Co, V,
+         SPLITK=SPLITK,
+         allow_tf32=config.allow_tf32,
+     )
+     return output.sum(dim=0).to(input.dtype)
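
In the kernel above, each program along the second grid axis accumulates a partial sum over its slice of the V*Ci reduction dimension and writes it to its own (N, Co) slab of the (SPLITK, N, Co) float32 buffer; the wrapper then reduces the slabs with `output.sum(dim=0)`. As a point of reference for what the implicit GEMM computes, the following pure-PyTorch sketch reproduces the same gather-multiply-accumulate. It is not part of the package: the function name and the zero-row trick for handling `-1` neighbor entries are illustrative assumptions chosen to mirror the kernel's `neighbor_offset_n != -1` masking.

import torch

def conv_fwd_reference(input, weight, bias, neighbor):
  # input: (N, Ci), weight: (Co, V, Ci), neighbor: (N, V) with -1 marking a missing neighbor.
  N, Ci = input.shape
  padded = torch.cat([input, input.new_zeros(1, Ci)], dim=0)  # append an all-zero row
  idx = neighbor.clone()
  idx[idx < 0] = N                                            # route -1 entries to the zero row
  gathered = padded[idx]                                      # (N, V, Ci)
  out = torch.einsum('nvc,ovc->no', gathered, weight)         # (N, Co)
  if bias is not None:
    out = out + bias
  return out

The Triton kernel reaches the same result without materializing the (N, V, Ci) gather: it masks the loads of missing neighbors to zero and feeds the blocks straight into tl.dot.
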
@@ -0,0 +1,44 @@
+ from typing import *
+ import torch
+ import triton
+
+
+ def get_gpu_name():
+   return torch.cuda.get_device_name()
+
+
+ def get_platform_name():
+   if torch.cuda.is_available():
+     if getattr(torch.version, 'hip', None) is not None:
+       return 'hip'
+     return 'cuda'
+   return 'unknown'
+
+
+ def get_num_sm():
+   return torch.cuda.get_device_properties("cuda").multi_processor_count
+
+
+ def get_autotune_config(
+     default: List[triton.Config] = None,
+     platform: Dict[str, List[triton.Config]] = None,
+     device: Dict[str, List[triton.Config]] = None,
+ ) -> List[triton.Config]:
+   """
+   Get the autotune configuration for the current platform and device.
+   """
+   if device is not None:
+     gpu_name = get_gpu_name()
+     for key, value in device.items():
+       if key.lower() in gpu_name.lower():
+         return value
+
+   if platform is not None:
+     platform_name = get_platform_name()
+     for key, value in platform.items():
+       if key.lower() in platform_name.lower():
+         return value
+
+   if default is None:
+     raise ValueError("No autotune configuration found for the current platform and device.")
+   return default
ocnn/nn/octree_conv.py CHANGED
@@ -98,7 +98,8 @@ class OctreeConvBase:
 
     # Check the shape of input data
     check = tuple(data.shape) == self.in_shape
-    assert check, 'The shape of input data is wrong.'
+    assert check, ('The shape of input data is wrong: ' +
+                   'expected {}, got {}.'.format(self.in_shape, data.shape))
 
     # Init the output data
     out = data.new_zeros(self.out_shape)
@@ -0,0 +1,148 @@
+ # --------------------------------------------------------
+ # Octree-based Sparse Convolutional Neural Networks
+ # Copyright (c) 2022 Peng-Shuai Wang <wangps@hotmail.com>
+ # Licensed under The MIT License [see LICENSE for details]
+ # Written by Peng-Shuai Wang
+ # --------------------------------------------------------
+
+ import torch
+ import torch.nn
+ from torch.autograd import Function
+ from typing import List
+
+ from ocnn.octree import Octree
+ from ocnn.nn import OctreeConv
+ from ocnn.utils import xavier_uniform_, resize_with_last_val, list2str
+ from ocnn.nn.kernels import conv_fwd_implicit_gemm_splitk, conv_bwd_implicit_gemm_splitk
+
+
+ class OctreeConvTritonFunction(Function):
+   r''' Wraps the octree convolution for auto-diff.
+   '''
+
+   @staticmethod
+   def forward(ctx, data: torch.Tensor, weights: torch.Tensor, bias: torch.Tensor,
+               neigh: torch.Tensor):
+     data = data.contiguous()
+     weights = weights.contiguous()
+     neigh = neigh.contiguous()
+     if bias is not None:
+       bias = bias.contiguous()
+
+     out = conv_fwd_implicit_gemm_splitk(data, weights, bias, neigh)
+     ctx.save_for_backward(data, weights, bias, neigh)
+     return out
+
+   @staticmethod
+   def backward(ctx, grad):
+     data, weights, bias, neigh = ctx.saved_tensors
+     grad = grad.contiguous()
+     grad_input, grad_weight, grad_bias = conv_bwd_implicit_gemm_splitk(
+         grad, data, weights, bias, neigh, ctx.needs_input_grad)
+     return grad_input, grad_weight, grad_bias, None
+
+
+ # alias
+ octree_conv_triton = OctreeConvTritonFunction.apply
+
+
+ class OctreeConvTriton(torch.nn.Module):
+   r''' Performs octree convolution.
+
+   Args:
+     in_channels (int): Number of input channels.
+     out_channels (int): Number of output channels.
+     kernel_size (List(int)): The kernel shape; only :obj:`[3]` and :obj:`[3,3,3]`
+         are supported by the triton implementation for now.
+     stride (int): The stride of the convolution; only :obj:`1` is supported now.
+     nempty (bool): If True, only performs the convolution on non-empty octree
+         nodes; otherwise, performs the convolution on all octree nodes.
+     use_bias (bool): If True, adds a bias term to the convolution.
+
+   .. note::
+     Each non-empty octree node has exactly 8 child nodes, among which some
+     are non-empty and some are empty. If :attr:`nempty` is true,
+     the convolution is performed on non-empty octree nodes only, which is exactly
+     the same as SparseConvNet and MinkowskiNet; if :attr:`nempty` is false, the
+     convolution is performed on all octree nodes, which is essential for shape
+     reconstruction tasks and can also be used in classification and segmentation
+     (with slightly better performance and a larger memory cost).
+   '''
+
+   def __init__(self, in_channels: int, out_channels: int,
+                kernel_size: List[int] = [3], stride: int = 1,
+                nempty: bool = False, direct_method: bool = False,
+                use_bias: bool = False, max_buffer: int = int(2e8)):
+     super().__init__()
+     self.in_channels = in_channels
+     self.out_channels = out_channels
+     self.kernel_size = resize_with_last_val(kernel_size)
+     self.kernel = list2str(self.kernel_size)
+     self.stride = stride
+     self.nempty = nempty
+     self.use_bias = use_bias
+     assert self.stride == 1, 'Only stride=1 is supported now.'
+     assert self.kernel == '333', 'Only kernel_size=[3,3,3] is supported now.'
+
+     self.kdim = self.kernel_size[0] * self.kernel_size[1] * self.kernel_size[2]
+     self.weights_shape = (self.kdim, self.in_channels, self.out_channels)
+     self.weights = torch.nn.Parameter(torch.Tensor(*self.weights_shape))
+     self.bias = (torch.nn.Parameter(torch.Tensor(self.out_channels))
+                  if use_bias else None)
+     self.reset_parameters()
+
+   def reset_parameters(self):
+     xavier_uniform_(self.weights)
+     if self.use_bias:
+       torch.nn.init.zeros_(self.bias)
+
+   def forward(self, data: torch.Tensor, octree: Octree, depth: int):
+     r''' Defines the octree convolution.
+
+     Args:
+       data (torch.Tensor): The input data.
+       octree (Octree): The corresponding octree.
+       depth (int): The depth of the current octree.
+     '''
+
+     # TODO: remove the permute operation by changing the kernel implementation
+     weight = self.weights.permute(2, 0, 1)  # (V, Ci, Co) -> (Co, V, Ci)
+     neigh = octree.get_neigh(depth, self.kernel, self.stride, self.nempty)
+     out = octree_conv_triton(data, weight, self.bias, neigh)
+     return out
+
+   def extra_repr(self) -> str:
+     r''' Sets the extra representation of the module.
+     '''
+
+     return ('triton, in_channels={}, out_channels={}, kernel_size={}, stride={}, '
+             'nempty={}, bias={}').format(self.in_channels, self.out_channels,
+             self.kernel_size, self.stride, self.nempty, self.use_bias)  # noqa
+
+
+ # alias
+ OctreeConvT = OctreeConvTriton
+
+
+ def convert_conv_triton(module: torch.nn.Module) -> torch.nn.Module:
+   r''' Converts OctreeConv modules to OctreeConvTriton modules in a network.
+
+   Args:
+     module (torch.nn.Module): The input module.
+   '''
+
+   module_out = module
+   if (isinstance(module, OctreeConv) and
+       module.stride == 1 and module.kernel_size == [3, 3, 3]):
+     module_out = OctreeConvTriton(
+         module.in_channels, module.out_channels, module.kernel_size,
+         module.stride, module.nempty, use_bias=module.use_bias)
+     with torch.no_grad():
+       module_out.weights = module.weights
+       if module.use_bias:
+         module_out.bias = module.bias
+
+   for name, child in module.named_children():
+     module_out.add_module(name, convert_conv_triton(child))
+   del module
+   return module_out
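
A hedged sketch of how convert_conv_triton might be applied to an existing ocnn network; the import path of the new module and the model constructor below are illustrative assumptions, not taken from the package.

import torch
import ocnn
from ocnn.nn.octree_conv_triton import convert_conv_triton  # import path assumed

model = ocnn.models.LeNet(in_channels=3, out_channels=40, stages=3)  # constructor assumed
model = convert_conv_triton(model)  # replaces eligible stride-1, 3x3x3 OctreeConv layers
model = model.cuda()                # the Triton kernels require a CUDA/HIP device
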
ocnn/nn/octree_pad.py CHANGED
@@ -22,10 +22,10 @@ def octree_pad(data: torch.Tensor, octree: Octree, depth: int, val: float = 0.0)
     val (float): The padding value. (Default: :obj:`0.0`)
   '''
 
-  mask = octree.nempty_mask(depth)
+  idx = octree.nempty_index(depth)
   size = (octree.nnum[depth], data.shape[1])  # (N, C)
   out = torch.full(size, val, dtype=data.dtype, device=data.device)
-  out[mask] = data
+  out[idx] = data
   return out
 
 
@@ -35,5 +35,5 @@ def octree_depad(data: torch.Tensor, octree: Octree, depth: int):
   Please refer to :func:`octree_depad` for the meaning of the arguments.
   '''
 
-  mask = octree.nempty_mask(depth)
-  return data[mask]
+  idx = octree.nempty_index(depth)
+  return data[idx]
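
The change above swaps a boolean mask for an integer index when scattering into (octree_pad) and selecting from (octree_depad) the padded tensor. Assuming `nempty_index` returns the positions of the True entries of the former `nempty_mask`, both forms address the same rows; indexing with precomputed integer indices also avoids the implicit `nonzero` that boolean-mask indexing performs, which is presumably the motivation. A small standalone check of the equivalence (not from the package):

import torch

mask = torch.tensor([True, False, True, True])
idx = mask.nonzero(as_tuple=True)[0]          # tensor([0, 2, 3])
data = torch.arange(8, dtype=torch.float32).reshape(4, 2)

assert torch.equal(data[mask], data[idx])     # depad: both select the non-empty rows

out_mask = torch.zeros(4, 2)
out_idx = torch.zeros(4, 2)
out_mask[mask] = data[mask]
out_idx[idx] = data[idx]
assert torch.equal(out_mask, out_idx)         # pad: both scatter into the same rows
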