PyPI - ocnn - Versions diffs - 2.2.8__py3-none-any.whl → 2.3.0__py3-none-any.whl - Mend

ocnn 2.2.8__py3-none-any.whl → 2.3.0__py3-none-any.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (46)

ocnn/__init__.py +24 -24
ocnn/dataset.py +160 -160
ocnn/models/__init__.py +29 -29
ocnn/models/autoencoder.py +155 -155
ocnn/models/hrnet.py +192 -192
ocnn/models/image2shape.py +128 -128
ocnn/models/lenet.py +46 -46
ocnn/models/ounet.py +94 -94
ocnn/models/resnet.py +53 -53
ocnn/models/segnet.py +72 -72
ocnn/models/unet.py +105 -105
ocnn/modules/__init__.py +26 -26
ocnn/modules/modules.py +303 -303
ocnn/modules/resblocks.py +158 -158
ocnn/nn/__init__.py +45 -44
ocnn/nn/kernels/__init__.py +14 -0
ocnn/nn/kernels/autotuner.py +416 -0
ocnn/nn/kernels/config.py +67 -0
ocnn/nn/kernels/conv_bwd_implicit_gemm.py +229 -0
ocnn/nn/kernels/conv_bwd_implicit_gemm_splitk.py +347 -0
ocnn/nn/kernels/conv_fwd_implicit_gemm.py +109 -0
ocnn/nn/kernels/conv_fwd_implicit_gemm_splitk.py +150 -0
ocnn/nn/kernels/utils.py +44 -0
ocnn/nn/octree2col.py +53 -53
ocnn/nn/octree2vox.py +50 -50
ocnn/nn/octree_align.py +46 -46
ocnn/nn/octree_conv.py +430 -429
ocnn/nn/octree_conv_t.py +148 -0
ocnn/nn/octree_drop.py +55 -55
ocnn/nn/octree_dwconv.py +222 -222
ocnn/nn/octree_gconv.py +79 -79
ocnn/nn/octree_interp.py +196 -196
ocnn/nn/octree_norm.py +126 -126
ocnn/nn/octree_pad.py +39 -39
ocnn/nn/octree_pool.py +200 -200
ocnn/octree/__init__.py +22 -22
ocnn/octree/octree.py +770 -770
ocnn/octree/points.py +384 -323
ocnn/octree/shuffled_key.py +115 -115
ocnn/utils.py +205 -205
{ocnn-2.2.8.dist-info → ocnn-2.3.0.dist-info}/METADATA +117 -111
ocnn-2.3.0.dist-info/RECORD +45 -0
{ocnn-2.2.8.dist-info → ocnn-2.3.0.dist-info}/WHEEL +1 -1
{ocnn-2.2.8.dist-info → ocnn-2.3.0.dist-info}/licenses/LICENSE +21 -21
ocnn-2.2.8.dist-info/RECORD +0 -36
{ocnn-2.2.8.dist-info → ocnn-2.3.0.dist-info}/top_level.txt +0 -0

ocnn/nn/kernels/conv_fwd_implicit_gemm_splitk.py ADDED Viewed

@@ -0,0 +1,150 @@
+import math
+import torch
+import triton
+import triton.language as tl
+from .utils import get_num_sm
+from .autotuner import triton_autotune, autotune
+from . import config
+from .conv_fwd_implicit_gemm import conv_fwd_implicit_gemm_kernel
+@triton_autotune(
+    configs=config.autotune_config,
+    key=['LOGN', 'Ci', 'Co', 'V', 'SPLITK', 'allow_tf32'],
+)
+@triton.jit
+def conv_fwd_implicit_gemm_splitk_kernel(
+    input,
+    weight,
+    bias,
+    neighbor,
+    output,
+    # Tensor dimensions
+    N, LOGN, Ci, Co, V: tl.constexpr,
+    # Meta-parameters
+    B1: tl.constexpr,   # Block size for N dimension
+    B2: tl.constexpr,   # Block size for Co dimension
+    BK: tl.constexpr,   # Block size for K dimension (V * Ci)
+    SPLITK: tl.constexpr,  # Split K dimension
+    allow_tf32: tl.constexpr,  # Allow TF32 precision for matmuls
+):
+    """
+    Sparse submanifold convolution forward kernel using implicit GEMM with split K dimension.
+    Each program computes one (B1, B2) tile of the output for one slice of the
+    reduction axis. The reduction has V * cdiv(Ci, BK) steps; the program with
+    index `block_id_k` on grid axis 1 handles steps [k_start, k_end) and stores
+    its float32 partial tile at `output + block_id_k * N * Co`. The partial
+    tiles still have to be summed over the split index by the caller. Bias is
+    added only by the `block_id_k == 0` slice, so it is counted once in that sum.
+    `LOGN` is not read in the kernel body; it only appears in the autotune key.
+    Args:
+        input (pointer): A pointer to the input tensor of shape (N, Ci)
+        weight (pointer): A pointer to the weight tensor of shape (Co, V, Ci)
+        bias (pointer): A pointer to the bias tensor of shape (Co), or None to skip the bias add
+        neighbor (pointer): A pointer to the neighbor tensor of shape (N, V); an entry of -1 is
+            treated as "no neighbor" and contributes zeros
+        output (pointer): A pointer to the output buffer; this kernel writes slice `block_id_k`
+            of a buffer indexed as (SPLITK, N, Co)
+    """
+    block_id_k = tl.program_id(axis=1)  # SplitK dimension
+    # Grid axis 0 enumerates (N-block, Co-block) pairs, Co-block index varying fastest.
+    block_id = tl.program_id(axis=0)
+    block_dim_co = tl.cdiv(Co, B2)
+    block_id_co = block_id % block_dim_co
+    block_id_n = block_id // block_dim_co
+    # Create pointers for submatrices of A and B.
+    num_k = tl.cdiv(Ci, BK)  # Number of blocks in K dimension
+    # Split the num_k * V reduction steps into SPLITK contiguous ranges.
+    k_start = tl.cdiv(num_k * V * block_id_k, SPLITK)
+    k_end = tl.cdiv(num_k * V * (block_id_k + 1), SPLITK)
+    # Row/column indices are wrapped with % so loads for a partial edge tile stay
+    # in range; the final store uses unwrapped indices plus a mask instead.
+    offset_n = (block_id_n * B1 + tl.arange(0, B1)) % N         # (B1,)
+    offset_co = (block_id_co * B2 + tl.arange(0, B2)) % Co      # (B2,)
+    offset_k = tl.arange(0, BK)                                 # (BK,)
+    # Create a block of the output matrix C.
+    accumulator = tl.zeros((B1, B2), dtype=tl.float32)
+    # Decompose the first reduction step into (kernel position, Ci block).
+    curr_v = k_start // num_k
+    curr_bk = k_start % num_k
+    # Weight is addressed as co * V * Ci + v * Ci + ci, i.e. row-major (Co, V, Ci).
+    weight_offset_base = curr_v * Ci + curr_bk * BK
+    weight_ptr = weight + weight_offset_base + (offset_co[None, :] * V * Ci + offset_k[:, None])     # (BK, B2)
+    # Iterate along V*Ci dimension.
+    for k in range(k_start, k_end):
+        v = k // num_k
+        bk = k % num_k
+        # Calculate pointers to input matrix.
+        # neighbor[n, v] is the input row gathered for output row n at kernel position v.
+        neighbor_offset_n = tl.load(neighbor + offset_n * V + v).to(tl.int64)                   # (B1,)
+        input_ptr = input + bk * BK + (neighbor_offset_n[:, None].to(tl.int64) * Ci + offset_k[None, :])     # (B1, BK)
+        # Load the next block of input and weight.
+        # Rows without a neighbor (-1) and channels past Ci are loaded as 0.0.
+        neigh_mask = neighbor_offset_n != -1
+        k_mask = offset_k < Ci - bk * BK
+        input_block = tl.load(input_ptr, mask=neigh_mask[:, None] & k_mask[None, :], other=0.0)
+        weight_block = tl.load(weight_ptr, mask=k_mask[:, None], other=0.0)
+        # Accumulate along the K dimension.
+        accumulator = tl.dot(input_block, weight_block, accumulator,
+                             input_precision='tf32' if allow_tf32 else 'ieee')                  # (B1, B2)
+        # Advance the pointers to the next Ci block.
+        # A full block advances by BK; the last block of a Ci row advances by the
+        # remainder, which lands exactly on the start of the next kernel position.
+        weight_ptr += min(BK, Ci - bk * BK)
+    # add bias
+    if bias is not None and block_id_k == 0:
+        bias_block = tl.load(bias + offset_co)
+        accumulator += bias_block[None, :]
+    # Write back the block of the output matrix with masks.
+    out_offset_n = block_id_n * B1 + tl.arange(0, B1)
+    out_offset_co = block_id_co * B2 + tl.arange(0, B2)
+    out_ptr = output + block_id_k * N * Co + (out_offset_n[:, None] * Co + out_offset_co[None, :])
+    out_mask = (out_offset_n[:, None] < N) & (out_offset_co[None, :] < Co)
+    tl.store(out_ptr, accumulator, mask=out_mask)
+def conv_fwd_implicit_gemm_splitk_configs(input, weight, bias, neighbor):
+    """
+    Enumerate the candidate SPLITK values to autotune over for the given inputs.
+    The number of output tiles is estimated with 128 x 128 tiles over (N, Co).
+    Candidates are consecutive powers of two whose exponent range is derived from
+    the ratio between the streaming-multiprocessor count (lower end) or 32 times
+    that count (upper end) and the tile estimate. At least one candidate is
+    always returned.
+    Args:
+        input: Unused; present so the signature matches the tuned function.
+        weight: Weight tensor; only ``weight.shape[0]`` (Co) is read.
+        bias: Unused.
+        neighbor: Neighbor tensor; only ``neighbor.shape[0]`` (N) is read.
+    Returns:
+        A list of dicts of the form ``{'SPLITK': 2 ** i}``.
+    """
+    tile = 128
+    num_rows = neighbor.shape[0]
+    num_out = weight.shape[0]
+    # Ceil-divide both extents by the tile size to count output tiles.
+    tile_count = ((num_rows + tile - 1) // tile) * ((num_out + tile - 1) // tile)
+    num_sm = get_num_sm()
+    first_exp = max(0, int(math.log2(num_sm / tile_count)))
+    stop_exp = max(1, int(math.log2(32 * num_sm / tile_count) + 1))
+    return [{'SPLITK': 2 ** exp} for exp in range(first_exp, stop_exp)]
+def conv_fwd_implicit_gemm_splitk_keys(input, weight, bias, neighbor):
+    """
+    Build the autotune key string for the given inputs.
+    The key buckets N by the integer part of log2(N) and records Ci, Co and V
+    exactly, so inputs that differ only within an N bucket share one key.
+    """
+    num_rows = neighbor.shape[0]
+    in_channels = input.shape[1]
+    out_channels, kernel_volume = weight.shape[0], weight.shape[1]
+    log_rows = int(math.log2(num_rows))
+    return '(2^{}, {}, {}, {})'.format(log_rows, in_channels, out_channels, kernel_volume)
+@autotune(
+    config_fn=conv_fwd_implicit_gemm_splitk_configs,
+    key_fn=conv_fwd_implicit_gemm_splitk_keys,
+)
+def conv_fwd_implicit_gemm_splitk(
+    input: torch.Tensor,
+    weight: torch.Tensor,
+    bias: torch.Tensor,
+    neighbor: torch.Tensor,
+    SPLITK: int = 1,
+) -> torch.Tensor:
+    """
+    Run the implicit-GEMM convolution forward pass, optionally splitting the reduction.
+    With ``SPLITK == 1`` the non-split kernel writes the result directly in
+    ``input.dtype``. Otherwise each of the ``SPLITK`` slices writes a float32
+    partial result, and the partials are summed and cast back to ``input.dtype``.
+    Args:
+        input: Contiguous tensor whose second dimension is Ci.
+        weight: Contiguous tensor of shape (Co, V, Ci).
+        bias: Passed straight through to the kernel (the split-K kernel skips
+            the bias add when it is None).
+        neighbor: Contiguous tensor whose first dimension N sets the number of
+            output rows.
+        SPLITK: Number of slices the reduction axis is split into.
+            NOTE(review): presumably chosen by the ``autotune`` decorator from
+            ``config_fn`` - confirm against the autotuner.
+    Returns:
+        Tensor of shape (N, Co) with ``input.dtype`` on ``input.device``.
+    Raises:
+        AssertionError: If the channel dimensions disagree or an operand is not contiguous.
+        ValueError: From ``math.log2`` when N is 0.
+    """
+    assert input.shape[1] == weight.shape[2], "Incompatible dimensions"
+    assert input.is_contiguous(), "Matrix input must be contiguous"
+    assert weight.is_contiguous(), "Matrix weight must be contiguous"
+    assert neighbor.is_contiguous(), "Matrix neighbor must be contiguous"
+    N, Ci, Co, V = neighbor.shape[0], input.shape[1], weight.shape[0], weight.shape[1]
+    # Integer log2 of N; the kernels list it in their autotune key.
+    LOGN = int(math.log2(N))
+    # Launch the kernel.
+    if SPLITK == 1:
+        # No split: one program per (Co-block, N-block) tile, result written in place.
+        output = torch.empty((N, Co), device=input.device, dtype=input.dtype)
+        grid = lambda META: (triton.cdiv(Co, META['B2']) * triton.cdiv(N, META['B1']),)
+        conv_fwd_implicit_gemm_kernel[grid](
+            input, weight, bias, neighbor, output,
+            N, LOGN, Ci, Co, V,
+            allow_tf32=config.allow_tf32,
+        )
+        return output
+    else:
+        # Split-K: grid axis 1 indexes the slice; each slice fills output[slice].
+        output = torch.empty((SPLITK, N, Co), device=input.device, dtype=torch.float32)
+        grid = lambda META: (triton.cdiv(Co, META['B2']) * triton.cdiv(N, META['B1']), SPLITK)
+        conv_fwd_implicit_gemm_splitk_kernel[grid](
+            input, weight, bias, neighbor, output,
+            N, LOGN, Ci, Co, V,
+            SPLITK=SPLITK,
+            allow_tf32=config.allow_tf32,
+        )
+        # Reduce the float32 partials, then cast to the caller's dtype.
+        return output.sum(dim=0).to(input.dtype)

ocnn/nn/kernels/utils.py ADDED Viewed

@@ -0,0 +1,44 @@
+from typing import *
+import torch
+import triton
+def get_gpu_name():
+    """Return the device name reported by ``torch.cuda.get_device_name()``."""
+    device_name = torch.cuda.get_device_name()
+    return device_name
+def get_platform_name():
+    """
+    Return a short identifier for the GPU platform.
+    Returns:
+        ``'unknown'`` when ``torch.cuda.is_available()`` is false; otherwise
+        ``'hip'`` if ``torch.version.hip`` exists and is not None, else ``'cuda'``.
+    """
+    if not torch.cuda.is_available():
+        return 'unknown'
+    has_hip = getattr(torch.version, 'hip', None) is not None
+    return 'hip' if has_hip else 'cuda'
+def get_num_sm():
+    """Return ``multi_processor_count`` from the properties of the ``"cuda"`` device."""
+    properties = torch.cuda.get_device_properties("cuda")
+    return properties.multi_processor_count
+def get_autotune_config(
+    default: List[triton.Config] = None,
+    platform: Dict[str, List[triton.Config]] = None,
+    device: Dict[str, List[triton.Config]] = None,
+) -> List[triton.Config]:
+    """
+    Pick the autotune configuration list for the current device and platform.
+    Lookup order is ``device`` first, then ``platform``, then ``default``. A
+    table entry matches when its key is a case-insensitive substring of the
+    GPU name (for ``device``) or of the platform name (for ``platform``); the
+    first matching entry in iteration order wins.
+    Args:
+        default: Configurations used when no table entry matches.
+        platform: Maps a platform-name fragment to its configurations.
+        device: Maps a GPU-name fragment to its configurations.
+    Returns:
+        The configurations of the first match, or ``default``.
+    Raises:
+        ValueError: If nothing matches and ``default`` is None.
+    """
+    # Tables are consulted in priority order; the name is only queried when
+    # the corresponding table was supplied.
+    lookups = ((device, get_gpu_name), (platform, get_platform_name))
+    for table, query_name in lookups:
+        if table is None:
+            continue
+        current = query_name().lower()
+        for fragment, configs in table.items():
+            if fragment.lower() in current:
+                return configs
+    if default is not None:
+        return default
+    raise ValueError("No autotune configuration found for the current platform and device.")

ocnn/nn/octree2col.py CHANGED Viewed

@@ -1,53 +1,53 @@
-# --------------------------------------------------------
-# Octree-based Sparse Convolutional Neural Networks
-# Copyright (c) 2022 Peng-Shuai Wang <wangps@hotmail.com>
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Peng-Shuai Wang
-# --------------------------------------------------------
-import torch
-import torch.nn
-from ocnn.octree import Octree
-from ocnn.utils import scatter_add
-def octree2col(data: torch.Tensor, octree: Octree, depth: int,
-               kernel_size: str = '333', stride: int = 1, nempty: bool = False):
-  r''' Gathers the neighboring features for convolutions.
-  Args:
-    data (torch.Tensor): The input data.
-    octree (Octree): The corresponding octree.
-    depth (int): The depth of current octree.
-    kernel_size (str): The kernel shape, choose from :obj:`333`, :obj:`311`,
-        :obj:`131`, :obj:`113`, :obj:`222`, :obj:`331`, :obj:`133`, and
-        :obj:`313`.
-    stride (int): The stride of neighborhoods (:obj:`1` or :obj:`2`). If the
-        stride is :obj:`2`, it always returns the neighborhood of the first
-        siblings, and the number of elements of output tensor is
-        :obj:`octree.nnum[depth] / 8`.
-    nempty (bool): If True, only returns the neighborhoods of the non-empty
-        octree nodes.
-  '''
-  neigh = octree.get_neigh(depth, kernel_size, stride, nempty)
-  size = (neigh.shape[0], neigh.shape[1], data.shape[1])
-  out = torch.zeros(size, dtype=data.dtype, device=data.device)
-  valid = neigh >= 0
-  out[valid] = data[neigh[valid]]  # (N, K, C)
-  return out
-def col2octree(data: torch.Tensor, octree: Octree, depth: int,
-               kernel_size: str = '333', stride: int = 1, nempty: bool = False):
-  r''' Scatters the convolution features to an octree.
-  Please refer to :func:`octree2col` for the usage of function parameters.
-  '''
-  neigh = octree.get_neigh(depth, kernel_size, stride, nempty)
-  valid = neigh >= 0
-  dim_size = octree.nnum_nempty[depth] if nempty else octree.nnum[depth]
-  out = scatter_add(data[valid], neigh[valid], dim=0, dim_size=dim_size)
-  return out
+# --------------------------------------------------------
+# Octree-based Sparse Convolutional Neural Networks
+# Copyright (c) 2022 Peng-Shuai Wang <wangps@hotmail.com>
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Peng-Shuai Wang
+# --------------------------------------------------------
+import torch
+import torch.nn
+from ocnn.octree import Octree
+from ocnn.utils import scatter_add
+def octree2col(data: torch.Tensor, octree: Octree, depth: int,
+               kernel_size: str = '333', stride: int = 1, nempty: bool = False):
+  r''' Collects, for every octree node, the features of its neighbors so that
+  a convolution can be applied to them.
+  Args:
+    data (torch.Tensor): The input data.
+    octree (Octree): The corresponding octree.
+    depth (int): The depth of current octree.
+    kernel_size (str): The kernel shape, choose from :obj:`333`, :obj:`311`,
+        :obj:`131`, :obj:`113`, :obj:`222`, :obj:`331`, :obj:`133`, and
+        :obj:`313`.
+    stride (int): The stride of neighborhoods (:obj:`1` or :obj:`2`). If the
+        stride is :obj:`2`, it always returns the neighborhood of the first
+        siblings, and the number of elements of output tensor is
+        :obj:`octree.nnum[depth] / 8`.
+    nempty (bool): If True, only returns the neighborhoods of the non-empty
+        octree nodes.
+  Returns:
+    A tensor of shape (N, K, C) with the dtype and device of :attr:`data`;
+    slots whose neighbor index is negative are left as zeros.
+  '''
+  neigh = octree.get_neigh(depth, kernel_size, stride, nempty)
+  num_node, num_neigh = neigh.shape[0], neigh.shape[1]
+  channel = data.shape[1]
+  out = torch.zeros(
+      (num_node, num_neigh, channel), dtype=data.dtype, device=data.device)
+  # Negative indices mark missing neighbors; only copy the existing ones.
+  mask = neigh >= 0
+  out[mask] = data[neigh[mask]]  # (N, K, C)
+  return out
+def col2octree(data: torch.Tensor, octree: Octree, depth: int,
+               kernel_size: str = '333', stride: int = 1, nempty: bool = False):
+  r''' Accumulates per-neighbor convolution features back onto octree nodes.
+  Please refer to :func:`octree2col` for the usage of function parameters.
+  '''
+  neigh = octree.get_neigh(depth, kernel_size, stride, nempty)
+  # Negative indices mark missing neighbors and are skipped.
+  mask = neigh >= 0
+  if nempty:
+    num_node = octree.nnum_nempty[depth]
+  else:
+    num_node = octree.nnum[depth]
+  return scatter_add(data[mask], neigh[mask], dim=0, dim_size=num_node)

ocnn/nn/octree2vox.py CHANGED Viewed

@@ -1,50 +1,50 @@
-# --------------------------------------------------------
-# Octree-based Sparse Convolutional Neural Networks
-# Copyright (c) 2022 Peng-Shuai Wang <wangps@hotmail.com>
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Peng-Shuai Wang
-# --------------------------------------------------------
-import torch
-from ocnn.octree import Octree
-def octree2voxel(data: torch.Tensor, octree: Octree, depth: int,
-                 nempty: bool = False):
-  r''' Converts the input feature to the full-voxel-based representation.
-  Args:
-    data (torch.Tensor): The input feature.
-    octree (Octree): The corresponding octree.
-    depth (int): The depth of current octree.
-    nempty (bool): If True, :attr:`data` only contains the features of non-empty
-        octree nodes.
-  '''
-  x, y, z, b = octree.xyzb(depth, nempty)
-  num = 1 << depth
-  channel = data.shape[1]
-  vox = data.new_zeros([octree.batch_size, num, num, num, channel])
-  vox[b, x, y, z] = data
-  return vox
-class Octree2Voxel(torch.nn.Module):
-  r''' Converts the input feature to the full-voxel-based representation.
-  Please refer to :func:`octree2voxel` for details.
-  '''
-  def __init__(self, nempty: bool = False):
-    super().__init__()
-    self.nempty = nempty
-  def forward(self, data: torch.Tensor, octree: Octree, depth: int):
-    r''''''
-    return octree2voxel(data, octree, depth, self.nempty)
-  def extra_repr(self) -> str:
-    return 'nempty={}'.format(self.nempty)
+# --------------------------------------------------------
+# Octree-based Sparse Convolutional Neural Networks
+# Copyright (c) 2022 Peng-Shuai Wang <wangps@hotmail.com>
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Peng-Shuai Wang
+# --------------------------------------------------------
+import torch
+from ocnn.octree import Octree
+def octree2voxel(data: torch.Tensor, octree: Octree, depth: int,
+                 nempty: bool = False):
+  r''' Scatters octree node features into a dense voxel grid.
+  Args:
+    data (torch.Tensor): The input feature.
+    octree (Octree): The corresponding octree.
+    depth (int): The depth of current octree.
+    nempty (bool): If True, :attr:`data` only contains the features of non-empty
+        octree nodes.
+  Returns:
+    A tensor of shape (batch_size, 2^depth, 2^depth, 2^depth, C) created with
+    :attr:`data`'s dtype and device; voxels without a node stay zero.
+  '''
+  resolution = 1 << depth
+  shape = (octree.batch_size,) + (resolution,) * 3 + (data.shape[1],)
+  x, y, z, b = octree.xyzb(depth, nempty)
+  vox = data.new_zeros(shape)
+  vox[b, x, y, z] = data
+  return vox
+class Octree2Voxel(torch.nn.Module):
+  r''' Module wrapper around :func:`octree2voxel` that stores the
+  :attr:`nempty` flag at construction time.
+  Please refer to :func:`octree2voxel` for details.
+  '''
+  def __init__(self, nempty: bool = False):
+    super().__init__()
+    self.nempty = nempty
+  def forward(self, data: torch.Tensor, octree: Octree, depth: int):
+    r''''''
+    vox = octree2voxel(data, octree, depth, nempty=self.nempty)
+    return vox
+  def extra_repr(self) -> str:
+    return f'nempty={self.nempty}'

ocnn/nn/octree_align.py CHANGED Viewed

@@ -1,46 +1,46 @@
-# --------------------------------------------------------
-# Octree-based Sparse Convolutional Neural Networks
-# Copyright (c) 2022 Peng-Shuai Wang <wangps@hotmail.com>
-# Licensed under The MIT License [see LICENSE for details]
-# Written by Peng-Shuai Wang
-# --------------------------------------------------------
-import torch
-from ocnn.octree import Octree
-def search_value(value: torch.Tensor, key: torch.Tensor, query: torch.Tensor):
-  r''' Searches values according to sorted shuffled keys.
-  Args:
-    value (torch.Tensor): The input tensor with shape (N, C).
-    key (torch.Tensor): The key tensor corresponds to :attr:`value` with shape
-        (N,), which contains sorted shuffled keys of an octree.
-    query (torch.Tensor): The query tensor, which also contains shuffled keys.
-  '''
-  # deal with out-of-bound queries, the indices of these queries
-  # returned by torch.searchsorted equal to `key.shape[0]`
-  out_of_bound = query > key[-1]
-  # search
-  idx = torch.searchsorted(key, query)
-  idx[out_of_bound] = -1   # to avoid overflow when executing the following line
-  found = key[idx] == query
-  # assign the found value to the output
-  out = torch.zeros(query.shape[0], value.shape[1], device=value.device)
-  out[found] = value[idx[found]]
-  return out
-def octree_align(value: torch.Tensor, octree: Octree, octree_query: Octree,
-                 depth: int, nempty: bool = False):
-  r''' Wraps :func:`octree_align` to take octrees as input for convenience.
-  '''
-  key = octree.key(depth, nempty)
-  query = octree_query.key(depth, nempty)
-  assert key.shape[0] == value.shape[0]
-  return search_value(value, key, query)
+# --------------------------------------------------------
+# Octree-based Sparse Convolutional Neural Networks
+# Copyright (c) 2022 Peng-Shuai Wang <wangps@hotmail.com>
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Peng-Shuai Wang
+# --------------------------------------------------------
+import torch
+from ocnn.octree import Octree
+def search_value(value: torch.Tensor, key: torch.Tensor, query: torch.Tensor):
+  r''' Searches values according to sorted shuffled keys.
+  Args:
+    value (torch.Tensor): The input tensor with shape (N, C).
+    key (torch.Tensor): The key tensor corresponds to :attr:`value` with shape
+        (N,), which contains sorted shuffled keys of an octree.
+    query (torch.Tensor): The query tensor, which also contains shuffled keys.
+  Returns:
+    A tensor with shape (M, C), where M is :obj:`query.shape[0]`, with the
+    dtype and device of :attr:`value`. Row :obj:`i` holds the value whose key
+    equals :obj:`query[i]`, or zeros if no such key exists.
+  '''
+  # allocate the output with the dtype of `value`, so that the masked
+  # assignment below also works for non-float32 features (e.g. half/double)
+  out = torch.zeros(query.shape[0], value.shape[1],
+                    dtype=value.dtype, device=value.device)
+  # with no keys nothing can be found, and `key[-1]` below would raise
+  if key.shape[0] == 0:
+    return out
+  # deal with out-of-bound queries, the indices of these queries
+  # returned by torch.searchsorted equal to `key.shape[0]`
+  out_of_bound = query > key[-1]
+  # search
+  idx = torch.searchsorted(key, query)
+  idx[out_of_bound] = -1   # to avoid overflow when executing the following line
+  found = key[idx] == query
+  # assign the found value to the output
+  out[found] = value[idx[found]]
+  return out
+def octree_align(value: torch.Tensor, octree: Octree, octree_query: Octree,
+                 depth: int, nempty: bool = False):
+  r''' Wraps :func:`search_value` to take octrees as input for convenience.
+  Args:
+    value (torch.Tensor): The features defined on :attr:`octree`; its first
+        dimension must equal the number of keys of :attr:`octree` at
+        :attr:`depth`.
+    octree (Octree): The octree that provides the keys of :attr:`value`.
+    octree_query (Octree): The octree that provides the query keys.
+    depth (int): The octree depth at which the keys are taken.
+    nempty (bool): Forwarded to :obj:`Octree.key` for both octrees.
+  '''
+  key = octree.key(depth, nempty)
+  query = octree_query.key(depth, nempty)
+  # each row of `value` must correspond to exactly one key
+  assert key.shape[0] == value.shape[0]
+  return search_value(value, key, query)

ocnn 2.2.8__py3-none-any.whl → 2.3.0__py3-none-any.whl

ocnn 2.2.8__py3-none-any.whl → 2.3.0__py3-none-any.whl