PyPI - warp-lang - Versions diffs - 1.5.1__py3-none-manylinux2014_aarch64.whl → 1.6.1__py3-none-manylinux2014_aarch64.whl - Mend

warp-lang 1.5.1__py3-none-manylinux2014_aarch64.whl → 1.6.1__py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (131)

warp/__init__.py +5 -0
warp/autograd.py +414 -191
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +40 -12
warp/build_dll.py +13 -6
warp/builtins.py +1077 -481
warp/codegen.py +250 -122
warp/config.py +65 -21
warp/context.py +500 -149
warp/examples/assets/square_cloth.usd +0 -0
warp/examples/benchmarks/benchmark_gemm.py +27 -18
warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
warp/examples/core/example_marching_cubes.py +1 -1
warp/examples/core/example_mesh.py +1 -1
warp/examples/core/example_torch.py +18 -34
warp/examples/core/example_wave.py +1 -1
warp/examples/fem/example_apic_fluid.py +1 -0
warp/examples/fem/example_mixed_elasticity.py +1 -1
warp/examples/optim/example_bounce.py +1 -1
warp/examples/optim/example_cloth_throw.py +1 -1
warp/examples/optim/example_diffray.py +4 -15
warp/examples/optim/example_drone.py +1 -1
warp/examples/optim/example_softbody_properties.py +392 -0
warp/examples/optim/example_trajectory.py +1 -3
warp/examples/optim/example_walker.py +5 -0
warp/examples/sim/example_cartpole.py +0 -2
warp/examples/sim/example_cloth_self_contact.py +314 -0
warp/examples/sim/example_granular_collision_sdf.py +4 -5
warp/examples/sim/example_jacobian_ik.py +0 -2
warp/examples/sim/example_quadruped.py +5 -2
warp/examples/tile/example_tile_cholesky.py +79 -0
warp/examples/tile/example_tile_convolution.py +2 -2
warp/examples/tile/example_tile_fft.py +2 -2
warp/examples/tile/example_tile_filtering.py +3 -3
warp/examples/tile/example_tile_matmul.py +4 -4
warp/examples/tile/example_tile_mlp.py +12 -12
warp/examples/tile/example_tile_nbody.py +191 -0
warp/examples/tile/example_tile_walker.py +319 -0
warp/math.py +147 -0
warp/native/array.h +12 -0
warp/native/builtin.h +0 -1
warp/native/bvh.cpp +149 -70
warp/native/bvh.cu +287 -68
warp/native/bvh.h +195 -85
warp/native/clang/clang.cpp +6 -2
warp/native/crt.h +1 -0
warp/native/cuda_util.cpp +35 -0
warp/native/cuda_util.h +5 -0
warp/native/exports.h +40 -40
warp/native/intersect.h +17 -0
warp/native/mat.h +57 -3
warp/native/mathdx.cpp +19 -0
warp/native/mesh.cpp +25 -8
warp/native/mesh.cu +153 -101
warp/native/mesh.h +482 -403
warp/native/quat.h +40 -0
warp/native/solid_angle.h +7 -0
warp/native/sort.cpp +85 -0
warp/native/sort.cu +34 -0
warp/native/sort.h +3 -1
warp/native/spatial.h +11 -0
warp/native/tile.h +1189 -664
warp/native/tile_reduce.h +8 -6
warp/native/vec.h +41 -0
warp/native/warp.cpp +8 -1
warp/native/warp.cu +263 -40
warp/native/warp.h +19 -5
warp/optim/linear.py +22 -4
warp/render/render_opengl.py +132 -59
warp/render/render_usd.py +10 -2
warp/sim/__init__.py +6 -1
warp/sim/collide.py +289 -32
warp/sim/import_urdf.py +20 -5
warp/sim/integrator_euler.py +25 -7
warp/sim/integrator_featherstone.py +147 -35
warp/sim/integrator_vbd.py +842 -40
warp/sim/model.py +173 -112
warp/sim/render.py +2 -2
warp/stubs.py +249 -116
warp/tape.py +28 -30
warp/tests/aux_test_module_unload.py +15 -0
warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
warp/tests/test_array.py +100 -0
warp/tests/test_assert.py +242 -0
warp/tests/test_codegen.py +14 -61
warp/tests/test_collision.py +8 -8
warp/tests/test_examples.py +16 -1
warp/tests/test_grad_debug.py +87 -2
warp/tests/test_hash_grid.py +1 -1
warp/tests/test_ipc.py +116 -0
warp/tests/test_launch.py +77 -26
warp/tests/test_mat.py +213 -168
warp/tests/test_math.py +47 -1
warp/tests/test_matmul.py +11 -7
warp/tests/test_matmul_lite.py +4 -4
warp/tests/test_mesh.py +84 -60
warp/tests/test_mesh_query_aabb.py +165 -0
warp/tests/test_mesh_query_point.py +328 -286
warp/tests/test_mesh_query_ray.py +134 -121
warp/tests/test_mlp.py +2 -2
warp/tests/test_operators.py +43 -0
warp/tests/test_overwrite.py +6 -5
warp/tests/test_quat.py +77 -0
warp/tests/test_reload.py +29 -0
warp/tests/test_sim_grad_bounce_linear.py +204 -0
warp/tests/test_static.py +16 -0
warp/tests/test_tape.py +25 -0
warp/tests/test_tile.py +134 -191
warp/tests/test_tile_load.py +399 -0
warp/tests/test_tile_mathdx.py +61 -8
warp/tests/test_tile_mlp.py +17 -17
warp/tests/test_tile_reduce.py +24 -18
warp/tests/test_tile_shared_memory.py +66 -17
warp/tests/test_tile_view.py +165 -0
warp/tests/test_torch.py +35 -0
warp/tests/test_utils.py +36 -24
warp/tests/test_vec.py +110 -0
warp/tests/unittest_suites.py +29 -4
warp/tests/unittest_utils.py +30 -11
warp/thirdparty/unittest_parallel.py +5 -2
warp/types.py +419 -111
warp/utils.py +9 -5
{warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/METADATA +86 -45
{warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/RECORD +129 -118
{warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/WHEEL +1 -1
warp/examples/benchmarks/benchmark_tile.py +0 -179
warp/native/tile_gemm.h +0 -341
{warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/LICENSE.md +0 -0
{warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/top_level.txt +0 -0

warp/types.py CHANGED

@@ -171,8 +171,7 @@ def vector(length, dtype):
                     iter(value)
                 except TypeError:
                     raise TypeError(
-                        f"Expected to assign a slice from a sequence of values "
-                        f"but got `{type(value).__name__}` instead"
+                        f"Expected to assign a slice from a sequence of values but got `{type(value).__name__}` instead"
                     ) from None
                 if self._wp_scalar_type_ == float16:
@@ -350,6 +349,9 @@ def matrix(shape, dtype):
                     f"Invalid number of arguments in matrix constructor, expected {self._length_} elements, got {num_args}"
                 )
+        def __len__(self):
+            return self._shape_[0]
         def __add__(self, y):
             return warp.add(self, y)
@@ -419,7 +421,7 @@ def matrix(shape, dtype):
                 iter(v)
             except TypeError:
                 raise TypeError(
-                    f"Expected to assign a slice from a sequence of values " f"but got `{type(v).__name__}` instead"
+                    f"Expected to assign a slice from a sequence of values but got `{type(v).__name__}` instead"
                 ) from None
             row_start = r * self._shape_[1]
@@ -676,6 +678,10 @@ def transformation(dtype=Any):
         def __init__(self, *args, **kwargs):
             if len(args) == 1 and len(kwargs) == 0:
+                if is_float(args[0]):
+                    # Initialize from a single scalar.
+                    super().__init__(args[0])
+                    return
                 if args[0]._wp_generic_type_str_ == self._wp_generic_type_str_:
                     # Copy constructor.
                     super().__init__(*args[0])
@@ -1314,7 +1320,7 @@ def type_repr(t):
     if is_array(t):
         return str(f"array(ndim={t.ndim}, dtype={t.dtype})")
     if is_tile(t):
-        return str(f"tile(dtype={t.dtype}, m={t.M}, n={t.N})")
+        return str(f"tile(dtype={t.dtype}, shape={t.shape}")
     if type_is_vector(t):
         return str(f"vector(length={t._shape_[0]}, dtype={t._wp_scalar_type_})")
     if type_is_matrix(t):
@@ -1357,6 +1363,11 @@ def type_is_matrix(t):
     return getattr(t, "_wp_generic_type_hint_", None) is Matrix
+# returns True if the passed *type* is a transformation
+def type_is_transformation(t):
+    return getattr(t, "_wp_generic_type_hint_", None) is Transformation
 value_types = (int, float, builtins.bool) + scalar_types
@@ -1514,7 +1525,7 @@ def strides_from_shape(shape: Tuple, dtype):
 def check_array_shape(shape: Tuple):
-    """Checks that the size in each dimension is positive and less than 2^32."""
+    """Checks that the size in each dimension is positive and less than 2^31."""
     for dim_index, dim_size in enumerate(shape):
         if dim_size < 0:
@@ -1701,8 +1712,22 @@ class array(Array):
                     )
         elif length is not None:
             # backward compatibility
+            warp.utils.warn(
+                "The 'length' keyword is deprecated and will be removed in a future version. Use 'shape' instead.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
             shape = (length,)
+        if owner:
+            warp.utils.warn(
+                "The 'owner' keyword in the array initializer is\n"
+                "deprecated and will be removed in a future version. It currently has no effect.\n"
+                "Pass a function to the 'deleter' keyword instead.",
+                category=DeprecationWarning,
+                stacklevel=2,
+            )
         # determine the construction path from the given arguments
         if data is not None:
             # data or ptr, not both
@@ -1734,32 +1759,6 @@ class array(Array):
         if not hasattr(data, "__len__"):
             raise RuntimeError(f"Data must be a sequence or array, got scalar {data}")
-        if hasattr(data, "__cuda_array_interface__"):
-            try:
-                # Performance note: try first, ask questions later
-                device = warp.context.runtime.get_device(device)
-            except Exception:
-                # Fallback to using the public API for retrieving the device,
-                # which takes take of initializing Warp if needed.
-                device = warp.context.get_device(device)
-            if device.is_cuda:
-                desc = data.__cuda_array_interface__
-                shape = desc.get("shape")
-                strides = desc.get("strides")
-                dtype = np_dtype_to_warp_type[np.dtype(desc.get("typestr"))]
-                ptr = desc.get("data")[0]
-                self._init_from_ptr(ptr, dtype, shape, strides, None, device, False, None)
-                # keep a ref to the source data to keep allocation alive
-                self._ref = data
-                return
-            else:
-                raise RuntimeError(
-                    f"Trying to construct a Warp array from data argument's __cuda_array_interface__ but {device} is not CUDA-capable"
-                )
         if hasattr(dtype, "_wp_scalar_type_"):
             dtype_shape = dtype._shape_
             dtype_ndim = len(dtype_shape)
@@ -1769,6 +1768,76 @@ class array(Array):
             dtype_ndim = 0
             scalar_dtype = dtype
+        try:
+            # Performance note: try first, ask questions later
+            device = warp.context.runtime.get_device(device)
+        except Exception:
+            # Fallback to using the public API for retrieving the device,
+            # which takes take of initializing Warp if needed.
+            device = warp.context.get_device(device)
+        if device.is_cuda and hasattr(data, "__cuda_array_interface__"):
+            desc = data.__cuda_array_interface__
+            data_shape = desc.get("shape")
+            data_strides = desc.get("strides")
+            data_dtype = np.dtype(desc.get("typestr"))
+            data_ptr = desc.get("data")[0]
+            if dtype == Any:
+                dtype = np_dtype_to_warp_type[data_dtype]
+            if data_strides is None:
+                data_strides = strides_from_shape(data_shape, dtype)
+            data_ndim = len(data_shape)
+            # determine whether the input needs reshaping
+            target_npshape = None
+            if shape is not None:
+                target_npshape = (*shape, *dtype_shape)
+            elif dtype_ndim > 0:
+                # prune inner dimensions of length 1
+                while data_ndim > 1 and data_shape[-1] == 1:
+                    data_shape = data_shape[:-1]
+                # if the inner dims don't match exactly, check if the innermost dim is a multiple of type length
+                if data_ndim < dtype_ndim or data_shape[-dtype_ndim:] != dtype_shape:
+                    if data_shape[-1] == dtype._length_:
+                        target_npshape = (*data_shape[:-1], *dtype_shape)
+                    elif data_shape[-1] % dtype._length_ == 0:
+                        target_npshape = (*data_shape[:-1], data_shape[-1] // dtype._length_, *dtype_shape)
+                    else:
+                        if dtype_ndim == 1:
+                            raise RuntimeError(
+                                f"The inner dimensions of the input data are not compatible with the requested vector type {warp.context.type_str(dtype)}: expected an inner dimension that is a multiple of {dtype._length_}"
+                            )
+                        else:
+                            raise RuntimeError(
+                                f"The inner dimensions of the input data are not compatible with the requested matrix type {warp.context.type_str(dtype)}: expected inner dimensions {dtype._shape_} or a multiple of {dtype._length_}"
+                            )
+            if target_npshape is None:
+                target_npshape = data_shape if shape is None else shape
+            # determine final shape and strides
+            if dtype_ndim > 0:
+                # make sure the inner dims are contiguous for vector/matrix types
+                scalar_size = type_size_in_bytes(dtype._wp_scalar_type_)
+                inner_contiguous = data_strides[-1] == scalar_size
+                if inner_contiguous and dtype_ndim > 1:
+                    inner_contiguous = data_strides[-2] == scalar_size * dtype_shape[-1]
+                shape = target_npshape[:-dtype_ndim] or (1,)
+                strides = data_strides if shape == data_shape else strides_from_shape(shape, dtype)
+            else:
+                shape = target_npshape or (1,)
+                strides = data_strides if shape == data_shape else strides_from_shape(shape, dtype)
+            self._init_from_ptr(data_ptr, dtype, shape, strides, None, device, False, None)
+            # keep a ref to the source data to keep allocation alive
+            self._ref = data
+            return
         # convert input data to ndarray (handles lists, tuples, etc.) and determine dtype
         if dtype == Any:
             # infer dtype from data
@@ -1971,7 +2040,21 @@ class array(Array):
         else:
             strides = tuple(strides)
             is_contiguous = strides == contiguous_strides
-            capacity = shape[0] * strides[0]
+            # To calculate the required capacity, find the dimension with largest stride.
+            # Normally it is the first one, but it could be different (e.g., transposed arrays).
+            max_stride = strides[0]
+            max_dim = 0
+            for i in range(1, ndim):
+                if strides[i] > max_stride:
+                    max_stride = strides[i]
+                    max_dim = i
+            if max_stride > 0:
+                capacity = shape[max_dim] * strides[max_dim]
+            else:
+                # single element storage with zero strides
+                capacity = dtype_size
         allocator = device.get_allocator(pinned=pinned)
         if capacity > 0:
@@ -1990,6 +2073,7 @@ class array(Array):
         self.pinned = pinned if device.is_cpu else False
         self.is_contiguous = is_contiguous
         self.deleter = allocator.deleter
+        self._allocator = allocator
     def _init_annotation(self, dtype, ndim):
         self.dtype = dtype
@@ -2706,6 +2790,52 @@ class array(Array):
         a._ref = self
         return a
+    def ipc_handle(self) -> bytes:
+        """Return an IPC handle of the array as a 64-byte ``bytes`` object
+        :func:`from_ipc_handle` can be used with this handle in another process
+        to obtain a :class:`array` that shares the same underlying memory
+        allocation.
+        IPC is currently only supported on Linux.
+        Additionally, IPC is only supported for arrays allocated using
+        the default memory allocator.
+        :class:`Event` objects created with the ``interprocess=True`` argument
+        may similarly be shared between processes to synchronize GPU work.
+        Example:
+            Temporarily using the default memory allocator to allocate an array
+            and get its IPC handle::
+                with wp.ScopedMempool("cuda:0", False):
+                    test_array = wp.full(1024, value=42.0, dtype=wp.float32, device="cuda:0")
+                    ipc_handle = test_array.ipc_handle()
+        Raises:
+            RuntimeError: The array is not associated with a CUDA device.
+            RuntimeError: The CUDA device does not appear to support IPC.
+            RuntimeError: The array was allocated using the :ref:`mempool memory allocator <mempool_allocators>`.
+        """
+        if self.device is None or not self.device.is_cuda:
+            raise RuntimeError("IPC requires a CUDA device")
+        elif self.device.is_ipc_supported is False:
+            raise RuntimeError("IPC does not appear to be supported on this CUDA device")
+        elif isinstance(self._allocator, warp.context.CudaMempoolAllocator):
+            raise RuntimeError(
+                "Currently, IPC is only supported for arrays using the default memory allocator.\n"
+                "See https://nvidia.github.io/warp/modules/allocators.html for instructions on how to disable\n"
+                f"the mempool allocator on device {self.device}."
+            )
+        # Allocate a buffer for the data (64-element char array)
+        ipc_handle_buffer = (ctypes.c_char * 64)()
+        warp.context.runtime.core.cuda_ipc_get_mem_handle(self.ptr, ipc_handle_buffer)
+        return ipc_handle_buffer.raw
 # aliases for arrays with small dimensions
 def array1d(*args, **kwargs):
@@ -2733,7 +2863,13 @@ def array4d(*args, **kwargs):
 def from_ptr(ptr, length, dtype=None, shape=None, device=None):
     warp.utils.warn(
-        "This version of wp.from_ptr() is deprecated. OmniGraph applications should use from_omni_graph_ptr() instead. In the future, wp.from_ptr() will work only with regular pointers.",
+        """This version of wp.from_ptr() is deprecated. OmniGraph
+    applications should use from_omni_graph_ptr() instead. To create an array
+    from a C pointer, use the array constructor and pass the ptr argument as a
+    uint64 value representing the start address in memory where the existing
+    array resides. For example, if using ctypes, pass
+    ptr=ctypes.cast(pointer, ctypes.POINTER(ctypes.c_size_t)).contents.value.
+    Be sure to also specify the dtype and shape parameters.""",
         category=DeprecationWarning,
     )
@@ -2748,6 +2884,51 @@ def from_ptr(ptr, length, dtype=None, shape=None, device=None):
     )
+def _close_cuda_ipc_handle(ptr, size):
+    warp.context.runtime.core.cuda_ipc_close_mem_handle(ptr)
+def from_ipc_handle(
+    handle: bytes, dtype, shape: Tuple[int, ...], strides: Optional[Tuple[int, ...]] = None, device=None
+) -> array:
+    """Create an array from an IPC handle.
+    The ``dtype``, ``shape``, and optional ``strides`` arguments should
+    match the values from the :class:`array` from which ``handle`` was created.
+    Args:
+        handle: The interprocess memory handle for an existing device memory allocation.
+        dtype: One of the available `data types <#data-types>`_, such as :class:`warp.float32`, :class:`warp.mat33`, or a custom `struct <#structs>`_.
+        shape: Dimensions of the array.
+        strides: Number of bytes in each dimension between successive elements of the array.
+        device (Devicelike): Device to associate with the array.
+    Returns:
+        An array created from the existing memory allocation described by the interprocess memory handle ``handle``.
+        A copy of the underlying data is not made. Modifications to the array's data will be reflected in the
+        original process from which ``handle`` was exported.
+    Raises:
+        RuntimeError: IPC is not supported on ``device``.
+    """
+    try:
+        # Performance note: try first, ask questions later
+        device = warp.context.runtime.get_device(device)
+    except Exception:
+        # Fallback to using the public API for retrieving the device,
+        # which takes take of initializing Warp if needed.
+        device = warp.context.get_device(device)
+    if device.is_ipc_supported is False:
+        raise RuntimeError(f"IPC is not supported on device {device}.")
+    ptr = warp.context.runtime.core.cuda_ipc_open_mem_handle(device.context, handle)
+    return array(ptr=ptr, dtype=dtype, shape=shape, strides=strides, device=device, deleter=_close_cuda_ipc_handle)
 # A base class for non-contiguous arrays, providing the implementation of common methods like
 # contiguous(), to(), numpy(), list(), assign(), zero_(), and fill_().
 class noncontiguous_array_base(Generic[T]):
@@ -2985,25 +3166,38 @@ def array_type_id(a):
         raise ValueError("Invalid array type")
-# tile expression objects
+# tile object
 class Tile:
     alignment = 16
-    def __init__(self, dtype, M, N, op=None, storage="register", layout="rowmajor", strides=None, owner=True):
+    def __init__(self, dtype, shape, op=None, storage="register", layout="rowmajor", strides=None, owner=True):
         self.dtype = type_to_warp(dtype)
-        self.M = M
-        self.N = N
+        self.shape = shape
         self.op = op
         self.storage = storage
         self.layout = layout
+        self.strides = strides
-        if strides is None:
-            if layout == "rowmajor":
-                self.strides = (N, 1)
-            elif layout == "colmajor":
-                self.strides = (1, M)
-        else:
-            self.strides = strides
+        # handle case where shape is concrete (rather than just Any)
+        if isinstance(self.shape, (list, tuple)):
+            if len(shape) == 0:
+                raise RuntimeError("Empty shape specified, must have at least 1 dimension")
+            # compute total size
+            self.size = 1
+            for s in self.shape:
+                self.size *= s
+            # if strides are not provided compute default strides
+            if self.strides is None:
+                self.strides = [1] * len(self.shape)
+                if layout == "rowmajor":
+                    for i in range(len(self.shape) - 2, -1, -1):
+                        self.strides[i] = self.strides[i + 1] * self.shape[i + 1]
+                else:
+                    for i in range(1, len(shape)):
+                        self.strides[i] = self.strides[i - 1] * self.shape[i - 1]
         self.owner = owner
@@ -3012,9 +3206,9 @@ class Tile:
         from warp.codegen import Var
         if self.storage == "register":
-            return f"wp::tile_register_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N}>"
+            return f"wp::tile_register_t<{Var.type_to_ctype(self.dtype)},wp::tile_layout_register_t<wp::tile_shape_t<{','.join(map(str, self.shape))}>>>"
         elif self.storage == "shared":
-            return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{self.strides[0]}, {self.strides[1]}, {'true' if self.owner else 'false'}>"
+            return f"wp::tile_shared_t<{Var.type_to_ctype(self.dtype)},wp::tile_layout_strided_t<wp::tile_shape_t<{','.join(map(str, self.shape))}>, wp::tile_stride_t<{','.join(map(str, self.strides))}>>, {'true' if self.owner else 'false'}>"
         else:
             raise RuntimeError(f"Unrecognized tile storage type {self.storage}")
@@ -3027,24 +3221,33 @@ class Tile:
         elif self.storage == "shared":
             if self.owner:
                 # allocate new shared memory tile
-                return f"wp::tile_alloc_empty<{Var.type_to_ctype(self.dtype)},{self.M},{self.N},{'true' if requires_grad else 'false'}>()"
+                return f"wp::tile_alloc_empty<{Var.type_to_ctype(self.dtype)},wp::tile_shape_t<{','.join(map(str, self.shape))}>,{'true' if requires_grad else 'false'}>()"
             else:
                 # tile will be initialized by another call, e.g.: tile_transpose()
                 return "NULL"
     # return total tile size in bytes
     def size_in_bytes(self):
-        num_bytes = self.align(type_size_in_bytes(self.dtype) * self.M * self.N)
+        num_bytes = self.align(type_size_in_bytes(self.dtype) * self.size)
         return num_bytes
+    @staticmethod
+    def round_up(bytes):
+        return ((bytes + Tile.alignment - 1) // Tile.alignment) * Tile.alignment
     # align tile size to natural boundary, default 16-bytes
     def align(self, bytes):
-        return ((bytes + self.alignment - 1) // self.alignment) * self.alignment
+        return Tile.round_up(bytes)
 class TileZeros(Tile):
-    def __init__(self, dtype, M, N, storage="register"):
-        Tile.__init__(self, dtype, M, N, op="zeros", storage=storage)
+    def __init__(self, dtype, shape, storage="register"):
+        Tile.__init__(self, dtype, shape, op="zeros", storage=storage)
+class TileOnes(Tile):
+    def __init__(self, dtype, shape, storage="register"):
+        Tile.__init__(self, dtype, shape, op="ones", storage=storage)
 class TileRange(Tile):
@@ -3053,32 +3256,39 @@ class TileRange(Tile):
         self.stop = stop
         self.step = step
-        M = 1
-        N = int((stop - start) / step)
+        n = int((stop - start) / step)
-        Tile.__init__(self, dtype, M, N, op="arange", storage=storage)
+        Tile.__init__(self, dtype, shape=(n,), op="arange", storage=storage)
 class TileConstant(Tile):
-    def __init__(self, dtype, M, N):
-        Tile.__init__(self, dtype, M, N, op="constant", storage="register")
+    def __init__(self, dtype, shape):
+        Tile.__init__(self, dtype, shape, op="constant", storage="register")
 class TileLoad(Tile):
-    def __init__(self, array, M, N, storage="register"):
-        Tile.__init__(self, array.dtype, M, N, op="load", storage=storage)
+    def __init__(self, array, shape, storage="register"):
+        Tile.__init__(self, array.dtype, shape, op="load", storage=storage)
 class TileUnaryMap(Tile):
-    def __init__(self, t, storage="register"):
-        Tile.__init__(self, t.dtype, t.M, t.N, op="unary_map", storage=storage)
+    def __init__(self, t, dtype=None, storage="register"):
+        Tile.__init__(self, dtype, t.shape, op="unary_map", storage=storage)
+        # if no output dtype specified then assume it's the same as the first arg
+        if self.dtype is None:
+            self.dtype = t.dtype
         self.t = t
 class TileBinaryMap(Tile):
-    def __init__(self, a, b, storage="register"):
-        Tile.__init__(self, a.dtype, a.M, a.N, op="binary_map", storage=storage)
+    def __init__(self, a, b, dtype=None, storage="register"):
+        Tile.__init__(self, dtype, a.shape, op="binary_map", storage=storage)
+        # if no output dtype specified then assume it's the same as the first arg
+        if self.dtype is None:
+            self.dtype = a.dtype
         self.a = a
         self.b = b
@@ -3086,7 +3296,7 @@ class TileBinaryMap(Tile):
 class TileShared(Tile):
     def __init__(self, t):
-        Tile.__init__(self, t.dtype, t.M, t.N, "shared", storage="shared")
+        Tile.__init__(self, t.dtype, t.shape, "shared", storage="shared")
         self.t = t
@@ -3095,35 +3305,66 @@ def is_tile(t):
     return isinstance(t, Tile)
+bvh_constructor_values = {"sah": 0, "median": 1, "lbvh": 2}
 class Bvh:
     def __new__(cls, *args, **kwargs):
         instance = super(Bvh, cls).__new__(cls)
         instance.id = None
         return instance
-    def __init__(self, lowers, uppers):
+    def __init__(self, lowers: array, uppers: array, constructor: Optional[str] = None):
         """Class representing a bounding volume hierarchy.
+        Depending on which device the input bounds live, it can be either a CPU tree or a GPU tree.
         Attributes:
-            id: Unique identifier for this bvh object, can be passed to kernels.
+            id: Unique identifier for this BVH object, can be passed to kernels.
             device: Device this object lives on, all buffers must live on the same device.
         Args:
-            lowers (:class:`warp.array`): Array of lower bounds :class:`warp.vec3`
-            uppers (:class:`warp.array`): Array of upper bounds :class:`warp.vec3`
+            lowers: Array of lower bounds of data type :class:`warp.vec3`.
+            uppers: Array of upper bounds of data type :class:`warp.vec3`.
+              ``lowers`` and ``uppers`` must live on the same device.
+            constructor: The construction algorithm used to build the tree.
+              Valid choices are ``"sah"``, ``"median"``, ``"lbvh"``, or ``None``.
+              When ``None``, the default constructor will be used (see the note).
+        Note:
+            Explanation of BVH constructors:
+            - ``"sah"``: A CPU-based top-down constructor where the AABBs are split based on Surface Area
+              Heuristics (SAH). Construction takes slightly longer than others but has the best query
+              performance.
+            - ``"median"``: A CPU-based top-down constructor where the AABBs are split based on the median
+              of centroids of primitives in an AABB. This constructor is faster than SAH but offers
+              inferior query performance.
+            - ``"lbvh"``: A GPU-based bottom-up constructor which maximizes parallelism. Construction is very
+              fast, especially for large models. Query performance is slightly slower than ``"sah"``.
+            - ``None``: The constructor will be automatically chosen based on the device where the tree
+              lives. For a GPU tree, the ``"lbvh"`` constructor will be selected; for a CPU tree, the ``"sah"``
+              constructor will be selected.
+            All three constructors are supported for GPU trees. When a CPU-based constructor is selected
+            for a GPU tree, bounds will be copied back to the CPU to run the CPU-based constructor. After
+            construction, the CPU tree will be copied to the GPU.
+            Only ``"sah"`` and ``"median"`` are supported for CPU trees. If ``"lbvh"`` is selected for a CPU tree, a
+            warning message will be issued, and the constructor will automatically fall back to ``"sah"``.
         """
         if len(lowers) != len(uppers):
-            raise RuntimeError("Bvh the same number of lower and upper bounds must be provided")
+            raise RuntimeError("The same number of lower and upper bounds must be provided")
         if lowers.device != uppers.device:
-            raise RuntimeError("Bvh lower and upper bounds must live on the same device")
+            raise RuntimeError("Lower and upper bounds must live on the same device")
         if lowers.dtype != vec3 or not lowers.is_contiguous:
-            raise RuntimeError("Bvh lowers should be a contiguous array of type wp.vec3")
+            raise RuntimeError("lowers should be a contiguous array of type wp.vec3")
         if uppers.dtype != vec3 or not uppers.is_contiguous:
-            raise RuntimeError("Bvh uppers should be a contiguous array of type wp.vec3")
+            raise RuntimeError("uppers should be a contiguous array of type wp.vec3")
         self.device = lowers.device
         self.lowers = lowers
@@ -3137,11 +3378,32 @@ class Bvh:
         self.runtime = warp.context.runtime
+        if constructor is None:
+            if self.device.is_cpu:
+                constructor = "sah"
+            else:
+                constructor = "lbvh"
+        if constructor not in bvh_constructor_values:
+            raise ValueError(f"Unrecognized BVH constructor type: {constructor}")
         if self.device.is_cpu:
-            self.id = self.runtime.core.bvh_create_host(get_data(lowers), get_data(uppers), int(len(lowers)))
+            if constructor == "lbvh":
+                warp.utils.warn(
+                    "LBVH constructor is not available for a CPU tree. Falling back to SAH constructor.", stacklevel=2
+                )
+                constructor = "sah"
+            self.id = self.runtime.core.bvh_create_host(
+                get_data(lowers), get_data(uppers), int(len(lowers)), bvh_constructor_values[constructor]
+            )
         else:
             self.id = self.runtime.core.bvh_create_device(
-                self.device.context, get_data(lowers), get_data(uppers), int(len(lowers))
+                self.device.context,
+                get_data(lowers),
+                get_data(uppers),
+                int(len(lowers)),
+                bvh_constructor_values[constructor],
             )
     def __del__(self):
@@ -3156,7 +3418,10 @@ class Bvh:
                 self.runtime.core.bvh_destroy_device(self.id)
     def refit(self):
-        """Refit the BVH. This should be called after users modify the `lowers` and `uppers` arrays."""
+        """Refit the BVH.
+        This should be called after users modify the ``lowers`` or ``uppers`` arrays.
+        """
         if self.device.is_cpu:
             self.runtime.core.bvh_refit_host(self.id)
@@ -3179,7 +3444,14 @@ class Mesh:
         instance.id = None
         return instance
-    def __init__(self, points=None, indices=None, velocities=None, support_winding_number=False):
+    def __init__(
+        self,
+        points: array,
+        indices: array,
+        velocities: Optional[array] = None,
+        support_winding_number: bool = False,
+        bvh_constructor: Optional[str] = None,
+    ):
         """Class representing a triangle mesh.
         Attributes:
@@ -3187,10 +3459,15 @@ class Mesh:
             device: Device this object lives on, all buffers must live on the same device.
         Args:
-            points (:class:`warp.array`): Array of vertex positions of type :class:`warp.vec3`
-            indices (:class:`warp.array`): Array of triangle indices of type :class:`warp.int32`, should be a 1d array with shape (num_tris * 3)
-            velocities (:class:`warp.array`): Array of vertex velocities of type :class:`warp.vec3` (optional)
-            support_winding_number (bool): If true the mesh will build additional datastructures to support `wp.mesh_query_point_sign_winding_number()` queries
+            points: Array of vertex positions of data type :class:`warp.vec3`.
+            indices: Array of triangle indices of data type :class:`warp.int32`.
+              Should be a 1D array with shape ``(num_tris * 3)``.
+            velocities: Optional array of vertex velocities of data type :class:`warp.vec3`.
+            support_winding_number: If ``True``, the mesh will build additional
+              data structures to support ``wp.mesh_query_point_sign_winding_number()`` queries.
+            bvh_constructor: The construction algorithm for the underlying BVH
+              (see the docstring of :class:`Bvh` for explanation).
+              Valid choices are ``"sah"``, ``"median"``, ``"lbvh"``, or ``None``.
         """
         if points.device != indices.device:
@@ -3215,7 +3492,22 @@ class Mesh:
         self.runtime = warp.context.runtime
+        if bvh_constructor is None:
+            if self.device.is_cpu:
+                bvh_constructor = "sah"
+            else:
+                bvh_constructor = "lbvh"
+        if bvh_constructor not in bvh_constructor_values:
+            raise ValueError(f"Unrecognized BVH constructor type: {bvh_constructor}")
         if self.device.is_cpu:
+            if bvh_constructor == "lbvh":
+                warp.utils.warn(
+                    "LBVH constructor is not available for a CPU tree. Falling back to SAH constructor.", stacklevel=2
+                )
+                bvh_constructor = "sah"
             self.id = self.runtime.core.mesh_create_host(
                 points.__ctype__(),
                 velocities.__ctype__() if velocities else array().__ctype__(),
@@ -3223,6 +3515,7 @@ class Mesh:
                 int(len(points)),
                 int(indices.size / 3),
                 int(support_winding_number),
+                bvh_constructor_values[bvh_constructor],
             )
         else:
             self.id = self.runtime.core.mesh_create_device(
@@ -3233,6 +3526,7 @@ class Mesh:
                 int(len(points)),
                 int(indices.size / 3),
                 int(support_winding_number),
+                bvh_constructor_values[bvh_constructor],
             )
     def __del__(self):
@@ -3247,7 +3541,10 @@ class Mesh:
                 self.runtime.core.mesh_destroy_device(self.id)
     def refit(self):
-        """Refit the BVH to points. This should be called after users modify the `points` data."""
+        """Refit the BVH to points.
+        This should be called after users modify the ``points`` data.
+        """
         if self.device.is_cpu:
             self.runtime.core.mesh_refit_host(self.id)
@@ -3260,9 +3557,9 @@ class Mesh:
         """The array of mesh's vertex positions of type :class:`warp.vec3`.
         The `Mesh.points` property has a custom setter method. Users can modify the vertex positions in-place,
-        but the `refit()` method must be called manually after such modifications. Alternatively, assigning a new array
+        but :meth:`refit` must be called manually after such modifications. Alternatively, assigning a new array
         to this property is also supported. The new array must have the same shape as the original, and once assigned,
-        the `Mesh` class will automatically perform a refit operation based on the new vertex positions.
+        the :class:`Mesh` will automatically perform a refit operation based on the new vertex positions.
         """
         return self._points
@@ -3270,16 +3567,14 @@ class Mesh:
     def points(self, points_new):
         if points_new.device != self._points.device:
             raise RuntimeError(
-                "The new points and the original points must live on the same device, currently "
-                "the new points lives on {} while the old points lives on {}.".format(
-                    points_new.device, self._points.device
-                )
+                "The new points and the original points must live on the same device, the "
+                f"new points are on {points_new.device} while the old points are on {self._points.device}."
             )
         if points_new.ndim != 1 or points_new.shape[0] != self._points.shape[0]:
             raise RuntimeError(
-                "the new points and the original points must have the same shape, currently new points shape is: {},"
-                " while the old points' shape is: {}".format(points_new.shape, self._points.shape)
+                "The new points and the original points must have the same shape, the "
+                f"new points' shape is {points_new.shape}, while the old points' shape is {self._points.shape}."
             )
         self._points = points_new
@@ -3294,7 +3589,7 @@ class Mesh:
         """The array of mesh's velocities of type :class:`warp.vec3`.
         This is a property with a custom setter method. Users can modify the velocities in-place,
-        or assigning a new array to this property. No refitting is needed for changing velocities.
+        or assign a new array to this property. No refitting is needed for changing velocities.
         """
         return self._velocities
@@ -3302,16 +3597,14 @@ class Mesh:
     def velocities(self, velocities_new):
         if velocities_new.device != self._velocities.device:
             raise RuntimeError(
-                "The new points and the original points must live on the same device, currently "
-                "the new points lives on {} while the old points lives on {}.".format(
-                    velocities_new.device, self._velocities.device
-                )
+                "The new points and the original points must live on the same device, the "
+                f"new points are on {velocities_new.device} while the old points are on {self._velocities.device}."
             )
         if velocities_new.ndim != 1 or velocities_new.shape[0] != self._velocities.shape[0]:
             raise RuntimeError(
-                "the new points and the original points must have the same shape, currently new points shape is: {},"
-                " while the old points' shape is: {}".format(velocities_new.shape, self._velocities.shape)
+                "The new points and the original points must have the same shape, the "
+                f"new points' shape is {velocities_new.shape}, while the old points' shape is {self._velocities.shape}."
             )
         self._velocities = velocities_new
@@ -3337,8 +3630,8 @@ class Volume:
         """Class representing a sparse grid.
         Args:
-            data (:class:`warp.array`): Array of bytes representing the volume in NanoVDB format
-            copy (bool): Whether the incoming data will be copied or aliased
+            data: Array of bytes representing the volume in NanoVDB format.
+            copy: Whether the incoming data will be copied or aliased.
         """
         # keep a runtime reference for orderly destruction
@@ -3373,14 +3666,15 @@ class Volume:
                 self.runtime.core.volume_destroy_device(self.id)
     def array(self) -> array:
-        """Returns the raw memory buffer of the Volume as an array"""
+        """Return the raw memory buffer of the :class:`Volume` as an array."""
         buf = ctypes.c_void_p(0)
         size = ctypes.c_uint64(0)
         self.runtime.core.volume_get_buffer_info(self.id, ctypes.byref(buf), ctypes.byref(size))
         return array(ptr=buf.value, dtype=uint8, shape=size.value, device=self.device, owner=False)
     def get_tile_count(self) -> int:
-        """Returns the number of tiles (NanoVDB leaf nodes) of the volume"""
+        """Return the number of tiles (NanoVDB leaf nodes) of the volume."""
         voxel_count, tile_count = (
             ctypes.c_uint64(0),
@@ -3390,11 +3684,12 @@ class Volume:
         return tile_count.value
     def get_tiles(self, out: Optional[array] = None) -> array:
-        """Returns the integer coordinates of all allocated tiles for this volume.
+        """Return the integer coordinates of all allocated tiles for this volume.
         Args:
-            out (:class:`warp.array`, optional): If provided, use the `out` array to store the tile coordinates, otherwise
-                a new array will be allocated. `out` must be a contiguous array of ``tile_count`` ``vec3i`` or ``tile_count x 3`` ``int32``
+            out: If provided, use the ``out`` array to store the tile coordinates, otherwise
+                a new array will be allocated. ``out`` must be a contiguous array
+                of ``tile_count`` ``vec3i`` or ``tile_count x 3`` ``int32``
                 on the same device as this volume.
         """
@@ -3419,7 +3714,7 @@ class Volume:
         return out
     def get_voxel_count(self) -> int:
-        """Returns the total number of allocated voxels for this volume"""
+        """Return the total number of allocated voxels for this volume"""
         voxel_count, tile_count = (
             ctypes.c_uint64(0),
@@ -3429,10 +3724,10 @@ class Volume:
         return voxel_count.value
     def get_voxels(self, out: Optional[array] = None) -> array:
-        """Returns the integer coordinates of all allocated voxels for this volume.
+        """Return the integer coordinates of all allocated voxels for this volume.
         Args:
-            out (:class:`warp.array`, optional): If provided, use the `out` array to store the voxel coordinates, otherwise
+            out: If provided, use the `out` array to store the voxel coordinates, otherwise
                 a new array will be allocated. `out` must be a contiguous array of ``voxel_count`` ``vec3i`` or ``voxel_count x 3`` ``int32``
                 on the same device as this volume.
         """
@@ -3458,7 +3753,7 @@ class Volume:
         return out
     def get_voxel_size(self) -> Tuple[float, float, float]:
-        """Voxel size, i.e, world coordinates of voxel's diagonal vector"""
+        """Return the voxel size, i.e., world coordinates of the voxel's diagonal vector"""
         if self.id == 0:
             raise RuntimeError("Invalid Volume")
@@ -3558,7 +3853,7 @@ class Volume:
         return self.get_grid_info().type_str in Volume._nvdb_index_types
     def get_feature_array_count(self) -> int:
-        """Returns the number of supplemental data arrays stored alongside the grid"""
+        """Return the number of supplemental data arrays stored alongside the grid"""
         return self.runtime.core.volume_get_blind_data_count(self.id)
@@ -3578,7 +3873,7 @@ class Volume:
         """String describing the type of the array values"""
     def get_feature_array_info(self, feature_index: int) -> Volume.FeatureArrayInfo:
-        """Returns the metadata associated to the feature array at `feature_index`"""
+        """Return the metadata associated with the feature array at ``feature_index``."""
         buf = ctypes.c_void_p(0)
         value_count = ctypes.c_uint64(0)
@@ -3606,11 +3901,12 @@ class Volume:
         )
     def feature_array(self, feature_index: int, dtype=None) -> array:
-        """Returns one the grid's feature data arrays as a Warp array
+        """Return one of the grid's feature data arrays as a Warp array.
         Args:
             feature_index: Index of the supplemental data array in the grid
-            dtype: Type for the returned Warp array. If not provided, will be deduced from the array metadata.
+            dtype: Data type for the returned Warp array.
+              If not provided, will be deduced from the array metadata.
         """
         info = self.get_feature_array_info(feature_index)
@@ -3641,7 +3937,7 @@ class Volume:
     @classmethod
     def load_from_nvdb(cls, file_or_buffer, device=None) -> Volume:
-        """Creates a Volume object from a serialized NanoVDB file or in-memory buffer.
+        """Create a :class:`Volume` object from a serialized NanoVDB file or in-memory buffer.
         Returns:
@@ -4302,6 +4598,9 @@ def matmul(
 ):
     """Computes a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
+    .. deprecated:: 1.6
+        Use :doc:`tile primitives </modules/tiles>` instead.
     Args:
         a (array2d): two-dimensional array containing matrix A
         b (array2d): two-dimensional array containing matrix B
@@ -4314,6 +4613,12 @@ def matmul(
     """
     from warp.context import runtime
+    warp.utils.warn(
+        "wp.matmul() is deprecated and will be removed in a\nfuture version. Use tile primitives instead.",
+        category=DeprecationWarning,
+        stacklevel=2,
+    )
     device = a.device
     if b.device != device or c.device != device or d.device != device:
@@ -4589,6 +4894,9 @@ def batched_matmul(
 ):
     """Computes a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
+    .. deprecated:: 1.6
+        Use :doc:`tile primitives </modules/tiles>` instead.
     Args:
         a (array3d): three-dimensional array containing A matrices. Overall array dimension is {batch_count, M, K}
         b (array3d): three-dimensional array containing B matrices. Overall array dimension is {batch_count, K, N}