warp-lang 0.11.0__py3-none-manylinux2014_x86_64.whl → 1.0.0__py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (170)
  1. warp/__init__.py +8 -0
  2. warp/bin/warp-clang.so +0 -0
  3. warp/bin/warp.so +0 -0
  4. warp/build.py +7 -6
  5. warp/build_dll.py +70 -79
  6. warp/builtins.py +10 -6
  7. warp/codegen.py +51 -19
  8. warp/config.py +7 -8
  9. warp/constants.py +3 -0
  10. warp/context.py +948 -245
  11. warp/dlpack.py +198 -113
  12. warp/examples/assets/bunny.usd +0 -0
  13. warp/examples/assets/cartpole.urdf +110 -0
  14. warp/examples/assets/crazyflie.usd +0 -0
  15. warp/examples/assets/cube.usda +42 -0
  16. warp/examples/assets/nv_ant.xml +92 -0
  17. warp/examples/assets/nv_humanoid.xml +183 -0
  18. warp/examples/assets/quadruped.urdf +268 -0
  19. warp/examples/assets/rocks.nvdb +0 -0
  20. warp/examples/assets/rocks.usd +0 -0
  21. warp/examples/assets/sphere.usda +56 -0
  22. warp/examples/assets/torus.usda +105 -0
  23. warp/examples/benchmarks/benchmark_api.py +383 -0
  24. warp/examples/benchmarks/benchmark_cloth.py +279 -0
  25. warp/examples/benchmarks/benchmark_cloth_cupy.py +88 -0
  26. warp/examples/benchmarks/benchmark_cloth_jax.py +100 -0
  27. warp/examples/benchmarks/benchmark_cloth_numba.py +142 -0
  28. warp/examples/benchmarks/benchmark_cloth_numpy.py +77 -0
  29. warp/examples/benchmarks/benchmark_cloth_pytorch.py +86 -0
  30. warp/examples/benchmarks/benchmark_cloth_taichi.py +112 -0
  31. warp/examples/benchmarks/benchmark_cloth_warp.py +146 -0
  32. warp/examples/benchmarks/benchmark_launches.py +295 -0
  33. warp/examples/core/example_dem.py +221 -0
  34. warp/examples/core/example_fluid.py +267 -0
  35. warp/examples/core/example_graph_capture.py +129 -0
  36. warp/examples/core/example_marching_cubes.py +177 -0
  37. warp/examples/core/example_mesh.py +154 -0
  38. warp/examples/core/example_mesh_intersect.py +193 -0
  39. warp/examples/core/example_nvdb.py +169 -0
  40. warp/examples/core/example_raycast.py +89 -0
  41. warp/examples/core/example_raymarch.py +178 -0
  42. warp/examples/core/example_render_opengl.py +141 -0
  43. warp/examples/core/example_sph.py +389 -0
  44. warp/examples/core/example_torch.py +181 -0
  45. warp/examples/core/example_wave.py +249 -0
  46. warp/examples/fem/bsr_utils.py +380 -0
  47. warp/examples/fem/example_apic_fluid.py +391 -0
  48. warp/examples/fem/example_convection_diffusion.py +168 -0
  49. warp/examples/fem/example_convection_diffusion_dg.py +209 -0
  50. warp/examples/fem/example_convection_diffusion_dg0.py +194 -0
  51. warp/examples/fem/example_deformed_geometry.py +159 -0
  52. warp/examples/fem/example_diffusion.py +173 -0
  53. warp/examples/fem/example_diffusion_3d.py +152 -0
  54. warp/examples/fem/example_diffusion_mgpu.py +214 -0
  55. warp/examples/fem/example_mixed_elasticity.py +222 -0
  56. warp/examples/fem/example_navier_stokes.py +243 -0
  57. warp/examples/fem/example_stokes.py +192 -0
  58. warp/examples/fem/example_stokes_transfer.py +249 -0
  59. warp/examples/fem/mesh_utils.py +109 -0
  60. warp/examples/fem/plot_utils.py +287 -0
  61. warp/examples/optim/example_bounce.py +248 -0
  62. warp/examples/optim/example_cloth_throw.py +210 -0
  63. warp/examples/optim/example_diffray.py +535 -0
  64. warp/examples/optim/example_drone.py +850 -0
  65. warp/examples/optim/example_inverse_kinematics.py +169 -0
  66. warp/examples/optim/example_inverse_kinematics_torch.py +170 -0
  67. warp/examples/optim/example_spring_cage.py +234 -0
  68. warp/examples/optim/example_trajectory.py +201 -0
  69. warp/examples/sim/example_cartpole.py +128 -0
  70. warp/examples/sim/example_cloth.py +184 -0
  71. warp/examples/sim/example_granular.py +113 -0
  72. warp/examples/sim/example_granular_collision_sdf.py +185 -0
  73. warp/examples/sim/example_jacobian_ik.py +213 -0
  74. warp/examples/sim/example_particle_chain.py +106 -0
  75. warp/examples/sim/example_quadruped.py +179 -0
  76. warp/examples/sim/example_rigid_chain.py +191 -0
  77. warp/examples/sim/example_rigid_contact.py +176 -0
  78. warp/examples/sim/example_rigid_force.py +126 -0
  79. warp/examples/sim/example_rigid_gyroscopic.py +97 -0
  80. warp/examples/sim/example_rigid_soft_contact.py +124 -0
  81. warp/examples/sim/example_soft_body.py +178 -0
  82. warp/fabric.py +29 -20
  83. warp/fem/cache.py +0 -1
  84. warp/fem/dirichlet.py +0 -2
  85. warp/fem/integrate.py +0 -1
  86. warp/jax.py +45 -0
  87. warp/jax_experimental.py +339 -0
  88. warp/native/builtin.h +12 -0
  89. warp/native/bvh.cu +18 -18
  90. warp/native/clang/clang.cpp +8 -3
  91. warp/native/cuda_util.cpp +94 -5
  92. warp/native/cuda_util.h +35 -6
  93. warp/native/cutlass_gemm.cpp +1 -1
  94. warp/native/cutlass_gemm.cu +4 -1
  95. warp/native/error.cpp +66 -0
  96. warp/native/error.h +27 -0
  97. warp/native/mesh.cu +2 -2
  98. warp/native/reduce.cu +4 -4
  99. warp/native/runlength_encode.cu +2 -2
  100. warp/native/scan.cu +2 -2
  101. warp/native/sparse.cu +0 -1
  102. warp/native/temp_buffer.h +2 -2
  103. warp/native/warp.cpp +95 -60
  104. warp/native/warp.cu +1053 -218
  105. warp/native/warp.h +49 -32
  106. warp/optim/linear.py +33 -16
  107. warp/render/render_opengl.py +202 -101
  108. warp/render/render_usd.py +82 -40
  109. warp/sim/__init__.py +13 -4
  110. warp/sim/articulation.py +4 -5
  111. warp/sim/collide.py +320 -175
  112. warp/sim/import_mjcf.py +25 -30
  113. warp/sim/import_urdf.py +94 -63
  114. warp/sim/import_usd.py +51 -36
  115. warp/sim/inertia.py +3 -2
  116. warp/sim/integrator.py +233 -0
  117. warp/sim/integrator_euler.py +447 -469
  118. warp/sim/integrator_featherstone.py +1991 -0
  119. warp/sim/integrator_xpbd.py +1420 -640
  120. warp/sim/model.py +765 -487
  121. warp/sim/particles.py +2 -1
  122. warp/sim/render.py +35 -13
  123. warp/sim/utils.py +222 -11
  124. warp/stubs.py +8 -0
  125. warp/tape.py +16 -1
  126. warp/tests/aux_test_grad_customs.py +23 -0
  127. warp/tests/test_array.py +190 -1
  128. warp/tests/test_async.py +656 -0
  129. warp/tests/test_bool.py +50 -0
  130. warp/tests/test_dlpack.py +164 -11
  131. warp/tests/test_examples.py +166 -74
  132. warp/tests/test_fem.py +8 -1
  133. warp/tests/test_generics.py +15 -5
  134. warp/tests/test_grad.py +1 -1
  135. warp/tests/test_grad_customs.py +172 -12
  136. warp/tests/test_jax.py +254 -0
  137. warp/tests/test_large.py +29 -6
  138. warp/tests/test_launch.py +25 -0
  139. warp/tests/test_linear_solvers.py +20 -3
  140. warp/tests/test_matmul.py +61 -16
  141. warp/tests/test_matmul_lite.py +13 -13
  142. warp/tests/test_mempool.py +186 -0
  143. warp/tests/test_multigpu.py +3 -0
  144. warp/tests/test_options.py +16 -2
  145. warp/tests/test_peer.py +137 -0
  146. warp/tests/test_print.py +3 -1
  147. warp/tests/test_quat.py +23 -0
  148. warp/tests/test_sim_kinematics.py +97 -0
  149. warp/tests/test_snippet.py +126 -3
  150. warp/tests/test_streams.py +108 -79
  151. warp/tests/test_torch.py +16 -8
  152. warp/tests/test_utils.py +32 -27
  153. warp/tests/test_verify_fp.py +65 -0
  154. warp/tests/test_volume.py +1 -1
  155. warp/tests/unittest_serial.py +2 -0
  156. warp/tests/unittest_suites.py +12 -0
  157. warp/tests/unittest_utils.py +14 -7
  158. warp/thirdparty/unittest_parallel.py +15 -3
  159. warp/torch.py +10 -8
  160. warp/types.py +363 -246
  161. warp/utils.py +143 -19
  162. warp_lang-1.0.0.dist-info/LICENSE.md +126 -0
  163. warp_lang-1.0.0.dist-info/METADATA +394 -0
  164. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/RECORD +167 -86
  165. warp/sim/optimizer.py +0 -138
  166. warp_lang-0.11.0.dist-info/LICENSE.md +0 -36
  167. warp_lang-0.11.0.dist-info/METADATA +0 -238
  168. /warp/tests/{walkthough_debug.py → walkthrough_debug.py} +0 -0
  169. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/WHEEL +0 -0
  170. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/top_level.txt +0 -0
warp/types.py CHANGED
@@ -98,7 +98,13 @@ def vector(length, dtype):
  # ctypes.Array data for length, shape and c type:
  _length_ = 0 if length is Any else length
  _shape_ = (_length_,)
- _type_ = ctypes.c_float if dtype in [Scalar, Float] else dtype._type_
+
+ if dtype is bool:
+ _type_ = ctypes.c_bool
+ elif dtype in [Scalar, Float]:
+ _type_ = ctypes.c_float
+ else:
+ _type_ = dtype._type_
 
  # warp scalar type:
  _wp_scalar_type_ = dtype
@@ -271,7 +277,13 @@ def matrix(shape, dtype):
  class mat_t(ctypes.Array):
  _length_ = 0 if shape[0] == Any or shape[1] == Any else shape[0] * shape[1]
  _shape_ = (0, 0) if _length_ == 0 else shape
- _type_ = ctypes.c_float if dtype in [Scalar, Float] else dtype._type_
+
+ if dtype is bool:
+ _type_ = ctypes.c_bool
+ elif dtype in [Scalar, Float]:
+ _type_ = ctypes.c_float
+ else:
+ _type_ = dtype._type_
 
  # warp scalar type:
  # used in type checking and when writing out c++ code for constructors:
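The new `dtype is bool` branches above let the generic `vector()`/`matrix()` type factories back their storage with `ctypes.c_bool`. A minimal sketch of what this enables through the public `wp.vec()`/`wp.mat()` constructors (usage assumed from the Warp 1.0 type API; concrete aliases such as `vec3b` are not shown in this diff):

    import warp as wp

    # generic type factories with a boolean scalar type
    vec3b = wp.vec(length=3, dtype=wp.bool)
    mat22b = wp.mat(shape=(2, 2), dtype=wp.bool)

    flags = vec3b(True, False, True)
    print(flags[0], flags[1], flags[2])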
@@ -391,8 +403,7 @@ def matrix(shape, dtype):
  iter(v)
  except TypeError:
  raise TypeError(
- f"Expected to assign a slice from a sequence of values "
- f"but got `{type(v).__name__}` instead"
+ f"Expected to assign a slice from a sequence of values " f"but got `{type(v).__name__}` instead"
  ) from None
 
  row_start = r * self._shape_[1]
@@ -1103,6 +1114,7 @@ class range_t:
  # definition just for kernel type (cannot be a parameter), see bvh.h
  class bvh_query_t:
  """Object used to track state during BVH traversal."""
+
  def __init__(self):
  pass
 
@@ -1110,6 +1122,7 @@ class bvh_query_t:
  # definition just for kernel type (cannot be a parameter), see mesh.h
  class mesh_query_aabb_t:
  """Object used to track state during mesh traversal."""
+
  def __init__(self):
  pass
 
@@ -1117,6 +1130,7 @@ class mesh_query_aabb_t:
  # definition just for kernel type (cannot be a parameter), see hash_grid.h
  class hash_grid_query_t:
  """Object used to track state during neighbor traversal."""
+
  def __init__(self):
  pass
@@ -1250,18 +1264,30 @@ def type_scalar_type(dtype):
  return getattr(dtype, "_wp_scalar_type_", dtype)
 
 
+ # Cache results of type_size_in_bytes(), because the function is actually quite slow.
+ _type_size_cache = {
+ float: 4,
+ int: 4,
+ }
+
+
  def type_size_in_bytes(dtype):
- if dtype.__module__ == "ctypes":
- return ctypes.sizeof(dtype)
- elif isinstance(dtype, warp.codegen.Struct):
- return ctypes.sizeof(dtype.ctype)
- elif dtype == float or dtype == int:
- return 4
- elif hasattr(dtype, "_type_"):
- return getattr(dtype, "_length_", 1) * ctypes.sizeof(dtype._type_)
+ size = _type_size_cache.get(dtype)
 
- else:
- return 0
+ if size is None:
+ if dtype.__module__ == "ctypes":
+ size = ctypes.sizeof(dtype)
+ elif hasattr(dtype, "_type_"):
+ size = getattr(dtype, "_length_", 1) * ctypes.sizeof(dtype._type_)
+ elif isinstance(dtype, warp.codegen.Struct):
+ size = ctypes.sizeof(dtype.ctype)
+ elif dtype == Any:
+ raise TypeError(f"A concrete type is required")
+ else:
+ raise TypeError(f"Invalid data type: {dtype}")
+ _type_size_cache[dtype] = size
+
+ return size
 
 
  def type_to_warp(dtype):
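The rewritten helper above memoizes its result per dtype and now raises instead of returning 0 for unknown types. A small sanity check of the reported sizes (a sketch; it assumes the standard `wp.float32`/`wp.vec3` type definitions and calls the helper through `warp.types`):

    import warp as wp
    from warp.types import type_size_in_bytes

    assert type_size_in_bytes(wp.float32) == 4    # scalar ctypes-backed type
    assert type_size_in_bytes(wp.vec3) == 3 * 4   # _length_ * sizeof(_type_)
    assert type_size_in_bytes(float) == 4         # Python float maps to 32-bit float, served from the cache
    assert type_size_in_bytes(wp.float64) == 8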
@@ -1399,9 +1425,9 @@ def types_equal(a, b, match_generic=False):
  if match_generic:
  if p1 == Any or p2 == Any:
  return True
- if p1 == Scalar and p2 in scalar_types:
+ if p1 == Scalar and p2 in scalar_types + [bool]:
  return True
- if p2 == Scalar and p1 in scalar_types:
+ if p2 == Scalar and p1 in scalar_types + [bool]:
  return True
  if p1 == Scalar and p2 == Scalar:
  return True
@@ -1454,6 +1480,17 @@ def strides_from_shape(shape: Tuple, dtype):
  return tuple(strides)
 
 
+ def check_array_shape(shape: Tuple):
+ """Checks that the size in each dimension is positive and less than 2^32."""
+
+ for dim_index, dim_size in enumerate(shape):
+ if dim_size < 0:
+ raise ValueError(f"Array shapes must be non-negative, got {dim_size} in dimension {dim_index}")
+ if dim_size >= 2**31:
+ raise ValueError("Array shapes must not exceed the maximum representable value of a signed 32-bit integer, "
+ f"got {dim_size} in dimension {dim_index}.")
+
+
  class array(Array):
  # member attributes available during code-gen (e.g.: d = array.shape[0])
  # (initialized when needed)
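`check_array_shape()` is called from the array constructors further down in this diff, so invalid shapes now fail early with a `ValueError`. A quick sketch of the behaviour (assuming allocation through `wp.zeros()`):

    import warp as wp

    wp.init()

    try:
        wp.zeros(shape=(-1, 3), dtype=wp.float32)    # negative dimension
    except ValueError as e:
        print(e)  # Array shapes must be non-negative, got -1 in dimension 0

    try:
        wp.zeros(shape=(2**31,), dtype=wp.float32)   # exceeds signed 32-bit range
    except ValueError as e:
        print(e)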
@@ -1471,7 +1508,8 @@ class array(Array):
  device=None,
  pinned=False,
  copy=True,
- owner=True, # TODO: replace with deleter=None
+ owner=False, # deprecated - pass deleter instead
+ deleter=None,
  ndim=None,
  grad=None,
  requires_grad=False,
@@ -1505,15 +1543,18 @@ class array(Array):
  capacity (int): Maximum size in bytes of the ptr allocation (data should be None)
  device (Devicelike): Device the array lives on
  copy (bool): Whether the incoming data will be copied or aliased, this is only possible when the incoming `data` already lives on the device specified and types match
- owner (bool): Should the array object try to deallocate memory when it is deleted
+ owner (bool): Should the array object try to deallocate memory when it is deleted (deprecated, pass `deleter` if you wish to transfer ownership to Warp)
+ deleter (Callable): Function to be called when deallocating the array, taking two arguments, pointer and size
  requires_grad (bool): Whether or not gradients will be tracked for this array, see :class:`warp.Tape` for details
  grad (array): The gradient array to use
  pinned (bool): Whether to allocate pinned host memory, which allows asynchronous host-device transfers (only applicable with device="cpu")
 
  """
 
- self.owner = False
+ self.deleter = None
  self.ctype = None
+
+ # properties
  self._requires_grad = False
  self._grad = None
  # __array_interface__ or __cuda_array_interface__, evaluated lazily and cached
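With `owner` deprecated, ownership of externally allocated memory is transferred by passing a `deleter(ptr, size)` callback instead, as documented above. A minimal sketch for a CPU buffer (the `release()` bookkeeping is illustrative only):

    import ctypes
    import warp as wp

    wp.init()

    n = 16
    buf = (ctypes.c_float * n)()    # externally owned CPU storage
    released = []

    def release(ptr, size):         # called as deleter(ptr, size) when the array is collected
        released.append((ptr, size))

    a = wp.array(
        ptr=ctypes.addressof(buf),
        dtype=wp.float32,
        shape=(n,),
        device="cpu",
        deleter=release,
    )
    a.zero_()

    del a                           # release() is invoked with the pointer and capacity
    print(released)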
@@ -1547,7 +1588,7 @@ class array(Array):
1547
1588
  raise RuntimeError("Can only construct arrays with either `data` or `ptr` arguments, not both")
1548
1589
  self._init_from_data(data, dtype, shape, device, copy, pinned)
1549
1590
  elif ptr is not None:
1550
- self._init_from_ptr(ptr, dtype, shape, strides, capacity, device, owner, pinned)
1591
+ self._init_from_ptr(ptr, dtype, shape, strides, capacity, device, pinned, deleter)
1551
1592
  elif shape is not None:
1552
1593
  self._init_new(dtype, shape, strides, device, pinned)
1553
1594
  else:
@@ -1562,8 +1603,7 @@ class array(Array):
1562
1603
  # allocate gradient if needed
1563
1604
  self._requires_grad = requires_grad
1564
1605
  if requires_grad:
1565
- with warp.ScopedStream(self.device.null_stream):
1566
- self._alloc_grad()
1606
+ self._alloc_grad()
1567
1607
 
1568
1608
  def _init_from_data(self, data, dtype, shape, device, copy, pinned):
1569
1609
  if not hasattr(data, "__len__"):
@@ -1674,11 +1714,16 @@ class array(Array):
1674
1714
  shape = arr.shape or (1,)
1675
1715
  strides = arr.strides or (type_size_in_bytes(dtype),)
1676
1716
 
1677
- device = warp.get_device(device)
1717
+ try:
1718
+ # Performance note: try first, ask questions later
1719
+ device = warp.context.runtime.get_device(device)
1720
+ except:
1721
+ warp.context.assert_initialized()
1722
+ raise
1678
1723
 
1679
1724
  if device.is_cpu and not copy and not pinned:
1680
1725
  # reference numpy memory directly
1681
- self._init_from_ptr(arr.ctypes.data, dtype, shape, strides, None, device, False, False)
1726
+ self._init_from_ptr(arr.ctypes.data, dtype, shape, strides, None, device, False, None)
1682
1727
  # keep a ref to the source array to keep allocation alive
1683
1728
  self._ref = arr
1684
1729
  else:
@@ -1691,82 +1736,106 @@ class array(Array):
1691
1736
  strides=strides,
1692
1737
  device="cpu",
1693
1738
  copy=False,
1694
- owner=False,
1695
1739
  )
1696
1740
  warp.copy(self, src)
1697
1741
 
1698
- def _init_from_ptr(self, ptr, dtype, shape, strides, capacity, device, owner, pinned):
1699
- if dtype == Any:
1700
- raise RuntimeError("A concrete data type is required to create the array")
1701
-
1702
- device = warp.get_device(device)
1703
-
1704
- size = 1
1705
- for d in shape:
1706
- size *= d
1707
-
1708
- contiguous_strides = strides_from_shape(shape, dtype)
1742
+ def _init_from_ptr(self, ptr, dtype, shape, strides, capacity, device, pinned, deleter):
1743
+ try:
1744
+ # Performance note: try first, ask questions later
1745
+ device = warp.context.runtime.get_device(device)
1746
+ except:
1747
+ warp.context.assert_initialized()
1748
+ raise
1749
+
1750
+ check_array_shape(shape)
1751
+ ndim = len(shape)
1752
+ dtype_size = type_size_in_bytes(dtype)
1753
+
1754
+ # compute size and contiguous strides
1755
+ # Performance note: we could use strides_from_shape() here, but inlining it is faster.
1756
+ contiguous_strides = [None] * ndim
1757
+ i = ndim - 1
1758
+ contiguous_strides[i] = dtype_size
1759
+ size = shape[i]
1760
+ while i > 0:
1761
+ contiguous_strides[i - 1] = contiguous_strides[i] * shape[i]
1762
+ i -= 1
1763
+ size *= shape[i]
1764
+ contiguous_strides = tuple(contiguous_strides)
1709
1765
 
1710
1766
  if strides is None:
1711
1767
  strides = contiguous_strides
1712
1768
  is_contiguous = True
1713
1769
  if capacity is None:
1714
- capacity = size * type_size_in_bytes(dtype)
1770
+ capacity = size * dtype_size
1715
1771
  else:
1772
+ strides = tuple(strides)
1716
1773
  is_contiguous = strides == contiguous_strides
1717
1774
  if capacity is None:
1718
1775
  capacity = shape[0] * strides[0]
1719
1776
 
1720
1777
  self.dtype = dtype
1721
- self.ndim = len(shape)
1778
+ self.ndim = ndim
1722
1779
  self.size = size
1723
1780
  self.capacity = capacity
1724
1781
  self.shape = shape
1725
1782
  self.strides = strides
1726
1783
  self.ptr = ptr
1727
1784
  self.device = device
1728
- self.owner = owner
1729
1785
  self.pinned = pinned if device.is_cpu else False
1730
1786
  self.is_contiguous = is_contiguous
1787
+ self.deleter = deleter
1731
1788
 
1732
1789
  def _init_new(self, dtype, shape, strides, device, pinned):
1733
- if dtype == Any:
1734
- raise RuntimeError("A concrete data type is required to create the array")
1735
-
1736
- device = warp.get_device(device)
1737
-
1738
- size = 1
1739
- for d in shape:
1740
- size *= d
1741
-
1742
- contiguous_strides = strides_from_shape(shape, dtype)
1790
+ try:
1791
+ # Performance note: try first, ask questions later
1792
+ device = warp.context.runtime.get_device(device)
1793
+ except:
1794
+ warp.context.assert_initialized()
1795
+ raise
1796
+
1797
+ check_array_shape(shape)
1798
+ ndim = len(shape)
1799
+ dtype_size = type_size_in_bytes(dtype)
1800
+
1801
+ # compute size and contiguous strides
1802
+ # Performance note: we could use strides_from_shape() here, but inlining it is faster.
1803
+ contiguous_strides = [None] * ndim
1804
+ i = ndim - 1
1805
+ contiguous_strides[i] = dtype_size
1806
+ size = shape[i]
1807
+ while i > 0:
1808
+ contiguous_strides[i - 1] = contiguous_strides[i] * shape[i]
1809
+ i -= 1
1810
+ size *= shape[i]
1811
+ contiguous_strides = tuple(contiguous_strides)
1743
1812
 
1744
1813
  if strides is None:
1745
1814
  strides = contiguous_strides
1746
1815
  is_contiguous = True
1747
- capacity = size * type_size_in_bytes(dtype)
1816
+ capacity = size * dtype_size
1748
1817
  else:
1818
+ strides = tuple(strides)
1749
1819
  is_contiguous = strides == contiguous_strides
1750
1820
  capacity = shape[0] * strides[0]
1751
1821
 
1822
+ allocator = device.get_allocator(pinned=pinned)
1752
1823
  if capacity > 0:
1753
- ptr = device.allocator.alloc(capacity, pinned=pinned)
1754
- if ptr is None:
1755
- raise RuntimeError(f"Array allocation failed on device: {device} for {capacity} bytes")
1824
+ ptr = allocator.alloc(capacity)
1756
1825
  else:
1757
1826
  ptr = None
1758
1827
 
1759
1828
  self.dtype = dtype
1760
- self.ndim = len(shape)
1829
+ self.ndim = ndim
1761
1830
  self.size = size
1762
1831
  self.capacity = capacity
1763
1832
  self.shape = shape
1764
1833
  self.strides = strides
1765
1834
  self.ptr = ptr
1766
1835
  self.device = device
1767
- self.owner = True
1768
1836
  self.pinned = pinned if device.is_cpu else False
1769
1837
  self.is_contiguous = is_contiguous
1838
+ self.deleter = allocator.deleter
1770
1839
 
1771
1840
  def _init_annotation(self, dtype, ndim):
1772
1841
  self.dtype = dtype
@@ -1777,12 +1846,20 @@ class array(Array):
1777
1846
  self.strides = (0,) * ndim
1778
1847
  self.ptr = None
1779
1848
  self.device = None
1780
- self.owner = False
1781
1849
  self.pinned = False
1782
1850
  self.is_contiguous = False
1783
1851
 
1852
+ def __del__(self):
1853
+
1854
+ if self.deleter is None:
1855
+ return
1856
+
1857
+ with self.device.context_guard:
1858
+ self.deleter(self.ptr, self.capacity)
1859
+
1784
1860
  @property
1785
1861
  def __array_interface__(self):
1862
+
1786
1863
  # raising an AttributeError here makes hasattr() return False
1787
1864
  if self.device is None or not self.device.is_cpu:
1788
1865
  raise AttributeError(f"__array_interface__ not supported because device is {self.device}")
@@ -1819,6 +1896,7 @@ class array(Array):
1819
1896
 
1820
1897
  @property
1821
1898
  def __cuda_array_interface__(self):
1899
+
1822
1900
  # raising an AttributeError here makes hasattr() return False
1823
1901
  if self.device is None or not self.device.is_cuda:
1824
1902
  raise AttributeError(f"__cuda_array_interface__ is not supported because device is {self.device}")
@@ -1845,11 +1923,45 @@ class array(Array):
1845
1923
 
1846
1924
  return self._array_interface
1847
1925
 
1848
- def __del__(self):
1849
- if self.owner:
1850
- # use CUDA context guard to avoid side effects during garbage collection
1851
- with self.device.context_guard:
1852
- self.device.allocator.free(self.ptr, self.capacity, self.pinned)
1926
+ def __dlpack__(self, stream=None):
1927
+ # See https://data-apis.org/array-api/2022.12/API_specification/generated/array_api.array.__dlpack__.html
1928
+
1929
+ if self.device is None:
1930
+ raise RuntimeError("Array has no device assigned")
1931
+
1932
+ if self.device.is_cuda and stream != -1:
1933
+ if not isinstance(stream, int):
1934
+ raise TypeError("DLPack stream must be an integer or None")
1935
+
1936
+ # assume that the array is being used on its device's current stream
1937
+ array_stream = self.device.stream
1938
+
1939
+ # the external stream should wait for outstanding operations to complete
1940
+ if stream in (None, 0, 1):
1941
+ external_stream = 0
1942
+ else:
1943
+ external_stream = stream
1944
+
1945
+ # Performance note: avoid wrapping the external stream in a temporary Stream object
1946
+ if external_stream != array_stream.cuda_stream:
1947
+ warp.context.runtime.core.cuda_stream_wait_stream(
1948
+ external_stream, array_stream.cuda_stream, array_stream.cached_event.cuda_event
1949
+ )
1950
+
1951
+ return warp.dlpack.to_dlpack(self)
1952
+
1953
+ def __dlpack_device__(self):
1954
+ # See https://data-apis.org/array-api/2022.12/API_specification/generated/array_api.array.__dlpack_device__.html
1955
+
1956
+ if self.device is None:
1957
+ raise RuntimeError("Array has no device assigned")
1958
+
1959
+ if self.device.is_cuda:
1960
+ return (warp.dlpack.DLDeviceType.kDLCUDA, self.device.ordinal)
1961
+ elif self.pinned:
1962
+ return (warp.dlpack.DLDeviceType.kDLCUDAHost, 0)
1963
+ else:
1964
+ return (warp.dlpack.DLDeviceType.kDLCPU, 0)
1853
1965
 
1854
1966
  def __len__(self):
1855
1967
  return self.shape[0]
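The `__dlpack__` and `__dlpack_device__` methods added in this hunk let other frameworks consume Warp arrays through the standard DLPack protocol without an explicit `warp.dlpack.to_dlpack()` call. A minimal sketch of zero-copy sharing with NumPy (assumes NumPy >= 1.22 for `np.from_dlpack` and a CPU array):

    import numpy as np
    import warp as wp

    wp.init()

    a = wp.zeros(8, dtype=wp.float32, device="cpu")

    view = np.from_dlpack(a)   # goes through a.__dlpack_device__() / a.__dlpack__()
    view[:] = 1.0

    print(a.numpy())           # the Warp array sees the same memory: all ones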
@@ -1949,7 +2061,6 @@ class array(Array):
1949
2061
  strides=tuple(new_strides),
1950
2062
  device=self.grad.device,
1951
2063
  pinned=self.grad.pinned,
1952
- owner=False,
1953
2064
  )
1954
2065
  # store back-ref to stop data being destroyed
1955
2066
  new_grad._ref = self.grad
@@ -1963,7 +2074,6 @@ class array(Array):
1963
2074
  strides=tuple(new_strides),
1964
2075
  device=self.device,
1965
2076
  pinned=self.pinned,
1966
- owner=False,
1967
2077
  grad=new_grad,
1968
2078
  )
1969
2079
 
@@ -2002,7 +2112,7 @@ class array(Array):
2002
2112
  n = other.shape[1]
2003
2113
  c = warp.zeros(shape=(m, n), dtype=self.dtype, device=self.device, requires_grad=True)
2004
2114
  d = warp.zeros(shape=(m, n), dtype=self.dtype, device=self.device, requires_grad=True)
2005
- matmul(self, other, c, d, device=self.device)
2115
+ matmul(self, other, c, d)
2006
2116
  return d
2007
2117
 
2008
2118
  @property
@@ -2046,10 +2156,9 @@ class array(Array):
2046
2156
  self.ctype = None
2047
2157
 
2048
2158
  def _alloc_grad(self):
2049
- self._grad = array(
2159
+ self._grad = warp.zeros(
2050
2160
  dtype=self.dtype, shape=self.shape, strides=self.strides, device=self.device, pinned=self.pinned
2051
2161
  )
2052
- self._grad.zero_()
2053
2162
 
2054
2163
  # trigger re-creation of C-representation
2055
2164
  self.ctype = None
@@ -2246,7 +2355,6 @@ class array(Array):
2246
2355
  device=self.device,
2247
2356
  pinned=self.pinned,
2248
2357
  copy=False,
2249
- owner=False,
2250
2358
  grad=None if self.grad is None else self.grad.flatten(),
2251
2359
  )
2252
2360
 
@@ -2308,7 +2416,6 @@ class array(Array):
2308
2416
  device=self.device,
2309
2417
  pinned=self.pinned,
2310
2418
  copy=False,
2311
- owner=False,
2312
2419
  grad=None if self.grad is None else self.grad.reshape(shape),
2313
2420
  )
2314
2421
 
@@ -2332,7 +2439,6 @@ class array(Array):
2332
2439
  device=self.device,
2333
2440
  pinned=self.pinned,
2334
2441
  copy=False,
2335
- owner=False,
2336
2442
  grad=None if self.grad is None else self.grad.view(dtype),
2337
2443
  )
2338
2444
 
@@ -2384,7 +2490,6 @@ class array(Array):
2384
2490
  device=self.device,
2385
2491
  pinned=self.pinned,
2386
2492
  copy=False,
2387
- owner=False,
2388
2493
  grad=None if self.grad is None else self.grad.transpose(axes=axes),
2389
2494
  )
2390
2495
 
@@ -2427,7 +2532,6 @@ def from_ptr(ptr, length, dtype=None, shape=None, device=None):
2427
2532
  ptr=0 if ptr == 0 else ctypes.cast(ptr, ctypes.POINTER(ctypes.c_size_t)).contents.value,
2428
2533
  shape=shape,
2429
2534
  device=device,
2430
- owner=False,
2431
2535
  requires_grad=False,
2432
2536
  )
2433
2537
 
@@ -2682,6 +2786,8 @@ class Bvh:
2682
2786
  uppers (:class:`warp.array`): Array of upper bounds :class:`warp.vec3`
2683
2787
  """
2684
2788
 
2789
+ self.id = 0
2790
+
2685
2791
  if len(lowers) != len(uppers):
2686
2792
  raise RuntimeError("Bvh the same number of lower and upper bounds must be provided")
2687
2793
 
@@ -2704,39 +2810,35 @@ class Bvh:
2704
2810
  else:
2705
2811
  return ctypes.c_void_p(0)
2706
2812
 
2707
- from warp.context import runtime
2813
+ self.runtime = warp.context.runtime
2708
2814
 
2709
2815
  if self.device.is_cpu:
2710
- self.id = runtime.core.bvh_create_host(get_data(lowers), get_data(uppers), int(len(lowers)))
2816
+ self.id = self.runtime.core.bvh_create_host(get_data(lowers), get_data(uppers), int(len(lowers)))
2711
2817
  else:
2712
- self.id = runtime.core.bvh_create_device(
2818
+ self.id = self.runtime.core.bvh_create_device(
2713
2819
  self.device.context, get_data(lowers), get_data(uppers), int(len(lowers))
2714
2820
  )
2715
2821
 
2716
2822
  def __del__(self):
2717
- try:
2718
- from warp.context import runtime
2719
2823
 
2720
- if self.device.is_cpu:
2721
- runtime.core.bvh_destroy_host(self.id)
2722
- else:
2723
- # use CUDA context guard to avoid side effects during garbage collection
2724
- with self.device.context_guard:
2725
- runtime.core.bvh_destroy_device(self.id)
2824
+ if not self.id:
2825
+ return
2726
2826
 
2727
- except Exception:
2728
- pass
2827
+ if self.device.is_cpu:
2828
+ self.runtime.core.bvh_destroy_host(self.id)
2829
+ else:
2830
+ # use CUDA context guard to avoid side effects during garbage collection
2831
+ with self.device.context_guard:
2832
+ self.runtime.core.bvh_destroy_device(self.id)
2729
2833
 
2730
2834
  def refit(self):
2731
2835
  """Refit the BVH. This should be called after users modify the `lowers` and `uppers` arrays."""
2732
2836
 
2733
- from warp.context import runtime
2734
-
2735
2837
  if self.device.is_cpu:
2736
- runtime.core.bvh_refit_host(self.id)
2838
+ self.runtime.core.bvh_refit_host(self.id)
2737
2839
  else:
2738
- runtime.core.bvh_refit_device(self.id)
2739
- runtime.verify_cuda_device(self.device)
2840
+ self.runtime.core.bvh_refit_device(self.id)
2841
+ self.runtime.verify_cuda_device(self.device)
2740
2842
 
2741
2843
 
2742
2844
  class Mesh:
@@ -2762,6 +2864,8 @@ class Mesh:
2762
2864
  support_winding_number (bool): If true the mesh will build additional datastructures to support `wp.mesh_query_point_sign_winding_number()` queries
2763
2865
  """
2764
2866
 
2867
+ self.id = 0
2868
+
2765
2869
  if points.device != indices.device:
2766
2870
  raise RuntimeError("Mesh points and indices must live on the same device")
2767
2871
 
@@ -2782,10 +2886,10 @@ class Mesh:
2782
2886
  self.velocities = velocities
2783
2887
  self.indices = indices
2784
2888
 
2785
- from warp.context import runtime
2889
+ self.runtime = warp.context.runtime
2786
2890
 
2787
2891
  if self.device.is_cpu:
2788
- self.id = runtime.core.mesh_create_host(
2892
+ self.id = self.runtime.core.mesh_create_host(
2789
2893
  points.__ctype__(),
2790
2894
  velocities.__ctype__() if velocities else array().__ctype__(),
2791
2895
  indices.__ctype__(),
@@ -2794,7 +2898,7 @@ class Mesh:
2794
2898
  int(support_winding_number),
2795
2899
  )
2796
2900
  else:
2797
- self.id = runtime.core.mesh_create_device(
2901
+ self.id = self.runtime.core.mesh_create_device(
2798
2902
  self.device.context,
2799
2903
  points.__ctype__(),
2800
2904
  velocities.__ctype__() if velocities else array().__ctype__(),
@@ -2805,28 +2909,25 @@ class Mesh:
2805
2909
  )
2806
2910
 
2807
2911
  def __del__(self):
2808
- try:
2809
- from warp.context import runtime
2810
2912
 
2811
- if self.device.is_cpu:
2812
- runtime.core.mesh_destroy_host(self.id)
2813
- else:
2814
- # use CUDA context guard to avoid side effects during garbage collection
2815
- with self.device.context_guard:
2816
- runtime.core.mesh_destroy_device(self.id)
2817
- except Exception:
2818
- pass
2913
+ if not self.id:
2914
+ return
2915
+
2916
+ if self.device.is_cpu:
2917
+ self.runtime.core.mesh_destroy_host(self.id)
2918
+ else:
2919
+ # use CUDA context guard to avoid side effects during garbage collection
2920
+ with self.device.context_guard:
2921
+ self.runtime.core.mesh_destroy_device(self.id)
2819
2922
 
2820
2923
  def refit(self):
2821
2924
  """Refit the BVH to points. This should be called after users modify the `points` data."""
2822
2925
 
2823
- from warp.context import runtime
2824
-
2825
2926
  if self.device.is_cpu:
2826
- runtime.core.mesh_refit_host(self.id)
2927
+ self.runtime.core.mesh_refit_host(self.id)
2827
2928
  else:
2828
- runtime.core.mesh_refit_device(self.id)
2829
- runtime.verify_cuda_device(self.device)
2929
+ self.runtime.core.mesh_refit_device(self.id)
2930
+ self.runtime.verify_cuda_device(self.device)
2830
2931
 
2831
2932
 
2832
2933
  class Volume:
@@ -2844,9 +2945,8 @@ class Volume:
2844
2945
 
2845
2946
  self.id = 0
2846
2947
 
2847
- from warp.context import runtime
2848
-
2849
- self.context = runtime
2948
+ # keep a runtime reference for orderly destruction
2949
+ self.runtime = warp.context.runtime
2850
2950
 
2851
2951
  if data is None:
2852
2952
  return
@@ -2856,9 +2956,9 @@ class Volume:
2856
2956
  self.device = data.device
2857
2957
 
2858
2958
  if self.device.is_cpu:
2859
- self.id = self.context.core.volume_create_host(ctypes.cast(data.ptr, ctypes.c_void_p), data.size)
2959
+ self.id = self.runtime.core.volume_create_host(ctypes.cast(data.ptr, ctypes.c_void_p), data.size)
2860
2960
  else:
2861
- self.id = self.context.core.volume_create_device(
2961
+ self.id = self.runtime.core.volume_create_device(
2862
2962
  self.device.context, ctypes.cast(data.ptr, ctypes.c_void_p), data.size
2863
2963
  )
2864
2964
 
@@ -2866,31 +2966,26 @@ class Volume:
2866
2966
  raise RuntimeError("Failed to create volume from input array")
2867
2967
 
2868
2968
  def __del__(self):
2869
- if self.id == 0:
2870
- return
2871
-
2872
- try:
2873
- from warp.context import runtime
2874
2969
 
2875
- if self.device.is_cpu:
2876
- runtime.core.volume_destroy_host(self.id)
2877
- else:
2878
- # use CUDA context guard to avoid side effects during garbage collection
2879
- with self.device.context_guard:
2880
- runtime.core.volume_destroy_device(self.id)
2970
+ if not self.id:
2971
+ return
2881
2972
 
2882
- except Exception:
2883
- pass
2973
+ if self.device.is_cpu:
2974
+ self.runtime.core.volume_destroy_host(self.id)
2975
+ else:
2976
+ # use CUDA context guard to avoid side effects during garbage collection
2977
+ with self.device.context_guard:
2978
+ self.runtime.core.volume_destroy_device(self.id)
2884
2979
 
2885
2980
  def array(self) -> array:
2886
2981
  """Returns the raw memory buffer of the Volume as an array"""
2887
2982
  buf = ctypes.c_void_p(0)
2888
2983
  size = ctypes.c_uint64(0)
2889
2984
  if self.device.is_cpu:
2890
- self.context.core.volume_get_buffer_info_host(self.id, ctypes.byref(buf), ctypes.byref(size))
2985
+ self.runtime.core.volume_get_buffer_info_host(self.id, ctypes.byref(buf), ctypes.byref(size))
2891
2986
  else:
2892
- self.context.core.volume_get_buffer_info_device(self.id, ctypes.byref(buf), ctypes.byref(size))
2893
- return array(ptr=buf.value, dtype=uint8, shape=size.value, device=self.device, owner=False)
2987
+ self.runtime.core.volume_get_buffer_info_device(self.id, ctypes.byref(buf), ctypes.byref(size))
2988
+ return array(ptr=buf.value, dtype=uint8, shape=size.value, device=self.device)
2894
2989
 
2895
2990
  def get_tiles(self) -> array:
2896
2991
  if self.id == 0:
@@ -2899,18 +2994,24 @@ class Volume:
2899
2994
  buf = ctypes.c_void_p(0)
2900
2995
  size = ctypes.c_uint64(0)
2901
2996
  if self.device.is_cpu:
2902
- self.context.core.volume_get_tiles_host(self.id, ctypes.byref(buf), ctypes.byref(size))
2997
+ self.runtime.core.volume_get_tiles_host(self.id, ctypes.byref(buf), ctypes.byref(size))
2998
+ deleter = self.device.default_allocator.deleter
2903
2999
  else:
2904
- self.context.core.volume_get_tiles_device(self.id, ctypes.byref(buf), ctypes.byref(size))
3000
+ self.runtime.core.volume_get_tiles_device(self.id, ctypes.byref(buf), ctypes.byref(size))
3001
+ if self.device.is_mempool_supported:
3002
+ deleter = self.device.mempool_allocator.deleter
3003
+ else:
3004
+ deleter = self.device.default_allocator.deleter
2905
3005
  num_tiles = size.value // (3 * 4)
2906
- return array(ptr=buf.value, dtype=int32, shape=(num_tiles, 3), device=self.device, owner=True)
3006
+
3007
+ return array(ptr=buf.value, dtype=int32, shape=(num_tiles, 3), device=self.device, deleter=deleter)
2907
3008
 
2908
3009
  def get_voxel_size(self) -> Tuple[float, float, float]:
2909
3010
  if self.id == 0:
2910
3011
  raise RuntimeError("Invalid Volume")
2911
3012
 
2912
3013
  dx, dy, dz = ctypes.c_float(0), ctypes.c_float(0), ctypes.c_float(0)
2913
- self.context.core.volume_get_voxel_size(self.id, ctypes.byref(dx), ctypes.byref(dy), ctypes.byref(dz))
3014
+ self.runtime.core.volume_get_voxel_size(self.id, ctypes.byref(dx), ctypes.byref(dy), ctypes.byref(dz))
2914
3015
  return (dx.value, dy.value, dz.value)
2915
3016
 
2916
3017
  @classmethod
@@ -3109,9 +3210,7 @@ class Volume:
3109
3210
  device (Devicelike): The CUDA device to create the volume on, e.g.: "cuda" or "cuda:0".
3110
3211
 
3111
3212
  """
3112
- from warp.context import runtime
3113
-
3114
- device = runtime.get_device(device)
3213
+ device = warp.get_device(device)
3115
3214
 
3116
3215
  if voxel_size <= 0.0:
3117
3216
  raise RuntimeError(f"Voxel size must be positive! Got {voxel_size}")
@@ -3130,7 +3229,7 @@ class Volume:
3130
3229
  volume.device = device
3131
3230
  in_world_space = tile_points.dtype == vec3
3132
3231
  if hasattr(bg_value, "__len__"):
3133
- volume.id = volume.context.core.volume_v_from_tiles_device(
3232
+ volume.id = volume.runtime.core.volume_v_from_tiles_device(
3134
3233
  volume.device.context,
3135
3234
  ctypes.c_void_p(tile_points.ptr),
3136
3235
  tile_points.shape[0],
@@ -3144,7 +3243,7 @@ class Volume:
3144
3243
  in_world_space,
3145
3244
  )
3146
3245
  elif isinstance(bg_value, int):
3147
- volume.id = volume.context.core.volume_i_from_tiles_device(
3246
+ volume.id = volume.runtime.core.volume_i_from_tiles_device(
3148
3247
  volume.device.context,
3149
3248
  ctypes.c_void_p(tile_points.ptr),
3150
3249
  tile_points.shape[0],
@@ -3156,7 +3255,7 @@ class Volume:
3156
3255
  in_world_space,
3157
3256
  )
3158
3257
  else:
3159
- volume.id = volume.context.core.volume_f_from_tiles_device(
3258
+ volume.id = volume.runtime.core.volume_f_from_tiles_device(
3160
3259
  volume.device.context,
3161
3260
  ctypes.c_void_p(tile_points.ptr),
3162
3261
  tile_points.shape[0],
@@ -3194,6 +3293,7 @@ class mesh_query_point_t:
3194
3293
  :func:`mesh_query_point_sign_normal`,
3195
3294
  and :func:`mesh_query_point_sign_winding_number`.
3196
3295
  """
3296
+
3197
3297
  from warp.codegen import Var
3198
3298
 
3199
3299
  vars = {
@@ -3222,6 +3322,7 @@ class mesh_query_ray_t:
3222
3322
  See Also:
3223
3323
  :func:`mesh_query_ray`.
3224
3324
  """
3325
+
3225
3326
  from warp.codegen import Var
3226
3327
 
3227
3328
  vars = {
@@ -3243,7 +3344,6 @@ def matmul(
3243
3344
  alpha: float = 1.0,
3244
3345
  beta: float = 0.0,
3245
3346
  allow_tf32x3_arith: builtins.bool = False,
3246
- device=None,
3247
3347
  ):
3248
3348
  """Computes a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
3249
3349
 
@@ -3256,14 +3356,12 @@ def matmul(
3256
3356
  beta (float): parameter beta of GEMM
3257
3357
  allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
3258
3358
  while using Tensor Cores
3259
- device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.
3260
3359
  """
3261
3360
  from warp.context import runtime
3262
3361
 
3263
- if device is None:
3264
- device = runtime.get_device(device)
3362
+ device = a.device
3265
3363
 
3266
- if a.device != device or b.device != device or c.device != device or d.device != device:
3364
+ if b.device != device or c.device != device or d.device != device:
3267
3365
  raise RuntimeError("Matrices A, B, C, and D must all be on the same device as the runtime device.")
3268
3366
 
3269
3367
  if a.dtype != b.dtype or a.dtype != c.dtype or a.dtype != d.dtype:
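`wp.matmul()` (and, further below, its adjoint and batched variants) no longer takes a `device` argument; the device is now taken from `a`, and all operands must live on it. A minimal sketch of the updated call (float32 operands assumed; the CUTLASS path requires a CUDA device):

    import warp as wp

    wp.init()

    m, k, n = 64, 32, 16
    a = wp.zeros((m, k), dtype=wp.float32, device="cuda:0")
    b = wp.zeros((k, n), dtype=wp.float32, device="cuda:0")
    c = wp.zeros((m, n), dtype=wp.float32, device="cuda:0")
    d = wp.zeros((m, n), dtype=wp.float32, device="cuda:0")

    # d = alpha * (a @ b) + beta * c, run on a.device -- no device argument anymore
    wp.matmul(a, b, c, d, alpha=1.0, beta=0.0)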
@@ -3271,7 +3369,12 @@ def matmul(
3271
3369
  "wp.matmul currently only supports operation between {A, B, C, D} matrices of the same type."
3272
3370
  )
3273
3371
 
3274
- if (not a.is_contiguous and not a.is_transposed) or (not b.is_contiguous and not b.is_transposed) or (not c.is_contiguous) or (not d.is_contiguous):
3372
+ if (
3373
+ (not a.is_contiguous and not a.is_transposed)
3374
+ or (not b.is_contiguous and not b.is_transposed)
3375
+ or (not c.is_contiguous)
3376
+ or (not d.is_contiguous)
3377
+ ):
3275
3378
  raise RuntimeError(
3276
3379
  "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
3277
3380
  )
@@ -3286,9 +3389,7 @@ def matmul(
3286
3389
 
3287
3390
  if runtime.tape:
3288
3391
  runtime.tape.record_func(
3289
- backward=lambda: adj_matmul(
3290
- a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith, device
3291
- ),
3392
+ backward=lambda: adj_matmul(a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith),
3292
3393
  arrays=[a, b, c, d],
3293
3394
  )
3294
3395
 
@@ -3299,6 +3400,7 @@ def matmul(
3299
3400
 
3300
3401
  cc = device.arch
3301
3402
  ret = runtime.core.cutlass_gemm(
3403
+ device.context,
3302
3404
  cc,
3303
3405
  m,
3304
3406
  n,
@@ -3330,7 +3432,6 @@ def adj_matmul(
3330
3432
  alpha: float = 1.0,
3331
3433
  beta: float = 0.0,
3332
3434
  allow_tf32x3_arith: builtins.bool = False,
3333
- device=None,
3334
3435
  ):
3335
3436
  """Computes the adjoint of a generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
3336
3437
  note: the adjoint of parameter alpha is not included but can be computed as `adj_alpha = np.sum(np.concatenate(np.multiply(a @ b, adj_d)))`.
@@ -3348,16 +3449,13 @@ def adj_matmul(
3348
3449
  beta (float): parameter beta of GEMM
3349
3450
  allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
3350
3451
  while using Tensor Cores
3351
- device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.
3352
3452
  """
3353
3453
  from warp.context import runtime
3354
3454
 
3355
- if device is None:
3356
- device = runtime.get_device(device)
3455
+ device = a.device
3357
3456
 
3358
3457
  if (
3359
- a.device != device
3360
- or b.device != device
3458
+ b.device != device
3361
3459
  or c.device != device
3362
3460
  or adj_a.device != device
3363
3461
  or adj_b.device != device
@@ -3392,7 +3490,7 @@ def adj_matmul(
3392
3490
  raise RuntimeError(
3393
3491
  "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B and their associated adjoints may be transposed."
3394
3492
  )
3395
-
3493
+
3396
3494
  m = a.shape[0]
3397
3495
  n = b.shape[1]
3398
3496
  k = a.shape[1]
@@ -3423,6 +3521,7 @@ def adj_matmul(
3423
3521
  # adj_a
3424
3522
  if not a.is_transposed:
3425
3523
  ret = runtime.core.cutlass_gemm(
3524
+ device.context,
3426
3525
  cc,
3427
3526
  m,
3428
3527
  k,
@@ -3443,6 +3542,7 @@ def adj_matmul(
3443
3542
  raise RuntimeError("adj_matmul failed.")
3444
3543
  else:
3445
3544
  ret = runtime.core.cutlass_gemm(
3545
+ device.context,
3446
3546
  cc,
3447
3547
  k,
3448
3548
  m,
@@ -3465,6 +3565,7 @@ def adj_matmul(
3465
3565
  # adj_b
3466
3566
  if not b.is_transposed:
3467
3567
  ret = runtime.core.cutlass_gemm(
3568
+ device.context,
3468
3569
  cc,
3469
3570
  k,
3470
3571
  n,
@@ -3485,6 +3586,7 @@ def adj_matmul(
3485
3586
  raise RuntimeError("adj_matmul failed.")
3486
3587
  else:
3487
3588
  ret = runtime.core.cutlass_gemm(
3589
+ device.context,
3488
3590
  cc,
3489
3591
  n,
3490
3592
  k,
@@ -3502,7 +3604,7 @@ def adj_matmul(
3502
3604
  1,
3503
3605
  )
3504
3606
  if not ret:
3505
- raise RuntimeError("adj_matmul failed.")
3607
+ raise RuntimeError("adj_matmul failed.")
3506
3608
 
3507
3609
  # adj_c
3508
3610
  warp.launch(
@@ -3510,7 +3612,7 @@ def adj_matmul(
3510
3612
  dim=adj_c.shape,
3511
3613
  inputs=[adj_c, adj_d, adj_d.dtype(beta)],
3512
3614
  device=device,
3513
- record_tape=False
3615
+ record_tape=False,
3514
3616
  )
3515
3617
 
3516
3618
 
@@ -3522,7 +3624,6 @@ def batched_matmul(
3522
3624
  alpha: float = 1.0,
3523
3625
  beta: float = 0.0,
3524
3626
  allow_tf32x3_arith: builtins.bool = False,
3525
- device=None,
3526
3627
  ):
3527
3628
  """Computes a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
3528
3629
 
@@ -3535,14 +3636,12 @@ def batched_matmul(
3535
3636
  beta (float): parameter beta of GEMM
3536
3637
  allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
3537
3638
  while using Tensor Cores
3538
- device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.
3539
3639
  """
3540
3640
  from warp.context import runtime
3541
3641
 
3542
- if device is None:
3543
- device = runtime.get_device(device)
3642
+ device = a.device
3544
3643
 
3545
- if a.device != device or b.device != device or c.device != device or d.device != device:
3644
+ if b.device != device or c.device != device or d.device != device:
3546
3645
  raise RuntimeError("Matrices A, B, C, and D must all be on the same device as the runtime device.")
3547
3646
 
3548
3647
  if a.dtype != b.dtype or a.dtype != c.dtype or a.dtype != d.dtype:
@@ -3550,7 +3649,12 @@ def batched_matmul(
3550
3649
  "wp.batched_matmul currently only supports operation between {A, B, C, D} matrices of the same type."
3551
3650
  )
3552
3651
 
3553
- if (not a.is_contiguous and not a.is_transposed) or (not b.is_contiguous and not b.is_transposed) or (not c.is_contiguous) or (not d.is_contiguous):
3652
+ if (
3653
+ (not a.is_contiguous and not a.is_transposed)
3654
+ or (not b.is_contiguous and not b.is_transposed)
3655
+ or (not c.is_contiguous)
3656
+ or (not d.is_contiguous)
3657
+ ):
3554
3658
  raise RuntimeError(
3555
3659
  "wp.matmul is only valid for contiguous arrays, with the exception that A and/or B may be transposed."
3556
3660
  )
@@ -3567,7 +3671,7 @@ def batched_matmul(
3567
3671
  if runtime.tape:
3568
3672
  runtime.tape.record_func(
3569
3673
  backward=lambda: adj_batched_matmul(
3570
- a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith, device
3674
+ a, b, c, a.grad, b.grad, c.grad, d.grad, alpha, beta, allow_tf32x3_arith
3571
3675
  ),
3572
3676
  arrays=[a, b, c, d],
3573
3677
  )
@@ -3587,15 +3691,16 @@ def batched_matmul(
3587
3691
  idx_start = i * max_batch_count
3588
3692
  idx_end = (i + 1) * max_batch_count if i < iters - 1 else batch_count
3589
3693
  ret = runtime.core.cutlass_gemm(
3694
+ device.context,
3590
3695
  cc,
3591
3696
  m,
3592
3697
  n,
3593
3698
  k,
3594
3699
  type_typestr(a.dtype).encode(),
3595
- ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
3596
- ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
3597
- ctypes.c_void_p(c[idx_start:idx_end,:,:].ptr),
3598
- ctypes.c_void_p(d[idx_start:idx_end,:,:].ptr),
3700
+ ctypes.c_void_p(a[idx_start:idx_end, :, :].ptr),
3701
+ ctypes.c_void_p(b[idx_start:idx_end, :, :].ptr),
3702
+ ctypes.c_void_p(c[idx_start:idx_end, :, :].ptr),
3703
+ ctypes.c_void_p(d[idx_start:idx_end, :, :].ptr),
3599
3704
  alpha,
3600
3705
  beta,
3601
3706
  not a.is_transposed,
@@ -3605,18 +3710,19 @@ def batched_matmul(
3605
3710
  )
3606
3711
  if not ret:
3607
3712
  raise RuntimeError("Batched matmul failed.")
3608
-
3713
+
3609
3714
  idx_start = iters * max_batch_count
3610
3715
  ret = runtime.core.cutlass_gemm(
3716
+ device.context,
3611
3717
  cc,
3612
3718
  m,
3613
3719
  n,
3614
3720
  k,
3615
3721
  type_typestr(a.dtype).encode(),
3616
- ctypes.c_void_p(a[idx_start:,:,:].ptr),
3617
- ctypes.c_void_p(b[idx_start:,:,:].ptr),
3618
- ctypes.c_void_p(c[idx_start:,:,:].ptr),
3619
- ctypes.c_void_p(d[idx_start:,:,:].ptr),
3722
+ ctypes.c_void_p(a[idx_start:, :, :].ptr),
3723
+ ctypes.c_void_p(b[idx_start:, :, :].ptr),
3724
+ ctypes.c_void_p(c[idx_start:, :, :].ptr),
3725
+ ctypes.c_void_p(d[idx_start:, :, :].ptr),
3620
3726
  alpha,
3621
3727
  beta,
3622
3728
  not a.is_transposed,
@@ -3625,7 +3731,7 @@ def batched_matmul(
3625
3731
  remainder,
3626
3732
  )
3627
3733
  if not ret:
3628
- raise RuntimeError("Batched matmul failed.")
3734
+ raise RuntimeError("Batched matmul failed.")
3629
3735
 
3630
3736
 
3631
3737
  def adj_batched_matmul(
@@ -3639,7 +3745,6 @@ def adj_batched_matmul(
3639
3745
  alpha: float = 1.0,
3640
3746
  beta: float = 0.0,
3641
3747
  allow_tf32x3_arith: builtins.bool = False,
3642
- device=None,
3643
3748
  ):
3644
3749
  """Computes a batched generic matrix-matrix multiplication (GEMM) of the form: `d = alpha * (a @ b) + beta * c`.
3645
3750
 
@@ -3655,16 +3760,13 @@ def adj_batched_matmul(
3655
3760
  beta (float): parameter beta of GEMM
3656
3761
  allow_tf32x3_arith (bool): whether to use CUTLASS's 3xTF32 GEMMs, which enable accuracy similar to FP32
3657
3762
  while using Tensor Cores
3658
- device: device we want to use to multiply matrices. Defaults to active runtime device. If "cpu", resorts to using numpy multiplication.
3659
3763
  """
3660
3764
  from warp.context import runtime
3661
3765
 
3662
- if device is None:
3663
- device = runtime.get_device(device)
3766
+ device = a.device
3664
3767
 
3665
3768
  if (
3666
- a.device != device
3667
- or b.device != device
3769
+ b.device != device
3668
3770
  or c.device != device
3669
3771
  or adj_a.device != device
3670
3772
  or adj_b.device != device
@@ -3739,15 +3841,16 @@ def adj_batched_matmul(
3739
3841
  # adj_a
3740
3842
  if not a.is_transposed:
3741
3843
  ret = runtime.core.cutlass_gemm(
3844
+ device.context,
3742
3845
  cc,
3743
3846
  m,
3744
3847
  k,
3745
3848
  n,
3746
3849
  type_typestr(a.dtype).encode(),
3747
- ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
3748
- ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
3749
- ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
3750
- ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
3850
+ ctypes.c_void_p(adj_d[idx_start:idx_end, :, :].ptr),
3851
+ ctypes.c_void_p(b[idx_start:idx_end, :, :].ptr),
3852
+ ctypes.c_void_p(adj_a[idx_start:idx_end, :, :].ptr),
3853
+ ctypes.c_void_p(adj_a[idx_start:idx_end, :, :].ptr),
3751
3854
  alpha,
3752
3855
  1.0,
3753
3856
  True,
@@ -3759,15 +3862,16 @@ def adj_batched_matmul(
3759
3862
  raise RuntimeError("adj_matmul failed.")
3760
3863
  else:
3761
3864
  ret = runtime.core.cutlass_gemm(
3865
+ device.context,
3762
3866
  cc,
3763
3867
  k,
3764
3868
  m,
3765
3869
  n,
3766
3870
  type_typestr(a.dtype).encode(),
3767
- ctypes.c_void_p(b[idx_start:idx_end,:,:].ptr),
3768
- ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
3769
- ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
3770
- ctypes.c_void_p(adj_a[idx_start:idx_end,:,:].ptr),
3871
+ ctypes.c_void_p(b[idx_start:idx_end, :, :].ptr),
3872
+ ctypes.c_void_p(adj_d[idx_start:idx_end, :, :].ptr),
3873
+ ctypes.c_void_p(adj_a[idx_start:idx_end, :, :].ptr),
3874
+ ctypes.c_void_p(adj_a[idx_start:idx_end, :, :].ptr),
3771
3875
  alpha,
3772
3876
  1.0,
3773
3877
  not b.is_transposed,
@@ -3781,15 +3885,16 @@ def adj_batched_matmul(
3781
3885
  # adj_b
3782
3886
  if not b.is_transposed:
3783
3887
  ret = runtime.core.cutlass_gemm(
3888
+ device.context,
3784
3889
  cc,
3785
3890
  k,
3786
3891
  n,
3787
3892
  m,
3788
3893
  type_typestr(a.dtype).encode(),
3789
- ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
3790
- ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
3791
- ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
3792
- ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
3894
+ ctypes.c_void_p(a[idx_start:idx_end, :, :].ptr),
3895
+ ctypes.c_void_p(adj_d[idx_start:idx_end, :, :].ptr),
3896
+ ctypes.c_void_p(adj_b[idx_start:idx_end, :, :].ptr),
3897
+ ctypes.c_void_p(adj_b[idx_start:idx_end, :, :].ptr),
3793
3898
  alpha,
3794
3899
  1.0,
3795
3900
  a.is_transposed,
@@ -3801,15 +3906,16 @@ def adj_batched_matmul(
3801
3906
  raise RuntimeError("adj_matmul failed.")
3802
3907
  else:
3803
3908
  ret = runtime.core.cutlass_gemm(
3909
+ device.context,
3804
3910
  cc,
3805
3911
  n,
3806
3912
  k,
3807
3913
  m,
3808
3914
  type_typestr(a.dtype).encode(),
3809
- ctypes.c_void_p(adj_d[idx_start:idx_end,:,:].ptr),
3810
- ctypes.c_void_p(a[idx_start:idx_end,:,:].ptr),
3811
- ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
3812
- ctypes.c_void_p(adj_b[idx_start:idx_end,:,:].ptr),
3915
+ ctypes.c_void_p(adj_d[idx_start:idx_end, :, :].ptr),
3916
+ ctypes.c_void_p(a[idx_start:idx_end, :, :].ptr),
3917
+ ctypes.c_void_p(adj_b[idx_start:idx_end, :, :].ptr),
3918
+ ctypes.c_void_p(adj_b[idx_start:idx_end, :, :].ptr),
3813
3919
  alpha,
3814
3920
  1.0,
3815
3921
  False,
@@ -3818,22 +3924,23 @@ def adj_batched_matmul(
3818
3924
  max_batch_count,
3819
3925
  )
3820
3926
  if not ret:
3821
- raise RuntimeError("adj_matmul failed.")
3822
-
3927
+ raise RuntimeError("adj_matmul failed.")
3928
+
3823
3929
  idx_start = iters * max_batch_count
3824
-
3930
+
3825
3931
  # adj_a
3826
3932
  if not a.is_transposed:
3827
3933
  ret = runtime.core.cutlass_gemm(
3934
+ device.context,
3828
3935
  cc,
3829
3936
  m,
3830
3937
  k,
3831
3938
  n,
3832
3939
  type_typestr(a.dtype).encode(),
3833
- ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
3834
- ctypes.c_void_p(b[idx_start:,:,:].ptr),
3835
- ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
3836
- ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
3940
+ ctypes.c_void_p(adj_d[idx_start:, :, :].ptr),
3941
+ ctypes.c_void_p(b[idx_start:, :, :].ptr),
3942
+ ctypes.c_void_p(adj_a[idx_start:, :, :].ptr),
3943
+ ctypes.c_void_p(adj_a[idx_start:, :, :].ptr),
3837
3944
  alpha,
3838
3945
  1.0,
3839
3946
  True,
@@ -3845,15 +3952,16 @@ def adj_batched_matmul(
3845
3952
  raise RuntimeError("adj_matmul failed.")
3846
3953
  else:
3847
3954
  ret = runtime.core.cutlass_gemm(
3955
+ device.context,
3848
3956
  cc,
3849
3957
  k,
3850
3958
  m,
3851
3959
  n,
3852
3960
  type_typestr(a.dtype).encode(),
3853
- ctypes.c_void_p(b[idx_start:,:,:].ptr),
3854
- ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
3855
- ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
3856
- ctypes.c_void_p(adj_a[idx_start:,:,:].ptr),
3961
+ ctypes.c_void_p(b[idx_start:, :, :].ptr),
3962
+ ctypes.c_void_p(adj_d[idx_start:, :, :].ptr),
3963
+ ctypes.c_void_p(adj_a[idx_start:, :, :].ptr),
3964
+ ctypes.c_void_p(adj_a[idx_start:, :, :].ptr),
3857
3965
  alpha,
3858
3966
  1.0,
3859
3967
  not b.is_transposed,
@@ -3867,15 +3975,16 @@ def adj_batched_matmul(
3867
3975
  # adj_b
3868
3976
  if not b.is_transposed:
3869
3977
  ret = runtime.core.cutlass_gemm(
3978
+ device.context,
3870
3979
  cc,
3871
3980
  k,
3872
3981
  n,
3873
3982
  m,
3874
3983
  type_typestr(a.dtype).encode(),
3875
- ctypes.c_void_p(a[idx_start:,:,:].ptr),
3876
- ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
3877
- ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
3878
- ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
3984
+ ctypes.c_void_p(a[idx_start:, :, :].ptr),
3985
+ ctypes.c_void_p(adj_d[idx_start:, :, :].ptr),
3986
+ ctypes.c_void_p(adj_b[idx_start:, :, :].ptr),
3987
+ ctypes.c_void_p(adj_b[idx_start:, :, :].ptr),
3879
3988
  alpha,
3880
3989
  1.0,
3881
3990
  a.is_transposed,
@@ -3887,15 +3996,16 @@ def adj_batched_matmul(
3887
3996
  raise RuntimeError("adj_matmul failed.")
3888
3997
  else:
3889
3998
  ret = runtime.core.cutlass_gemm(
3999
+ device.context,
3890
4000
  cc,
3891
4001
  n,
3892
4002
  k,
3893
4003
  m,
3894
4004
  type_typestr(a.dtype).encode(),
3895
- ctypes.c_void_p(adj_d[idx_start:,:,:].ptr),
3896
- ctypes.c_void_p(a[idx_start:,:,:].ptr),
3897
- ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
3898
- ctypes.c_void_p(adj_b[idx_start:,:,:].ptr),
4005
+ ctypes.c_void_p(adj_d[idx_start:, :, :].ptr),
4006
+ ctypes.c_void_p(a[idx_start:, :, :].ptr),
4007
+ ctypes.c_void_p(adj_b[idx_start:, :, :].ptr),
4008
+ ctypes.c_void_p(adj_b[idx_start:, :, :].ptr),
3899
4009
  alpha,
3900
4010
  1.0,
3901
4011
  False,
@@ -3904,7 +4014,7 @@ def adj_batched_matmul(
3904
4014
  remainder,
3905
4015
  )
3906
4016
  if not ret:
3907
- raise RuntimeError("adj_matmul failed.")
4017
+ raise RuntimeError("adj_matmul failed.")
3908
4018
 
3909
4019
  # adj_c
3910
4020
  warp.launch(
@@ -3912,9 +4022,10 @@ def adj_batched_matmul(
3912
4022
  dim=adj_c.shape,
3913
4023
  inputs=[adj_c, adj_d, adj_d.dtype(beta)],
3914
4024
  device=device,
3915
- record_tape=False
4025
+ record_tape=False,
3916
4026
  )
3917
4027
 
4028
+
3918
4029
  class HashGrid:
3919
4030
  def __init__(self, dim_x, dim_y, dim_z, device=None):
3920
4031
  """Class representing a hash grid object for accelerated point queries.
@@ -3929,14 +4040,16 @@ class HashGrid:
3929
4040
  dim_z (int): Number of cells in z-axis
3930
4041
  """
3931
4042
 
3932
- from warp.context import runtime
4043
+ self.id = 0
4044
+
4045
+ self.runtime = warp.context.runtime
3933
4046
 
3934
- self.device = runtime.get_device(device)
4047
+ self.device = self.runtime.get_device(device)
3935
4048
 
3936
4049
  if self.device.is_cpu:
3937
- self.id = runtime.core.hash_grid_create_host(dim_x, dim_y, dim_z)
4050
+ self.id = self.runtime.core.hash_grid_create_host(dim_x, dim_y, dim_z)
3938
4051
  else:
3939
- self.id = runtime.core.hash_grid_create_device(self.device.context, dim_x, dim_y, dim_z)
4052
+ self.id = self.runtime.core.hash_grid_create_device(self.device.context, dim_x, dim_y, dim_z)
3940
4053
 
3941
4054
  # indicates whether the grid data has been reserved for use by a kernel
3942
4055
  self.reserved = False
@@ -3954,43 +4067,45 @@ class HashGrid:
3954
4067
  the radius used when performing queries.
3955
4068
  """
3956
4069
 
3957
- from warp.context import runtime
3958
-
3959
4070
  if self.device.is_cpu:
3960
- runtime.core.hash_grid_update_host(self.id, radius, ctypes.cast(points.ptr, ctypes.c_void_p), len(points))
4071
+ self.runtime.core.hash_grid_update_host(
4072
+ self.id, radius, ctypes.cast(points.ptr, ctypes.c_void_p), len(points)
4073
+ )
3961
4074
  else:
3962
- runtime.core.hash_grid_update_device(self.id, radius, ctypes.cast(points.ptr, ctypes.c_void_p), len(points))
4075
+ self.runtime.core.hash_grid_update_device(
4076
+ self.id, radius, ctypes.cast(points.ptr, ctypes.c_void_p), len(points)
4077
+ )
3963
4078
  self.reserved = True
3964
4079
 
3965
4080
  def reserve(self, num_points):
3966
- from warp.context import runtime
3967
4081
 
3968
4082
  if self.device.is_cpu:
3969
- runtime.core.hash_grid_reserve_host(self.id, num_points)
4083
+ self.runtime.core.hash_grid_reserve_host(self.id, num_points)
3970
4084
  else:
3971
- runtime.core.hash_grid_reserve_device(self.id, num_points)
4085
+ self.runtime.core.hash_grid_reserve_device(self.id, num_points)
3972
4086
  self.reserved = True
3973
4087
 
3974
4088
  def __del__(self):
3975
- try:
3976
- from warp.context import runtime
3977
4089
 
3978
- if self.device.is_cpu:
3979
- runtime.core.hash_grid_destroy_host(self.id)
3980
- else:
3981
- # use CUDA context guard to avoid side effects during garbage collection
3982
- with self.device.context_guard:
3983
- runtime.core.hash_grid_destroy_device(self.id)
4090
+ if not self.id:
4091
+ return
3984
4092
 
3985
- except Exception:
3986
- pass
4093
+ if self.device.is_cpu:
4094
+ self.runtime.core.hash_grid_destroy_host(self.id)
4095
+ else:
4096
+ # use CUDA context guard to avoid side effects during garbage collection
4097
+ with self.device.context_guard:
4098
+ self.runtime.core.hash_grid_destroy_device(self.id)
3987
4099
 
3988
4100
 
3989
4101
  class MarchingCubes:
3990
4102
  def __init__(self, nx: int, ny: int, nz: int, max_verts: int, max_tris: int, device=None):
3991
- from warp.context import runtime
3992
4103
 
3993
- self.device = runtime.get_device(device)
4104
+ self.id = 0
4105
+
4106
+ self.runtime = warp.context.runtime
4107
+
4108
+ self.device = self.runtime.get_device(device)
3994
4109
 
3995
4110
  if not self.device.is_cuda:
3996
4111
  raise RuntimeError("Only CUDA devices are supported for marching cubes")
@@ -4003,10 +4118,10 @@ class MarchingCubes:
4003
4118
  self.max_tris = max_tris
4004
4119
 
4005
4120
  # bindings to warp.so
4006
- self.alloc = runtime.core.marching_cubes_create_device
4121
+ self.alloc = self.runtime.core.marching_cubes_create_device
4007
4122
  self.alloc.argtypes = [ctypes.c_void_p]
4008
4123
  self.alloc.restype = ctypes.c_uint64
4009
- self.free = runtime.core.marching_cubes_destroy_device
4124
+ self.free = self.runtime.core.marching_cubes_destroy_device
4010
4125
 
4011
4126
  from warp.context import zeros
4012
4127
 
@@ -4017,6 +4132,10 @@ class MarchingCubes:
4017
4132
  self.id = ctypes.c_uint64(self.alloc(self.device.context))
4018
4133
 
4019
4134
  def __del__(self):
4135
+
4136
+ if not self.id:
4137
+ return
4138
+
4020
4139
  # use CUDA context guard to avoid side effects during garbage collection
4021
4140
  with self.device.context_guard:
4022
4141
  # destroy surfacer
@@ -4031,15 +4150,13 @@ class MarchingCubes:
4031
4150
  self.max_tris = max_tris
4032
4151
 
4033
4152
  def surface(self, field: array(dtype=float), threshold: float):
4034
- from warp.context import runtime
4035
-
4036
4153
  # WP_API int marching_cubes_surface_host(const float* field, int nx, int ny, int nz, float threshold, wp::vec3* verts, int* triangles, int max_verts, int max_tris, int* out_num_verts, int* out_num_tris);
4037
4154
  num_verts = ctypes.c_int(0)
4038
4155
  num_tris = ctypes.c_int(0)
4039
4156
 
4040
- runtime.core.marching_cubes_surface_device.restype = ctypes.c_int
4157
+ self.runtime.core.marching_cubes_surface_device.restype = ctypes.c_int
4041
4158
 
4042
- error = runtime.core.marching_cubes_surface_device(
4159
+ error = self.runtime.core.marching_cubes_surface_device(
4043
4160
  self.id,
4044
4161
  ctypes.cast(field.ptr, ctypes.c_void_p),
4045
4162
  self.nx,
@@ -4155,7 +4272,7 @@ def infer_argument_types(args, template_types, arg_names=None):
4155
4272
  arg_name = arg_names[i] if arg_names else str(i)
4156
4273
  if arg_type in warp.types.array_types:
4157
4274
  arg_types.append(arg_type(dtype=arg.dtype, ndim=arg.ndim))
4158
- elif arg_type in warp.types.scalar_types:
4275
+ elif arg_type in warp.types.scalar_types + [bool]:
4159
4276
  arg_types.append(arg_type)
4160
4277
  elif arg_type in [int, float]:
4161
4278
  # canonicalize type