warp-lang 1.5.1-py3-none-macosx_10_13_universal2.whl → 1.6.1-py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

This release of warp-lang has been flagged by the registry as potentially problematic.
Files changed (131)
  1. warp/__init__.py +5 -0
  2. warp/autograd.py +414 -191
  3. warp/bin/libwarp-clang.dylib +0 -0
  4. warp/bin/libwarp.dylib +0 -0
  5. warp/build.py +40 -12
  6. warp/build_dll.py +13 -6
  7. warp/builtins.py +1077 -481
  8. warp/codegen.py +250 -122
  9. warp/config.py +65 -21
  10. warp/context.py +500 -149
  11. warp/examples/assets/square_cloth.usd +0 -0
  12. warp/examples/benchmarks/benchmark_gemm.py +27 -18
  13. warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
  14. warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
  15. warp/examples/core/example_marching_cubes.py +1 -1
  16. warp/examples/core/example_mesh.py +1 -1
  17. warp/examples/core/example_torch.py +18 -34
  18. warp/examples/core/example_wave.py +1 -1
  19. warp/examples/fem/example_apic_fluid.py +1 -0
  20. warp/examples/fem/example_mixed_elasticity.py +1 -1
  21. warp/examples/optim/example_bounce.py +1 -1
  22. warp/examples/optim/example_cloth_throw.py +1 -1
  23. warp/examples/optim/example_diffray.py +4 -15
  24. warp/examples/optim/example_drone.py +1 -1
  25. warp/examples/optim/example_softbody_properties.py +392 -0
  26. warp/examples/optim/example_trajectory.py +1 -3
  27. warp/examples/optim/example_walker.py +5 -0
  28. warp/examples/sim/example_cartpole.py +0 -2
  29. warp/examples/sim/example_cloth_self_contact.py +314 -0
  30. warp/examples/sim/example_granular_collision_sdf.py +4 -5
  31. warp/examples/sim/example_jacobian_ik.py +0 -2
  32. warp/examples/sim/example_quadruped.py +5 -2
  33. warp/examples/tile/example_tile_cholesky.py +79 -0
  34. warp/examples/tile/example_tile_convolution.py +2 -2
  35. warp/examples/tile/example_tile_fft.py +2 -2
  36. warp/examples/tile/example_tile_filtering.py +3 -3
  37. warp/examples/tile/example_tile_matmul.py +4 -4
  38. warp/examples/tile/example_tile_mlp.py +12 -12
  39. warp/examples/tile/example_tile_nbody.py +191 -0
  40. warp/examples/tile/example_tile_walker.py +319 -0
  41. warp/math.py +147 -0
  42. warp/native/array.h +12 -0
  43. warp/native/builtin.h +0 -1
  44. warp/native/bvh.cpp +149 -70
  45. warp/native/bvh.cu +287 -68
  46. warp/native/bvh.h +195 -85
  47. warp/native/clang/clang.cpp +6 -2
  48. warp/native/crt.h +1 -0
  49. warp/native/cuda_util.cpp +35 -0
  50. warp/native/cuda_util.h +5 -0
  51. warp/native/exports.h +40 -40
  52. warp/native/intersect.h +17 -0
  53. warp/native/mat.h +57 -3
  54. warp/native/mathdx.cpp +19 -0
  55. warp/native/mesh.cpp +25 -8
  56. warp/native/mesh.cu +153 -101
  57. warp/native/mesh.h +482 -403
  58. warp/native/quat.h +40 -0
  59. warp/native/solid_angle.h +7 -0
  60. warp/native/sort.cpp +85 -0
  61. warp/native/sort.cu +34 -0
  62. warp/native/sort.h +3 -1
  63. warp/native/spatial.h +11 -0
  64. warp/native/tile.h +1189 -664
  65. warp/native/tile_reduce.h +8 -6
  66. warp/native/vec.h +41 -0
  67. warp/native/warp.cpp +8 -1
  68. warp/native/warp.cu +263 -40
  69. warp/native/warp.h +19 -5
  70. warp/optim/linear.py +22 -4
  71. warp/render/render_opengl.py +132 -59
  72. warp/render/render_usd.py +10 -2
  73. warp/sim/__init__.py +6 -1
  74. warp/sim/collide.py +289 -32
  75. warp/sim/import_urdf.py +20 -5
  76. warp/sim/integrator_euler.py +25 -7
  77. warp/sim/integrator_featherstone.py +147 -35
  78. warp/sim/integrator_vbd.py +842 -40
  79. warp/sim/model.py +173 -112
  80. warp/sim/render.py +2 -2
  81. warp/stubs.py +249 -116
  82. warp/tape.py +28 -30
  83. warp/tests/aux_test_module_unload.py +15 -0
  84. warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
  85. warp/tests/test_array.py +100 -0
  86. warp/tests/test_assert.py +242 -0
  87. warp/tests/test_codegen.py +14 -61
  88. warp/tests/test_collision.py +8 -8
  89. warp/tests/test_examples.py +16 -1
  90. warp/tests/test_grad_debug.py +87 -2
  91. warp/tests/test_hash_grid.py +1 -1
  92. warp/tests/test_ipc.py +116 -0
  93. warp/tests/test_launch.py +77 -26
  94. warp/tests/test_mat.py +213 -168
  95. warp/tests/test_math.py +47 -1
  96. warp/tests/test_matmul.py +11 -7
  97. warp/tests/test_matmul_lite.py +4 -4
  98. warp/tests/test_mesh.py +84 -60
  99. warp/tests/test_mesh_query_aabb.py +165 -0
  100. warp/tests/test_mesh_query_point.py +328 -286
  101. warp/tests/test_mesh_query_ray.py +134 -121
  102. warp/tests/test_mlp.py +2 -2
  103. warp/tests/test_operators.py +43 -0
  104. warp/tests/test_overwrite.py +6 -5
  105. warp/tests/test_quat.py +77 -0
  106. warp/tests/test_reload.py +29 -0
  107. warp/tests/test_sim_grad_bounce_linear.py +204 -0
  108. warp/tests/test_static.py +16 -0
  109. warp/tests/test_tape.py +25 -0
  110. warp/tests/test_tile.py +134 -191
  111. warp/tests/test_tile_load.py +399 -0
  112. warp/tests/test_tile_mathdx.py +61 -8
  113. warp/tests/test_tile_mlp.py +17 -17
  114. warp/tests/test_tile_reduce.py +24 -18
  115. warp/tests/test_tile_shared_memory.py +66 -17
  116. warp/tests/test_tile_view.py +165 -0
  117. warp/tests/test_torch.py +35 -0
  118. warp/tests/test_utils.py +36 -24
  119. warp/tests/test_vec.py +110 -0
  120. warp/tests/unittest_suites.py +29 -4
  121. warp/tests/unittest_utils.py +30 -11
  122. warp/thirdparty/unittest_parallel.py +5 -2
  123. warp/types.py +419 -111
  124. warp/utils.py +9 -5
  125. {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/METADATA +86 -45
  126. {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/RECORD +129 -118
  127. {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/WHEEL +1 -1
  128. warp/examples/benchmarks/benchmark_tile.py +0 -179
  129. warp/native/tile_gemm.h +0 -341
  130. {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/LICENSE.md +0 -0
  131. {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/top_level.txt +0 -0
warp/context.py CHANGED
@@ -5,6 +5,8 @@
  # distribution of this software and related documentation without an express
  # license agreement from NVIDIA CORPORATION is strictly prohibited.

+ from __future__ import annotations
+
  import ast
  import ctypes
  import errno
@@ -32,6 +34,7 @@ import warp
  import warp.build
  import warp.codegen
  import warp.config
+ from warp.types import launch_bounds_t

  # represents either a built-in or user-defined function

@@ -393,7 +396,8 @@ class Function:
  if not warp.codegen.func_match_args(f, arg_types, kwarg_types):
  continue

- if len(f.input_types) != len(arg_types):
+ acceptable_arg_num = len(f.input_types) - len(f.defaults) <= len(arg_types) <= len(f.input_types)
+ if not acceptable_arg_num:
  continue

  # try to match the given types to the function template types
@@ -410,6 +414,10 @@ class Function:

  arg_names = f.input_types.keys()
  overload_annotations = dict(zip(arg_names, arg_types))
+ # add defaults
+ for k, d in f.defaults.items():
+ if k not in overload_annotations:
+ overload_annotations[k] = warp.codegen.strip_reference(warp.codegen.get_arg_type(d))

  ovl = shallowcopy(f)
  ovl.adj = warp.codegen.Adjoint(f.func, overload_annotations)
@@ -753,8 +761,15 @@ def func(f):
  scope_locals = inspect.currentframe().f_back.f_locals

  m = get_module(f.__module__)
+ doc = getattr(f, "__doc__", "") or ""
  Function(
- func=f, key=name, namespace="", module=m, value_func=None, scope_locals=scope_locals
+ func=f,
+ key=name,
+ namespace="",
+ module=m,
+ value_func=None,
+ scope_locals=scope_locals,
+ doc=doc.strip(),
  ) # value_type not known yet, will be inferred during Adjoint.build()

  # use the top of the list of overloads for this key
@@ -1059,7 +1074,8 @@ def overload(kernel, arg_types=Union[None, Dict[str, Any], List[Any]]):
  raise RuntimeError("wp.overload() called with invalid argument!")


- builtin_functions = {}
+ # native functions that are part of the Warp API
+ builtin_functions: Dict[str, Function] = {}


  def get_generic_vtypes():
@@ -1328,6 +1344,28 @@ def add_builtin(
  setattr(warp, key, func)


+ def register_api_function(
+ function: Function,
+ group: str = "Other",
+ hidden=False,
+ ):
+ """Main entry point to register a Warp Python function to be part of the Warp API and appear in the documentation.
+
+ Args:
+ function (Function): Warp function to be registered.
+ group (str): Classification used for the documentation.
+ input_types (Mapping[str, Any]): Signature of the user-facing function.
+ Variadic arguments are supported by prefixing the parameter names
+ with asterisks as in `*args` and `**kwargs`. Generic arguments are
+ supported with types such as `Any`, `Float`, `Scalar`, etc.
+ value_type (Any): Type returned by the function.
+ hidden (bool): Whether to add that function into the documentation.
+ """
+ function.group = group
+ function.hidden = hidden
+ builtin_functions[function.key] = function
+
+
  # global dictionary of modules
  user_modules = {}

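The register_api_function() entry point above lets plain Python helpers (rather than native builtins) be listed in Warp's generated API documentation through the builtin_functions table. A minimal usage sketch, assuming that @wp.func returns the warp.context.Function wrapper the hook expects; the helper and the group name below are purely illustrative::

    import warp as wp
    import warp.context

    @wp.func
    def double_sin(x: float) -> float:
        # hypothetical helper built from existing Warp builtins
        return 2.0 * wp.sin(x)

    # surface the helper in the generated docs under an illustrative group name
    warp.context.register_api_function(double_sin, group="Utility")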
@@ -1561,6 +1599,7 @@ class ModuleBuilder:
  self.options = options
  self.module = module
  self.deferred_functions = []
+ self.fatbins = {} # map from <some identifier> to fatbins, to add at link time
  self.ltoirs = {} # map from lto symbol to lto binary
  self.ltoirs_decl = {} # map from lto symbol to lto forward declaration

@@ -1675,7 +1714,7 @@ class ModuleBuilder:

  for kernel in self.kernels:
  source += warp.codegen.codegen_kernel(kernel, device=device, options=self.options)
- source += warp.codegen.codegen_module(kernel, device=device)
+ source += warp.codegen.codegen_module(kernel, device=device, options=self.options)

  # add headers
  if device == "cpu":
@@ -1728,20 +1767,26 @@ class ModuleExec:

  name = kernel.get_mangled_name()

+ options = dict(kernel.module.options)
+ options.update(kernel.options)
+
  if self.device.is_cuda:
  forward_name = name + "_cuda_kernel_forward"
  forward_kernel = runtime.core.cuda_get_kernel(
  self.device.context, self.handle, forward_name.encode("utf-8")
  )

- backward_name = name + "_cuda_kernel_backward"
- backward_kernel = runtime.core.cuda_get_kernel(
- self.device.context, self.handle, backward_name.encode("utf-8")
- )
+ if options["enable_backward"]:
+ backward_name = name + "_cuda_kernel_backward"
+ backward_kernel = runtime.core.cuda_get_kernel(
+ self.device.context, self.handle, backward_name.encode("utf-8")
+ )
+ else:
+ backward_kernel = None

  # look up the required shared memory size for each kernel from module metadata
  forward_smem_bytes = self.meta[forward_name + "_smem_bytes"]
- backward_smem_bytes = self.meta[backward_name + "_smem_bytes"]
+ backward_smem_bytes = self.meta[backward_name + "_smem_bytes"] if options["enable_backward"] else 0

  # configure kernels maximum shared memory size
  max_smem_bytes = runtime.core.cuda_get_max_shared_memory(self.device.context)
@@ -1751,9 +1796,6 @@ class ModuleExec:
  f"Warning: Failed to configure kernel dynamic shared memory for this device, tried to configure {forward_name} kernel for {forward_smem_bytes} bytes, but maximum available is {max_smem_bytes}"
  )

- options = dict(kernel.module.options)
- options.update(kernel.options)
-
  if options["enable_backward"] and not runtime.core.cuda_configure_kernel_shared_memory(
  backward_kernel, backward_smem_bytes
  ):
@@ -1768,9 +1810,14 @@
  forward = (
  func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_forward").encode("utf-8"))) or None
  )
- backward = (
- func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8"))) or None
- )
+
+ if options["enable_backward"]:
+ backward = (
+ func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8")))
+ or None
+ )
+ else:
+ backward = None

  hooks = KernelHooks(forward, backward)

@@ -1803,13 +1850,13 @@ class Module:
  self._live_kernels = weakref.WeakSet()

  # executable modules currently loaded
- self.execs = {} # (device.context: ModuleExec)
+ self.execs = {} # ((device.context, blockdim): ModuleExec)

  # set of device contexts where the build has failed
  self.failed_builds = set()

- # hash data, including the module hash
- self.hasher = None
+ # hash data, including the module hash. Module may store multiple hashes (one per block_dim used)
+ self.hashers = {}

  # LLVM executable modules are identified using strings. Since it's possible for multiple
  # executable versions to be loaded at the same time, we need a way to ensure uniqueness.
@@ -1822,6 +1869,8 @@ class Module:
  "max_unroll": warp.config.max_unroll,
  "enable_backward": warp.config.enable_backward,
  "fast_math": False,
+ "fuse_fp": True,
+ "lineinfo": False,
  "cuda_output": None, # supported values: "ptx", "cubin", or None (automatic)
  "mode": warp.config.mode,
  "block_dim": 256,
@@ -1965,28 +2014,27 @@
  def hash_module(self):
  # compute latest hash
- self.hasher = ModuleHasher(self)
- return self.hasher.get_module_hash()
+ block_dim = self.options["block_dim"]
+ self.hashers[block_dim] = ModuleHasher(self)
+ return self.hashers[block_dim].get_module_hash()

  def load(self, device, block_dim=None) -> ModuleExec:
  device = runtime.get_device(device)

- # re-compile module if tile size (blockdim) changes
- # todo: it would be better to have a method such as `module.get_kernel(block_dim=N)`
- # that can return a single kernel instance with a given block size
+ # update module options if launching with a new block dim
  if block_dim is not None:
- if self.options["block_dim"] != block_dim:
- self.unload()
  self.options["block_dim"] = block_dim

+ active_block_dim = self.options["block_dim"]
+
  # compute the hash if needed
- if self.hasher is None:
- self.hasher = ModuleHasher(self)
+ if active_block_dim not in self.hashers:
+ self.hashers[active_block_dim] = ModuleHasher(self)

  # check if executable module is already loaded and not stale
- exec = self.execs.get(device.context)
+ exec = self.execs.get((device.context, active_block_dim))
  if exec is not None:
- if exec.module_hash == self.hasher.module_hash:
+ if exec.module_hash == self.hashers[active_block_dim].get_module_hash():
  return exec

  # quietly avoid repeated build attempts to reduce error spew
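Because the executable cache and the hashers are now keyed by block dimension as well as device context, alternating between block sizes no longer unloads and rebuilds the module; each (device, block_dim) pair keeps its own ModuleExec. A hedged sketch of what this enables at the wp.launch() level (the kernel is illustrative)::

    import warp as wp

    @wp.kernel
    def add_one(a: wp.array(dtype=float)):
        i = wp.tid()
        a[i] = a[i] + 1.0

    a = wp.zeros(1024, dtype=float, device="cuda:0")

    # each block size maps to its own cached executable, so switching
    # back and forth does not trigger repeated recompilation
    wp.launch(add_one, dim=a.size, inputs=[a], block_dim=128)
    wp.launch(add_one, dim=a.size, inputs=[a], block_dim=256)
    wp.launch(add_one, dim=a.size, inputs=[a], block_dim=128)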
@@ -1994,10 +2042,11 @@
  return None

  module_name = "wp_" + self.name
- module_hash = self.hasher.module_hash
+ module_hash = self.hashers[active_block_dim].get_module_hash()

  # use a unique module path using the module short hash
- module_dir = os.path.join(warp.config.kernel_cache_dir, f"{module_name}_{module_hash.hex()[:7]}")
+ module_name_short = f"{module_name}_{module_hash.hex()[:7]}"
+ module_dir = os.path.join(warp.config.kernel_cache_dir, module_name_short)

  with warp.ScopedTimer(
  f"Module {self.name} {module_hash.hex()[:7]} load on device '{device}'", active=not warp.config.quiet
@@ -2005,7 +2054,7 @@
  # -----------------------------------------------------------
  # determine output paths
  if device.is_cpu:
- output_name = "module_codegen.o"
+ output_name = f"{module_name_short}.o"
  output_arch = None

  elif device.is_cuda:
@@ -2025,10 +2074,10 @@

  if use_ptx:
  output_arch = min(device.arch, warp.config.ptx_target_arch)
- output_name = f"module_codegen.sm{output_arch}.ptx"
+ output_name = f"{module_name_short}.sm{output_arch}.ptx"
  else:
  output_arch = device.arch
- output_name = f"module_codegen.sm{output_arch}.cubin"
+ output_name = f"{module_name_short}.sm{output_arch}.cubin"

  # final object binary path
  binary_path = os.path.join(module_dir, output_name)
@@ -2050,7 +2099,7 @@
  # Some of the Tile codegen, such as cuFFTDx and cuBLASDx, requires knowledge of the target arch
  "output_arch": output_arch,
  }
- builder = ModuleBuilder(self, builder_options, hasher=self.hasher)
+ builder = ModuleBuilder(self, builder_options, hasher=self.hashers[active_block_dim])

  # create a temporary (process unique) dir for build outputs before moving to the binary dir
  build_dir = os.path.join(
@@ -2066,7 +2115,7 @@
  if device.is_cpu:
  # build
  try:
- source_code_path = os.path.join(build_dir, "module_codegen.cpp")
+ source_code_path = os.path.join(build_dir, f"{module_name_short}.cpp")

  # write cpp sources
  cpp_source = builder.codegen("cpu")
@@ -2084,6 +2133,7 @@
  mode=self.options["mode"],
  fast_math=self.options["fast_math"],
  verify_fp=warp.config.verify_fp,
+ fuse_fp=self.options["fuse_fp"],
  )

  except Exception as e:
@@ -2094,7 +2144,7 @@
  elif device.is_cuda:
  # build
  try:
- source_code_path = os.path.join(build_dir, "module_codegen.cu")
+ source_code_path = os.path.join(build_dir, f"{module_name_short}.cu")

  # write cuda sources
  cu_source = builder.codegen("cuda")
@@ -2111,9 +2161,12 @@
  output_arch,
  output_path,
  config=self.options["mode"],
- fast_math=self.options["fast_math"],
  verify_fp=warp.config.verify_fp,
+ fast_math=self.options["fast_math"],
+ fuse_fp=self.options["fuse_fp"],
+ lineinfo=self.options["lineinfo"],
  ltoirs=builder.ltoirs.values(),
+ fatbins=builder.fatbins.values(),
  )

  except Exception as e:
@@ -2125,7 +2178,7 @@
  # build meta data

  meta = builder.build_meta()
- meta_path = os.path.join(build_dir, "module_codegen.meta")
+ meta_path = os.path.join(build_dir, f"{module_name_short}.meta")

  with open(meta_path, "w") as meta_file:
  json.dump(meta, meta_file)
@@ -2189,7 +2242,7 @@
  # -----------------------------------------------------------
  # Load CPU or CUDA binary

- meta_path = os.path.join(module_dir, "module_codegen.meta")
+ meta_path = os.path.join(module_dir, f"{module_name_short}.meta")
  with open(meta_path, "r") as meta_file:
  meta = json.load(meta_file)

@@ -2199,13 +2252,13 @@
  self.cpu_exec_id += 1
  runtime.llvm.load_obj(binary_path.encode("utf-8"), module_handle.encode("utf-8"))
  module_exec = ModuleExec(module_handle, module_hash, device, meta)
- self.execs[None] = module_exec
+ self.execs[(None, active_block_dim)] = module_exec

  elif device.is_cuda:
  cuda_module = warp.build.load_cuda(binary_path, device)
  if cuda_module is not None:
  module_exec = ModuleExec(cuda_module, module_hash, device, meta)
- self.execs[device.context] = module_exec
+ self.execs[(device.context, active_block_dim)] = module_exec
  else:
  module_load_timer.extra_msg = " (error)"
  raise Exception(f"Failed to load CUDA module '{self.name}'")
@@ -2227,14 +2280,14 @@

  def mark_modified(self):
  # clear hash data
- self.hasher = None
+ self.hashers = {}

  # clear build failures
  self.failed_builds = set()

  # lookup kernel entry points based on name, called after compilation / module load
  def get_kernel_hooks(self, kernel, device):
- module_exec = self.execs.get(device.context)
+ module_exec = self.execs.get((device.context, self.options["block_dim"]))
  if module_exec is not None:
  return module_exec.get_kernel_hooks(kernel)
  else:
@@ -2353,6 +2406,7 @@ class Event:
  DEFAULT = 0x0
  BLOCKING_SYNC = 0x1
  DISABLE_TIMING = 0x2
+ INTERPROCESS = 0x4

  def __new__(cls, *args, **kwargs):
  """Creates a new event instance."""
@@ -2360,7 +2414,9 @@ class Event:
  instance.owner = False
  return instance

- def __init__(self, device: "Devicelike" = None, cuda_event=None, enable_timing: bool = False):
+ def __init__(
+ self, device: "Devicelike" = None, cuda_event=None, enable_timing: bool = False, interprocess: bool = False
+ ):
  """Initializes the event on a CUDA device.

  Args:
@@ -2372,6 +2428,12 @@
  :func:`~warp.get_event_elapsed_time` can be used to measure the
  time between two events created with ``enable_timing=True`` and
  recorded onto streams.
+ interprocess: If ``True`` this event may be used as an interprocess event.
+
+ Raises:
+ RuntimeError: The event could not be created.
+ ValueError: The combination of ``enable_timing=True`` and
+ ``interprocess=True`` is not allowed.
  """

  device = get_device(device)
@@ -2386,11 +2448,48 @@
  flags = Event.Flags.DEFAULT
  if not enable_timing:
  flags |= Event.Flags.DISABLE_TIMING
+ if interprocess:
+ if enable_timing:
+ raise ValueError("The combination of 'enable_timing=True' and 'interprocess=True' is not allowed.")
+ flags |= Event.Flags.INTERPROCESS
+
  self.cuda_event = runtime.core.cuda_event_create(device.context, flags)
  if not self.cuda_event:
  raise RuntimeError(f"Failed to create event on device {device}")
  self.owner = True

+ def ipc_handle(self) -> bytes:
+ """Return a CUDA IPC handle of the event as a 64-byte ``bytes`` object.
+
+ The event must have been created with ``interprocess=True`` in order to
+ obtain a valid interprocess handle.
+
+ IPC is currently only supported on Linux.
+
+ Example:
+ Create an event and get its IPC handle::
+
+ e1 = wp.Event(interprocess=True)
+ event_handle = e1.ipc_handle()
+
+ Raises:
+ RuntimeError: Device does not support IPC.
+ """
+
+ if self.device.is_ipc_supported is not False:
+ # Allocate a buffer for the data (64-element char array)
+ ipc_handle_buffer = (ctypes.c_char * 64)()
+
+ warp.context.runtime.core.cuda_ipc_get_event_handle(self.device.context, self.cuda_event, ipc_handle_buffer)
+
+ if ipc_handle_buffer.raw == bytes(64):
+ warp.utils.warn("IPC event handle appears to be invalid. Was interprocess=True used?")
+
+ return ipc_handle_buffer.raw
+
+ else:
+ raise RuntimeError(f"Device {self.device} does not support IPC.")
+
  def __del__(self):
  if not self.owner:
  return
@@ -2538,23 +2637,27 @@ class Device:
  """A device to allocate Warp arrays and to launch kernels on.

  Attributes:
- ordinal: A Warp-specific integer label for the device. ``-1`` for CPU devices.
- name: A string label for the device. By default, CPU devices will be named according to the processor name,
+ ordinal (int): A Warp-specific label for the device. ``-1`` for CPU devices.
+ name (str): A label for the device. By default, CPU devices will be named according to the processor name,
  or ``"CPU"`` if the processor name cannot be determined.
- arch: An integer representing the compute capability version number calculated as
- ``10 * major + minor``. ``0`` for CPU devices.
- is_uva: A boolean indicating whether the device supports unified addressing.
+ arch (int): The compute capability version number calculated as ``10 * major + minor``.
+ ``0`` for CPU devices.
+ is_uva (bool): Indicates whether the device supports unified addressing.
  ``False`` for CPU devices.
- is_cubin_supported: A boolean indicating whether Warp's version of NVRTC can directly
+ is_cubin_supported (bool): Indicates whether Warp's version of NVRTC can directly
  generate CUDA binary files (cubin) for this device's architecture. ``False`` for CPU devices.
- is_mempool_supported: A boolean indicating whether the device supports using the
- ``cuMemAllocAsync`` and ``cuMemPool`` family of APIs for stream-ordered memory allocations. ``False`` for
- CPU devices.
- is_primary: A boolean indicating whether this device's CUDA context is also the
- device's primary context.
- uuid: A string representing the UUID of the CUDA device. The UUID is in the same format used by
- ``nvidia-smi -L``. ``None`` for CPU devices.
- pci_bus_id: A string identifier for the CUDA device in the format ``[domain]:[bus]:[device]``, in which
+ is_mempool_supported (bool): Indicates whether the device supports using the ``cuMemAllocAsync`` and
+ ``cuMemPool`` family of APIs for stream-ordered memory allocations. ``False`` for CPU devices.
+ is_ipc_supported (Optional[bool]): Indicates whether the device supports IPC.
+
+ - ``True`` if supported.
+ - ``False`` if not supported.
+ - ``None`` if IPC support could not be determined (e.g. CUDA 11).
+
+ is_primary (bool): Indicates whether this device's CUDA context is also the device's primary context.
+ uuid (str): The UUID of the CUDA device. The UUID is in the same format used by ``nvidia-smi -L``.
+ ``None`` for CPU devices.
+ pci_bus_id (str): An identifier for the CUDA device in the format ``[domain]:[bus]:[device]``, in which
  ``domain``, ``bus``, and ``device`` are all hexadecimal values. ``None`` for CPU devices.
  """

@@ -2587,6 +2690,7 @@ class Device:
  self.is_uva = False
  self.is_mempool_supported = False
  self.is_mempool_enabled = False
+ self.is_ipc_supported = False # TODO: Support IPC for CPU arrays
  self.is_cubin_supported = False
  self.uuid = None
  self.pci_bus_id = None
@@ -2602,8 +2706,14 @@
  # CUDA device
  self.name = runtime.core.cuda_device_get_name(ordinal).decode()
  self.arch = runtime.core.cuda_device_get_arch(ordinal)
- self.is_uva = runtime.core.cuda_device_is_uva(ordinal)
- self.is_mempool_supported = runtime.core.cuda_device_is_mempool_supported(ordinal)
+ self.is_uva = runtime.core.cuda_device_is_uva(ordinal) > 0
+ self.is_mempool_supported = runtime.core.cuda_device_is_mempool_supported(ordinal) > 0
+ if platform.system() == "Linux":
+ # Use None when IPC support cannot be determined
+ ipc_support_api_query = runtime.core.cuda_device_is_ipc_supported(ordinal)
+ self.is_ipc_supported = bool(ipc_support_api_query) if ipc_support_api_query >= 0 else None
+ else:
+ self.is_ipc_supported = False
  if warp.config.enable_mempools_at_init:
  # enable if supported
  self.is_mempool_enabled = self.is_mempool_supported
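Note that is_ipc_supported is tri-state after this change: True or False when the driver query succeeds, None when support cannot be determined (e.g. CUDA 11), and always False on non-Linux platforms. A small hedged sketch of how a caller might branch on it::

    import warp as wp

    device = wp.get_device("cuda:0")

    if device.is_ipc_supported:
        # supported: safe to create interprocess-capable events
        ev = wp.Event(device, interprocess=True)
    elif device.is_ipc_supported is None:
        # undetermined (e.g. CUDA 11): treat conservatively
        print(f"IPC support could not be determined on {device}")
    else:
        print(f"IPC is not supported on {device}")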
@@ -3084,6 +3194,9 @@ class Runtime:
  self.core.radix_sort_pairs_int_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
  self.core.radix_sort_pairs_int_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]

+ self.core.radix_sort_pairs_float_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
+ self.core.radix_sort_pairs_float_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
+
  self.core.runlength_encode_int_host.argtypes = [
  ctypes.c_uint64,
  ctypes.c_uint64,
@@ -3100,10 +3213,16 @@
  ]

  self.core.bvh_create_host.restype = ctypes.c_uint64
- self.core.bvh_create_host.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
+ self.core.bvh_create_host.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]

  self.core.bvh_create_device.restype = ctypes.c_uint64
- self.core.bvh_create_device.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
+ self.core.bvh_create_device.argtypes = [
+ ctypes.c_void_p,
+ ctypes.c_void_p,
+ ctypes.c_void_p,
+ ctypes.c_int,
+ ctypes.c_int,
+ ]

  self.core.bvh_destroy_host.argtypes = [ctypes.c_uint64]
  self.core.bvh_destroy_device.argtypes = [ctypes.c_uint64]
@@ -3119,6 +3238,7 @@
  ctypes.c_int,
  ctypes.c_int,
  ctypes.c_int,
+ ctypes.c_int,
  ]

  self.core.mesh_create_device.restype = ctypes.c_uint64
@@ -3130,6 +3250,7 @@
  ctypes.c_int,
  ctypes.c_int,
  ctypes.c_int,
+ ctypes.c_int,
  ]

  self.core.mesh_destroy_host.argtypes = [ctypes.c_uint64]
@@ -3367,6 +3488,8 @@
  self.core.cuda_device_is_uva.restype = ctypes.c_int
  self.core.cuda_device_is_mempool_supported.argtypes = [ctypes.c_int]
  self.core.cuda_device_is_mempool_supported.restype = ctypes.c_int
+ self.core.cuda_device_is_ipc_supported.argtypes = [ctypes.c_int]
+ self.core.cuda_device_is_ipc_supported.restype = ctypes.c_int
  self.core.cuda_device_set_mempool_release_threshold.argtypes = [ctypes.c_int, ctypes.c_uint64]
  self.core.cuda_device_set_mempool_release_threshold.restype = ctypes.c_int
  self.core.cuda_device_get_mempool_release_threshold.argtypes = [ctypes.c_int]
@@ -3420,6 +3543,22 @@
  self.core.cuda_set_mempool_access_enabled.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
  self.core.cuda_set_mempool_access_enabled.restype = ctypes.c_int

+ # inter-process communication
+ self.core.cuda_ipc_get_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
+ self.core.cuda_ipc_get_mem_handle.restype = None
+ self.core.cuda_ipc_open_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
+ self.core.cuda_ipc_open_mem_handle.restype = ctypes.c_void_p
+ self.core.cuda_ipc_close_mem_handle.argtypes = [ctypes.c_void_p]
+ self.core.cuda_ipc_close_mem_handle.restype = None
+ self.core.cuda_ipc_get_event_handle.argtypes = [
+ ctypes.c_void_p,
+ ctypes.c_void_p,
+ ctypes.POINTER(ctypes.c_char),
+ ]
+ self.core.cuda_ipc_get_event_handle.restype = None
+ self.core.cuda_ipc_open_event_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
+ self.core.cuda_ipc_open_event_handle.restype = ctypes.c_void_p
+
  self.core.cuda_stream_create.argtypes = [ctypes.c_void_p, ctypes.c_int]
  self.core.cuda_stream_create.restype = ctypes.c_void_p
  self.core.cuda_stream_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
@@ -3467,6 +3606,7 @@

  self.core.cuda_compile_program.argtypes = [
  ctypes.c_char_p, # cuda_src
+ ctypes.c_char_p, # program name
  ctypes.c_int, # arch
  ctypes.c_char_p, # include_dir
  ctypes.c_int, # num_cuda_include_dirs
@@ -3475,10 +3615,13 @@
  ctypes.c_bool, # verbose
  ctypes.c_bool, # verify_fp
  ctypes.c_bool, # fast_math
+ ctypes.c_bool, # fuse_fp
+ ctypes.c_bool, # lineinfo
  ctypes.c_char_p, # output_path
  ctypes.c_size_t, # num_ltoirs
  ctypes.POINTER(ctypes.c_char_p), # ltoirs
  ctypes.POINTER(ctypes.c_size_t), # ltoir_sizes
+ ctypes.POINTER(ctypes.c_int), # ltoir_input_types, each of type nvJitLinkInputType
  ]
  self.core.cuda_compile_program.restype = ctypes.c_size_t

@@ -3518,6 +3661,22 @@
  ]
  self.core.cuda_compile_dot.restype = ctypes.c_bool

+ self.core.cuda_compile_solver.argtypes = [
+ ctypes.c_char_p, # universal fatbin
+ ctypes.c_char_p, # lto
+ ctypes.c_char_p, # function name
+ ctypes.c_int, # num include dirs
+ ctypes.POINTER(ctypes.c_char_p), # include dirs
+ ctypes.c_char_p, # mathdx include dir
+ ctypes.c_int, # arch
+ ctypes.c_int, # M
+ ctypes.c_int, # N
+ ctypes.c_int, # precision
+ ctypes.c_int, # fill_mode
+ ctypes.c_int, # num threads
+ ]
+ self.core.cuda_compile_fft.restype = ctypes.c_bool
+
  self.core.cuda_load_module.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
  self.core.cuda_load_module.restype = ctypes.c_void_p

@@ -4868,6 +5027,40 @@ def from_numpy(
  )


+ def event_from_ipc_handle(handle, device: "Devicelike" = None) -> Event:
+ """Create an event from an IPC handle.
+
+ Args:
+ handle: The interprocess event handle for an existing CUDA event.
+ device (Devicelike): Device to associate with the array.
+
+ Returns:
+ An event created from the interprocess event handle ``handle``.
+
+ Raises:
+ RuntimeError: IPC is not supported on ``device``.
+ """
+
+ try:
+ # Performance note: try first, ask questions later
+ device = warp.context.runtime.get_device(device)
+ except Exception:
+ # Fallback to using the public API for retrieving the device,
+ # which takes take of initializing Warp if needed.
+ device = warp.context.get_device(device)
+
+ if device.is_ipc_supported is False:
+ raise RuntimeError(f"IPC is not supported on device {device}.")
+
+ event = Event(
+ device=device, cuda_event=warp.context.runtime.core.cuda_ipc_open_event_handle(device.context, handle)
+ )
+ # Events created from IPC handles must be freed with cuEventDestroy
+ event.owner = True
+
+ return event
+
+
  # given a kernel destination argument type and a value convert
  # to a c-type that can be passed to a kernel
  def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
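Together with Event.ipc_handle() above, event_from_ipc_handle() completes the round trip: one process exports a 64-byte handle, another opens it and synchronizes against the original event. A hedged sketch using multiprocessing, assuming the function is re-exported at the top level as wp.event_from_ipc_handle (it is defined in warp.context) and that IPC is available (Linux only)::

    import multiprocessing as mp

    import warp as wp


    def child(queue):
        wp.init()
        handle = queue.get()  # 64-byte handle produced by the parent
        ev = wp.event_from_ipc_handle(handle, device="cuda:0")
        # make the child's stream wait on work recorded in the parent
        wp.get_stream("cuda:0").wait_event(ev)


    if __name__ == "__main__":
        wp.init()
        ev = wp.Event("cuda:0", interprocess=True)
        wp.record_event(ev)

        ctx = mp.get_context("spawn")
        queue = ctx.Queue()
        proc = ctx.Process(target=child, args=(queue,))
        proc.start()
        queue.put(ev.ipc_handle())
        proc.join()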
@@ -4949,6 +5142,9 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):

  # try to convert to a value type (vec3, mat33, etc)
  elif issubclass(arg_type, ctypes.Array):
+ # simple value types don't have gradient arrays, but native built-in signatures still expect a non-null adjoint value of the correct type
+ if value is None and adjoint:
+ return arg_type(0)
  if warp.types.types_equal(type(value), arg_type):
  return value
  else:
@@ -4958,9 +5154,6 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
  except Exception as e:
  raise ValueError(f"Failed to convert argument for param {arg_name} to {type_str(arg_type)}") from e

- elif isinstance(value, bool):
- return ctypes.c_bool(value)
-
  elif isinstance(value, arg_type):
  try:
  # try to pack as a scalar type
@@ -4975,6 +5168,9 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
  ) from e

  else:
+ # scalar args don't have gradient arrays, but native built-in signatures still expect a non-null scalar adjoint
+ if value is None and adjoint:
+ return arg_type._type_(0)
  try:
  # try to pack as a scalar type
  if arg_type is warp.types.float16:
@@ -4992,8 +5188,23 @@
  # represents all data required for a kernel launch
  # so that launches can be replayed quickly, use `wp.launch(..., record_cmd=True)`
  class Launch:
+ """Represents all data required for a kernel launch so that launches can be replayed quickly.
+
+ Users should not directly instantiate this class, instead use
+ ``wp.launch(..., record_cmd=True)`` to record a launch.
+ """
+
  def __init__(
- self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None, max_blocks=0, block_dim=256
+ self,
+ kernel,
+ device: Device,
+ hooks: Optional[KernelHooks] = None,
+ params: Optional[Sequence[Any]] = None,
+ params_addr: Optional[Sequence[ctypes.c_void_p]] = None,
+ bounds: Optional[launch_bounds_t] = None,
+ max_blocks: int = 0,
+ block_dim: int = 256,
+ adjoint: bool = False,
  ):
  # retain the module executable so it doesn't get unloaded
  self.module_exec = kernel.module.load(device)
@@ -5006,13 +5217,14 @@

  # if not specified set a zero bound
  if not bounds:
- bounds = warp.types.launch_bounds_t(0)
+ bounds = launch_bounds_t(0)

  # if not specified then build a list of default value params for args
  if not params:
  params = []
  params.append(bounds)

+ # Pack forward parameters
  for a in kernel.adj.args:
  if isinstance(a.type, warp.types.array):
  params.append(a.type.__ctype__())
@@ -5021,6 +5233,18 @@
  else:
  params.append(pack_arg(kernel, a.type, a.label, 0, device, False))

+ # Pack adjoint parameters if adjoint=True
+ if adjoint:
+ for a in kernel.adj.args:
+ if isinstance(a.type, warp.types.array):
+ params.append(a.type.__ctype__())
+ elif isinstance(a.type, warp.codegen.Struct):
+ params.append(a.type().__ctype__())
+ else:
+ # For primitive types in adjoint mode, initialize with 0
+ params.append(pack_arg(kernel, a.type, a.label, 0, device, True))
+
+ # Create array of parameter addresses
  kernel_args = [ctypes.c_void_p(ctypes.addressof(x)) for x in params]
  kernel_params = (ctypes.c_void_p * len(kernel_args))(*kernel_args)

@@ -5030,13 +5254,30 @@
  self.hooks = hooks
  self.params = params
  self.params_addr = params_addr
- self.device = device
- self.bounds = bounds
- self.max_blocks = max_blocks
- self.block_dim = block_dim
+ self.device: Device = device
+ """The device to launch on.
+ This should not be changed after the launch object is created.
+ """
+
+ self.bounds: launch_bounds_t = bounds
+ """The launch bounds. Update with :meth:`set_dim`."""

- def set_dim(self, dim):
- self.bounds = warp.types.launch_bounds_t(dim)
+ self.max_blocks: int = max_blocks
+ """The maximum number of CUDA thread blocks to use."""
+
+ self.block_dim: int = block_dim
+ """The number of threads per block."""
+
+ self.adjoint: bool = adjoint
+ """Whether to run the adjoint kernel instead of the forward kernel."""
+
+ def set_dim(self, dim: Union[int, List[int], Tuple[int, ...]]):
+ """Set the launch dimensions.
+
+ Args:
+ dim: The dimensions of the launch.
+ """
+ self.bounds = launch_bounds_t(dim)

  # launch bounds always at index 0
  self.params[0] = self.bounds
@@ -5045,22 +5286,36 @@
  if self.params_addr:
  self.params_addr[0] = ctypes.c_void_p(ctypes.addressof(self.bounds))

- # set kernel param at an index, will convert to ctype as necessary
- def set_param_at_index(self, index, value):
+ def set_param_at_index(self, index: int, value: Any, adjoint: bool = False):
+ """Set a kernel parameter at an index.
+
+ Args:
+ index: The index of the param to set.
+ value: The value to set the param to.
+ """
  arg_type = self.kernel.adj.args[index].type
  arg_name = self.kernel.adj.args[index].label

- carg = pack_arg(self.kernel, arg_type, arg_name, value, self.device, False)
+ carg = pack_arg(self.kernel, arg_type, arg_name, value, self.device, adjoint)

- self.params[index + 1] = carg
+ if adjoint:
+ params_index = index + len(self.kernel.adj.args) + 1
+ else:
+ params_index = index + 1
+
+ self.params[params_index] = carg

  # for CUDA kernels we need to update the address to each arg
  if self.params_addr:
- self.params_addr[index + 1] = ctypes.c_void_p(ctypes.addressof(carg))
+ self.params_addr[params_index] = ctypes.c_void_p(ctypes.addressof(carg))
+
+ def set_param_at_index_from_ctype(self, index: int, value: Union[ctypes.Structure, int, float]):
+ """Set a kernel parameter at an index without any type conversion.

- # set kernel param at an index without any type conversion
- # args must be passed as ctypes or basic int / float types
- def set_param_at_index_from_ctype(self, index, value):
+ Args:
+ index: The index of the param to set.
+ value: The value to set the param to.
+ """
  if isinstance(value, ctypes.Structure):
  # not sure how to directly assign struct->struct without reallocating using ctypes
  self.params[index + 1] = value
@@ -5072,32 +5327,62 @@
  else:
  self.params[index + 1].__init__(value)

- # set kernel param by argument name
- def set_param_by_name(self, name, value):
+ def set_param_by_name(self, name: str, value: Any, adjoint: bool = False):
+ """Set a kernel parameter by argument name.
+
+ Args:
+ name: The name of the argument to set.
+ value: The value to set the argument to.
+ adjoint: If ``True``, set the adjoint of this parameter instead of the forward parameter.
+ """
  for i, arg in enumerate(self.kernel.adj.args):
  if arg.label == name:
- self.set_param_at_index(i, value)
+ self.set_param_at_index(i, value, adjoint)
+ return

- # set kernel param by argument name with no type conversions
- def set_param_by_name_from_ctype(self, name, value):
+ raise ValueError(f"Argument '{name}' not found in kernel '{self.kernel.key}'")
+
+ def set_param_by_name_from_ctype(self, name: str, value: ctypes.Structure):
+ """Set a kernel parameter by argument name with no type conversions.
+
+ Args:
+ name: The name of the argument to set.
+ value: The value to set the argument to.
+ """
  # lookup argument index
  for i, arg in enumerate(self.kernel.adj.args):
  if arg.label == name:
  self.set_param_at_index_from_ctype(i, value)

- # set all params
- def set_params(self, values):
+ def set_params(self, values: Sequence[Any]):
+ """Set all parameters.
+
+ Args:
+ values: A list of values to set the params to.
+ """
  for i, v in enumerate(values):
  self.set_param_at_index(i, v)

- # set all params without performing type-conversions
- def set_params_from_ctypes(self, values):
+ def set_params_from_ctypes(self, values: Sequence[ctypes.Structure]):
+ """Set all parameters without performing type-conversions.
+
+ Args:
+ values: A list of ctypes or basic int / float types.
+ """
  for i, v in enumerate(values):
  self.set_param_at_index_from_ctype(i, v)

- def launch(self, stream=None) -> Any:
+ def launch(self, stream: Optional[Stream] = None) -> None:
+ """Launch the kernel.
+
+ Args:
+ stream: The stream to launch on.
+ """
  if self.device.is_cpu:
- self.hooks.forward(*self.params)
+ if self.adjoint:
+ self.hooks.backward(*self.params)
+ else:
+ self.hooks.forward(*self.params)
  else:
  if stream is None:
  stream = self.device.stream
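With the adjoint-aware parameter slots and the documented set_param_* methods above, a launch can be recorded once and then replayed with updated arguments, which is the intended use of wp.launch(..., record_cmd=True). A hedged sketch following the docstrings in this hunk (the kernel and values are illustrative)::

    import warp as wp

    @wp.kernel
    def scale(a: wp.array(dtype=float), s: float):
        i = wp.tid()
        a[i] = a[i] * s

    a = wp.ones(1024, dtype=float, device="cuda:0")

    # record the launch instead of executing it immediately
    cmd = wp.launch(scale, dim=a.size, inputs=[a, 2.0], record_cmd=True)

    # replay it several times, updating only the scalar argument by name
    for s in (2.0, 3.0, 4.0):
        cmd.set_param_by_name("s", s)
        cmd.launch()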
@@ -5110,32 +5395,44 @@
  if graph is not None:
  graph.retain_module_exec(self.module_exec)

- runtime.core.cuda_launch_kernel(
- self.device.context,
- self.hooks.forward,
- self.bounds.size,
- self.max_blocks,
- self.block_dim,
- self.hooks.forward_smem_bytes,
- self.params_addr,
- stream.cuda_stream,
- )
+ if self.adjoint:
+ runtime.core.cuda_launch_kernel(
+ self.device.context,
+ self.hooks.backward,
+ self.bounds.size,
+ self.max_blocks,
+ self.block_dim,
+ self.hooks.backward_smem_bytes,
+ self.params_addr,
+ stream.cuda_stream,
+ )
+ else:
+ runtime.core.cuda_launch_kernel(
+ self.device.context,
+ self.hooks.forward,
+ self.bounds.size,
+ self.max_blocks,
+ self.block_dim,
+ self.hooks.forward_smem_bytes,
+ self.params_addr,
+ stream.cuda_stream,
+ )


  def launch(
  kernel,
- dim: Tuple[int],
+ dim: Union[int, Sequence[int]],
  inputs: Sequence = [],
  outputs: Sequence = [],
  adj_inputs: Sequence = [],
  adj_outputs: Sequence = [],
  device: Devicelike = None,
- stream: Stream = None,
- adjoint=False,
- record_tape=True,
- record_cmd=False,
- max_blocks=0,
- block_dim=256,
+ stream: Optional[Stream] = None,
+ adjoint: bool = False,
+ record_tape: bool = True,
+ record_cmd: bool = False,
+ max_blocks: int = 0,
+ block_dim: int = 256,
  ):
  """Launch a Warp kernel on the target device

@@ -5143,18 +5440,23 @@ def launch(

  Args:
  kernel: The name of a Warp kernel function, decorated with the ``@wp.kernel`` decorator
- dim: The number of threads to launch the kernel, can be an integer, or a Tuple of ints with max of 4 dimensions
+ dim: The number of threads to launch the kernel, can be an integer or a
+ sequence of integers with a maximum of 4 dimensions.
  inputs: The input parameters to the kernel (optional)
  outputs: The output parameters (optional)
  adj_inputs: The adjoint inputs (optional)
  adj_outputs: The adjoint outputs (optional)
- device: The device to launch on (optional)
- stream: The stream to launch on (optional)
- adjoint: Whether to run forward or backward pass (typically use False)
- record_tape: When true the launch will be recorded the global wp.Tape() object when present
- record_cmd: When True the launch will be returned as a ``Launch`` command object, the launch will not occur until the user calls ``cmd.launch()``
- max_blocks: The maximum number of CUDA thread blocks to use. Only has an effect for CUDA kernel launches.
- If negative or zero, the maximum hardware value will be used.
+ device: The device to launch on.
+ stream: The stream to launch on.
+ adjoint: Whether to run forward or backward pass (typically use ``False``).
+ record_tape: When ``True``, the launch will be recorded the global
+ :class:`wp.Tape() <warp.Tape>` object when present.
+ record_cmd: When ``True``, the launch will return a :class:`Launch`
+ object. The launch will not occur until the user calls
+ :meth:`Launch.launch()`.
+ max_blocks: The maximum number of CUDA thread blocks to use.
+ Only has an effect for CUDA kernel launches.
+ If negative or zero, the maximum hardware value will be used.
  block_dim: The number of threads per block.
  """

@@ -5175,7 +5477,7 @@
  print(f"kernel: {kernel.key} dim: {dim} inputs: {inputs} outputs: {outputs} device: {device}")

  # construct launch bounds
- bounds = warp.types.launch_bounds_t(dim)
+ bounds = launch_bounds_t(dim)

  if bounds.size > 0:
  # first param is the number of threads
@@ -5232,6 +5534,17 @@
  f"Failed to find backward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
  )

+ if record_cmd:
+ launch = Launch(
+ kernel=kernel,
+ hooks=hooks,
+ params=params,
+ params_addr=None,
+ bounds=bounds,
+ device=device,
+ adjoint=adjoint,
+ )
+ return launch
  hooks.backward(*params)

  else:
@@ -5242,7 +5555,13 @@

  if record_cmd:
  launch = Launch(
- kernel=kernel, hooks=hooks, params=params, params_addr=None, bounds=bounds, device=device
+ kernel=kernel,
+ hooks=hooks,
+ params=params,
+ params_addr=None,
+ bounds=bounds,
+ device=device,
+ adjoint=adjoint,
  )
  return launch
  else:
@@ -5269,16 +5588,30 @@
  f"Failed to find backward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
  )

- runtime.core.cuda_launch_kernel(
- device.context,
- hooks.backward,
- bounds.size,
- max_blocks,
- block_dim,
- hooks.backward_smem_bytes,
- kernel_params,
- stream.cuda_stream,
- )
+ if record_cmd:
+ launch = Launch(
+ kernel=kernel,
+ hooks=hooks,
+ params=params,
+ params_addr=kernel_params,
+ bounds=bounds,
+ device=device,
+ max_blocks=max_blocks,
+ block_dim=block_dim,
+ adjoint=adjoint,
+ )
+ return launch
+ else:
+ runtime.core.cuda_launch_kernel(
+ device.context,
+ hooks.backward,
+ bounds.size,
+ max_blocks,
+ block_dim,
+ hooks.backward_smem_bytes,
+ kernel_params,
+ stream.cuda_stream,
+ )

  else:
  if hooks.forward is None:
@@ -5298,7 +5631,6 @@
  block_dim=block_dim,
  )
  return launch
-
  else:
  # launch
  runtime.core.cuda_launch_kernel(
@@ -6034,14 +6366,19 @@ def export_functions_rst(file): # pragma: no cover
  # build dictionary of all functions by group
  groups = {}

- for _k, f in builtin_functions.items():
+ functions = list(builtin_functions.values())
+
+ for f in functions:
  # build dict of groups
  if f.group not in groups:
  groups[f.group] = []

- # append all overloads to the group
- for o in f.overloads:
- groups[f.group].append(o)
+ if hasattr(f, "overloads"):
+ # append all overloads to the group
+ for o in f.overloads:
+ groups[f.group].append(o)
+ else:
+ groups[f.group].append(f)

  # Keep track of what function and query types have been written
  written_functions = set()
@@ -6061,6 +6398,10 @@ def export_functions_rst(file): # pragma: no cover
  print("---------------", file=file)

  for f in g:
+ if f.func:
+ # f is a Warp function written in Python, we can use autofunction
+ print(f".. autofunction:: {f.func.__module__}.{f.key}", file=file)
+ continue
  for f_prefix, query_type in query_types:
  if f.key.startswith(f_prefix) and query_type not in written_query_types:
  print(f".. autoclass:: {query_type}", file=file)
@@ -6118,24 +6459,32 @@ def export_stubs(file): # pragma: no cover
  print(header, file=file)
  print(file=file)

- for k, g in builtin_functions.items():
- for f in g.overloads:
- args = ", ".join(f"{k}: {type_str(v)}" for k, v in f.input_types.items())
+ def add_stub(f):
+ args = ", ".join(f"{k}: {type_str(v)}" for k, v in f.input_types.items())

- return_str = ""
+ return_str = ""

- if f.hidden: # or f.generic:
- continue
+ if f.hidden: # or f.generic:
+ return

+ return_type = f.value_type
+ if f.value_func:
  return_type = f.value_func(None, None)
- if return_type:
- return_str = " -> " + type_str(return_type)
-
- print("@over", file=file)
- print(f"def {f.key}({args}){return_str}:", file=file)
- print(f' """{f.doc}', file=file)
- print(' """', file=file)
- print(" ...\n\n", file=file)
+ if return_type:
+ return_str = " -> " + type_str(return_type)
+
+ print("@over", file=file)
+ print(f"def {f.key}({args}){return_str}:", file=file)
+ print(f' """{f.doc}', file=file)
+ print(' """', file=file)
+ print(" ...\n\n", file=file)
+
+ for g in builtin_functions.values():
+ if hasattr(g, "overloads"):
+ for f in g.overloads:
+ add_stub(f)
+ else:
+ add_stub(g)


  def export_builtins(file: io.TextIOBase): # pragma: no cover
@@ -6161,6 +6510,8 @@ def export_builtins(file: io.TextIOBase): # pragma: no cover
  file.write('extern "C" {\n\n')

  for k, g in builtin_functions.items():
+ if not hasattr(g, "overloads"):
+ continue
  for f in g.overloads:
  if not f.export or f.generic:
  continue