warp-lang 1.5.0__py3-none-macosx_10_13_universal2.whl → 1.6.0__py3-none-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/__init__.py +5 -0
- warp/autograd.py +414 -191
- warp/bin/libwarp-clang.dylib +0 -0
- warp/bin/libwarp.dylib +0 -0
- warp/build.py +40 -12
- warp/build_dll.py +13 -6
- warp/builtins.py +1124 -497
- warp/codegen.py +261 -136
- warp/config.py +1 -1
- warp/context.py +357 -119
- warp/examples/assets/square_cloth.usd +0 -0
- warp/examples/benchmarks/benchmark_gemm.py +27 -18
- warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
- warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
- warp/examples/core/example_torch.py +18 -34
- warp/examples/fem/example_apic_fluid.py +1 -0
- warp/examples/fem/example_mixed_elasticity.py +1 -1
- warp/examples/optim/example_bounce.py +1 -1
- warp/examples/optim/example_cloth_throw.py +1 -1
- warp/examples/optim/example_diffray.py +4 -15
- warp/examples/optim/example_drone.py +1 -1
- warp/examples/optim/example_softbody_properties.py +392 -0
- warp/examples/optim/example_trajectory.py +1 -3
- warp/examples/optim/example_walker.py +5 -0
- warp/examples/sim/example_cartpole.py +0 -2
- warp/examples/sim/example_cloth.py +3 -1
- warp/examples/sim/example_cloth_self_contact.py +260 -0
- warp/examples/sim/example_granular_collision_sdf.py +4 -5
- warp/examples/sim/example_jacobian_ik.py +0 -2
- warp/examples/sim/example_quadruped.py +5 -2
- warp/examples/tile/example_tile_cholesky.py +79 -0
- warp/examples/tile/example_tile_convolution.py +2 -2
- warp/examples/tile/example_tile_fft.py +2 -2
- warp/examples/tile/example_tile_filtering.py +3 -3
- warp/examples/tile/example_tile_matmul.py +4 -4
- warp/examples/tile/example_tile_mlp.py +12 -12
- warp/examples/tile/example_tile_nbody.py +180 -0
- warp/examples/tile/example_tile_walker.py +319 -0
- warp/fem/geometry/geometry.py +0 -2
- warp/math.py +147 -0
- warp/native/array.h +12 -0
- warp/native/builtin.h +0 -1
- warp/native/bvh.cpp +149 -70
- warp/native/bvh.cu +287 -68
- warp/native/bvh.h +195 -85
- warp/native/clang/clang.cpp +5 -1
- warp/native/coloring.cpp +5 -1
- warp/native/cuda_util.cpp +91 -53
- warp/native/cuda_util.h +5 -0
- warp/native/exports.h +40 -40
- warp/native/intersect.h +17 -0
- warp/native/mat.h +41 -0
- warp/native/mathdx.cpp +19 -0
- warp/native/mesh.cpp +25 -8
- warp/native/mesh.cu +153 -101
- warp/native/mesh.h +482 -403
- warp/native/quat.h +40 -0
- warp/native/solid_angle.h +7 -0
- warp/native/sort.cpp +85 -0
- warp/native/sort.cu +34 -0
- warp/native/sort.h +3 -1
- warp/native/spatial.h +11 -0
- warp/native/tile.h +1187 -669
- warp/native/tile_reduce.h +8 -6
- warp/native/vec.h +41 -0
- warp/native/warp.cpp +8 -1
- warp/native/warp.cu +263 -40
- warp/native/warp.h +19 -5
- warp/optim/linear.py +22 -4
- warp/render/render_opengl.py +130 -64
- warp/sim/__init__.py +6 -1
- warp/sim/collide.py +270 -26
- warp/sim/import_urdf.py +8 -8
- warp/sim/integrator_euler.py +25 -7
- warp/sim/integrator_featherstone.py +154 -35
- warp/sim/integrator_vbd.py +842 -40
- warp/sim/model.py +134 -72
- warp/sparse.py +1 -1
- warp/stubs.py +265 -132
- warp/tape.py +28 -30
- warp/tests/aux_test_module_unload.py +15 -0
- warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
- warp/tests/test_array.py +74 -0
- warp/tests/test_assert.py +242 -0
- warp/tests/test_codegen.py +14 -61
- warp/tests/test_collision.py +2 -2
- warp/tests/test_coloring.py +12 -2
- warp/tests/test_examples.py +12 -1
- warp/tests/test_func.py +21 -4
- warp/tests/test_grad_debug.py +87 -2
- warp/tests/test_hash_grid.py +1 -1
- warp/tests/test_ipc.py +116 -0
- warp/tests/test_lerp.py +13 -87
- warp/tests/test_mat.py +138 -167
- warp/tests/test_math.py +47 -1
- warp/tests/test_matmul.py +17 -16
- warp/tests/test_matmul_lite.py +10 -15
- warp/tests/test_mesh.py +84 -60
- warp/tests/test_mesh_query_aabb.py +165 -0
- warp/tests/test_mesh_query_point.py +328 -286
- warp/tests/test_mesh_query_ray.py +134 -121
- warp/tests/test_mlp.py +2 -2
- warp/tests/test_operators.py +43 -0
- warp/tests/test_overwrite.py +47 -2
- warp/tests/test_quat.py +77 -0
- warp/tests/test_reload.py +29 -0
- warp/tests/test_sim_grad_bounce_linear.py +204 -0
- warp/tests/test_smoothstep.py +17 -83
- warp/tests/test_static.py +19 -3
- warp/tests/test_tape.py +25 -0
- warp/tests/test_tile.py +178 -191
- warp/tests/test_tile_load.py +356 -0
- warp/tests/test_tile_mathdx.py +61 -8
- warp/tests/test_tile_mlp.py +17 -17
- warp/tests/test_tile_reduce.py +24 -18
- warp/tests/test_tile_shared_memory.py +66 -17
- warp/tests/test_tile_view.py +165 -0
- warp/tests/test_torch.py +35 -0
- warp/tests/test_utils.py +36 -24
- warp/tests/test_vec.py +110 -0
- warp/tests/unittest_suites.py +29 -4
- warp/tests/unittest_utils.py +30 -13
- warp/thirdparty/unittest_parallel.py +2 -2
- warp/types.py +411 -101
- warp/utils.py +10 -7
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/METADATA +92 -69
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/RECORD +130 -119
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/WHEEL +1 -1
- warp/examples/benchmarks/benchmark_tile.py +0 -179
- warp/native/tile_gemm.h +0 -341
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/top_level.txt +0 -0
warp/context.py
CHANGED
```diff
@@ -5,8 +5,11 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
+from __future__ import annotations
+
 import ast
 import ctypes
+import errno
 import functools
 import hashlib
 import inspect
@@ -17,6 +20,7 @@ import operator
 import os
 import platform
 import sys
+import time
 import types
 import typing
 import weakref
@@ -238,24 +242,23 @@ class Function:
         # in a way that is compatible with Python's semantics.
         signature_params = []
         signature_default_param_kind = inspect.Parameter.POSITIONAL_OR_KEYWORD
-        for …
-            if …
-                param_name = …
+        for raw_param_name in self.input_types.keys():
+            if raw_param_name.startswith("**"):
+                param_name = raw_param_name[2:]
                 param_kind = inspect.Parameter.VAR_KEYWORD
-            elif …
-                param_name = …
+            elif raw_param_name.startswith("*"):
+                param_name = raw_param_name[1:]
                 param_kind = inspect.Parameter.VAR_POSITIONAL
 
                 # Once a variadic argument like `*args` is found, any following
                 # arguments need to be passed using keywords.
                 signature_default_param_kind = inspect.Parameter.KEYWORD_ONLY
             else:
+                param_name = raw_param_name
                 param_kind = signature_default_param_kind
 
-            param = …
-                param_name,
-                param_kind,
-                default=self.defaults.get(param_name, inspect.Parameter.empty),
+            param = inspect.Parameter(
+                param_name, param_kind, default=self.defaults.get(param_name, inspect.Parameter.empty)
             )
             signature_params.append(param)
         self.signature = inspect.Signature(signature_params)
@@ -294,22 +297,22 @@ class Function:
 
         if hasattr(self, "user_overloads") and len(self.user_overloads):
             # user-defined function with overloads
+            bound_args = self.signature.bind(*args, **kwargs)
+            if self.defaults:
+                warp.codegen.apply_defaults(bound_args, self.defaults)
 
-            …
-                raise RuntimeError(
-                    f"Error calling function '{self.key}', keyword arguments are not supported for user-defined overloads."
-                )
+            arguments = tuple(bound_args.arguments.values())
 
             # try and find a matching overload
             for overload in self.user_overloads.values():
-                if len(overload.input_types) != len(…
+                if len(overload.input_types) != len(arguments):
                     continue
                 template_types = list(overload.input_types.values())
                 arg_names = list(overload.input_types.keys())
                 try:
                     # attempt to unify argument types with function template types
-                    warp.types.infer_argument_types(…
-                    return overload.func(*…
+                    warp.types.infer_argument_types(arguments, template_types, arg_names)
+                    return overload.func(*arguments)
                 except Exception:
                     continue
 
@@ -392,7 +395,8 @@ class Function:
             if not warp.codegen.func_match_args(f, arg_types, kwarg_types):
                 continue
 
-            …
+            acceptable_arg_num = len(f.input_types) - len(f.defaults) <= len(arg_types) <= len(f.input_types)
+            if not acceptable_arg_num:
                 continue
 
             # try to match the given types to the function template types
@@ -409,6 +413,10 @@ class Function:
 
             arg_names = f.input_types.keys()
             overload_annotations = dict(zip(arg_names, arg_types))
+            # add defaults
+            for k, d in f.defaults.items():
+                if k not in overload_annotations:
+                    overload_annotations[k] = warp.codegen.strip_reference(warp.codegen.get_arg_type(d))
 
             ovl = shallowcopy(f)
             ovl.adj = warp.codegen.Adjoint(f.func, overload_annotations)
@@ -509,11 +517,10 @@ def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
             if elem_count != arg_type._length_:
                 return (False, None)
 
-            # Retrieve the element type of the sequence while ensuring
-            # that it's homogeneous.
+            # Retrieve the element type of the sequence while ensuring that it's homogeneous.
             elem_type = type(arr[0])
-            for …
-                if type(arr[…
+            for array_index in range(1, elem_count):
+                if type(arr[array_index]) is not elem_type:
                     raise ValueError("All array elements must share the same type.")
 
             expected_elem_type = arg_type._wp_scalar_type_
@@ -543,10 +550,10 @@ def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
             c_param = arg_type()
             if warp.types.type_is_matrix(arg_type):
                 rows, cols = arg_type._shape_
-                for …
-                    idx_start = …
+                for row_index in range(rows):
+                    idx_start = row_index * cols
                     idx_end = idx_start + cols
-                    c_param[…
+                    c_param[row_index] = arr[idx_start:idx_end]
             else:
                 c_param[:] = arr
 
@@ -753,8 +760,15 @@ def func(f):
     scope_locals = inspect.currentframe().f_back.f_locals
 
     m = get_module(f.__module__)
+    doc = getattr(f, "__doc__", "") or ""
     Function(
-        func=f, …
+        func=f,
+        key=name,
+        namespace="",
+        module=m,
+        value_func=None,
+        scope_locals=scope_locals,
+        doc=doc.strip(),
     )  # value_type not known yet, will be inferred during Adjoint.build()
 
     # use the top of the list of overloads for this key
@@ -1059,7 +1073,8 @@ def overload(kernel, arg_types=Union[None, Dict[str, Any], List[Any]]):
     raise RuntimeError("wp.overload() called with invalid argument!")
 
 
-…
+# native functions that are part of the Warp API
+builtin_functions: Dict[str, Function] = {}
 
 
 def get_generic_vtypes():
@@ -1239,16 +1254,16 @@ def add_builtin(
         typelists.append(l)
 
     for arg_types in itertools.product(*typelists):
-        …
+        concrete_arg_types = dict(zip(input_types.keys(), arg_types))
 
         # Some of these argument lists won't work, eg if the function is mul(), we won't be
         # able to do a matrix vector multiplication for a mat22 and a vec3. The `constraint`
         # function determines which combinations are valid:
        if constraint:
-            if constraint(…
+            if constraint(concrete_arg_types) is False:
                 continue
 
-        return_type = value_func(…
+        return_type = value_func(concrete_arg_types, None)
 
         # The return_type might just be vector_t(length=3,dtype=wp.float32), so we've got to match that
         # in the list of hard coded types so it knows it's returning one of them:
@@ -1266,7 +1281,7 @@ def add_builtin(
         # finally we can generate a function call for these concrete types:
         add_builtin(
             key,
-            input_types=…
+            input_types=concrete_arg_types,
             value_type=return_type,
             value_func=value_func if return_type is Any else None,
             export_func=export_func,
@@ -1328,6 +1343,28 @@ def add_builtin(
     setattr(warp, key, func)
 
 
+def register_api_function(
+    function: Function,
+    group: str = "Other",
+    hidden=False,
+):
+    """Main entry point to register a Warp Python function to be part of the Warp API and appear in the documentation.
+
+    Args:
+        function (Function): Warp function to be registered.
+        group (str): Classification used for the documentation.
+        input_types (Mapping[str, Any]): Signature of the user-facing function.
+            Variadic arguments are supported by prefixing the parameter names
+            with asterisks as in `*args` and `**kwargs`. Generic arguments are
+            supported with types such as `Any`, `Float`, `Scalar`, etc.
+        value_type (Any): Type returned by the function.
+        hidden (bool): Whether to add that function into the documentation.
+    """
+    function.group = group
+    function.hidden = hidden
+    builtin_functions[function.key] = function
+
+
 # global dictionary of modules
 user_modules = {}
 
```
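The new `register_api_function` hook is what lets Python-level helpers (for instance the routines added in `warp/math.py` in this release) show up in the generated documentation and stubs alongside the native builtins. The sketch below is illustrative only: it assumes that `@wp.func` hands back the `warp.context.Function` object it creates, and the function name is hypothetical.

```python
import warp as wp
from warp.context import register_api_function


# Hypothetical helper; any Function object produced by @wp.func works the same way.
@wp.func
def smoothstep01(x: float):
    # clamp to [0, 1] and apply the cubic smoothstep polynomial
    t = wp.clamp(x, 0.0, 1.0)
    return t * t * (3.0 - 2.0 * t)


# Group the function under a documentation category; passing hidden=True would
# register it without emitting it into the generated .rst/stub output.
register_api_function(smoothstep01, group="Utility")
```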
```diff
@@ -1561,6 +1598,7 @@ class ModuleBuilder:
         self.options = options
         self.module = module
         self.deferred_functions = []
+        self.fatbins = {}  # map from <some identifier> to fatbins, to add at link time
         self.ltoirs = {}  # map from lto symbol to lto binary
         self.ltoirs_decl = {}  # map from lto symbol to lto forward declaration
 
@@ -1675,7 +1713,7 @@ class ModuleBuilder:
 
         for kernel in self.kernels:
             source += warp.codegen.codegen_kernel(kernel, device=device, options=self.options)
-            source += warp.codegen.codegen_module(kernel, device=device)
+            source += warp.codegen.codegen_module(kernel, device=device, options=self.options)
 
         # add headers
         if device == "cpu":
@@ -1728,20 +1766,26 @@ class ModuleExec:
 
             name = kernel.get_mangled_name()
 
+            options = dict(kernel.module.options)
+            options.update(kernel.options)
+
             if self.device.is_cuda:
                 forward_name = name + "_cuda_kernel_forward"
                 forward_kernel = runtime.core.cuda_get_kernel(
                     self.device.context, self.handle, forward_name.encode("utf-8")
                 )
 
-                …
-                …
-                …
-                …
+                if options["enable_backward"]:
+                    backward_name = name + "_cuda_kernel_backward"
+                    backward_kernel = runtime.core.cuda_get_kernel(
+                        self.device.context, self.handle, backward_name.encode("utf-8")
+                    )
+                else:
+                    backward_kernel = None
 
                 # look up the required shared memory size for each kernel from module metadata
                 forward_smem_bytes = self.meta[forward_name + "_smem_bytes"]
-                backward_smem_bytes = self.meta[backward_name + "_smem_bytes"]
+                backward_smem_bytes = self.meta[backward_name + "_smem_bytes"] if options["enable_backward"] else 0
 
                 # configure kernels maximum shared memory size
                 max_smem_bytes = runtime.core.cuda_get_max_shared_memory(self.device.context)
@@ -1751,9 +1795,6 @@ class ModuleExec:
                         f"Warning: Failed to configure kernel dynamic shared memory for this device, tried to configure {forward_name} kernel for {forward_smem_bytes} bytes, but maximum available is {max_smem_bytes}"
                     )
 
-                options = dict(kernel.module.options)
-                options.update(kernel.options)
-
                 if options["enable_backward"] and not runtime.core.cuda_configure_kernel_shared_memory(
                     backward_kernel, backward_smem_bytes
                 ):
@@ -1768,9 +1809,14 @@ class ModuleExec:
                 forward = (
                     func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_forward").encode("utf-8"))) or None
                 )
-                …
-                …
-                …
+
+                if options["enable_backward"]:
+                    backward = (
+                        func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8")))
+                        or None
+                    )
+                else:
+                    backward = None
 
                 hooks = KernelHooks(forward, backward)
 
@@ -1803,13 +1849,13 @@ class Module:
         self._live_kernels = weakref.WeakSet()
 
         # executable modules currently loaded
-        self.execs = {}  # (device.context: ModuleExec)
+        self.execs = {}  # ((device.context, blockdim): ModuleExec)
 
         # set of device contexts where the build has failed
         self.failed_builds = set()
 
-        # hash data, including the module hash
-        self.…
+        # hash data, including the module hash. Module may store multiple hashes (one per block_dim used)
+        self.hashers = {}
 
         # LLVM executable modules are identified using strings. Since it's possible for multiple
         # executable versions to be loaded at the same time, we need a way to ensure uniqueness.
@@ -1822,6 +1868,8 @@ class Module:
             "max_unroll": warp.config.max_unroll,
             "enable_backward": warp.config.enable_backward,
             "fast_math": False,
+            "fuse_fp": True,
+            "lineinfo": False,
             "cuda_output": None,  # supported values: "ptx", "cubin", or None (automatic)
             "mode": warp.config.mode,
             "block_dim": 256,
```
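The new per-module defaults `fuse_fp` and `lineinfo` are plumbed through to the CPU and CUDA compile paths shown further down in this diff. A minimal sketch of how they might be toggled for the current module, assuming the usual `wp.set_module_options` entry point and that `fuse_fp` controls floating-point fusion (FMA contraction) while `lineinfo` adds line information for profilers:

```python
import warp as wp

# Sketch only: trade a little performance for bit-reproducible math and
# profiler-friendly device code in this module.
wp.set_module_options({"fuse_fp": False, "lineinfo": True})


@wp.kernel
def axpy(x: wp.array(dtype=float), y: wp.array(dtype=float), a: float):
    i = wp.tid()
    y[i] = a * x[i] + y[i]
```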
```diff
@@ -1965,28 +2013,27 @@ class Module:
 
     def hash_module(self):
         # compute latest hash
-        …
-        …
+        block_dim = self.options["block_dim"]
+        self.hashers[block_dim] = ModuleHasher(self)
+        return self.hashers[block_dim].get_module_hash()
 
     def load(self, device, block_dim=None) -> ModuleExec:
         device = runtime.get_device(device)
 
-        # …
-        # todo: it would be better to have a method such as `module.get_kernel(block_dim=N)`
-        # that can return a single kernel instance with a given block size
+        # update module options if launching with a new block dim
         if block_dim is not None:
-            if self.options["block_dim"] != block_dim:
-                self.unload()
             self.options["block_dim"] = block_dim
 
+        active_block_dim = self.options["block_dim"]
+
         # compute the hash if needed
-        if self.…
-            self.…
+        if active_block_dim not in self.hashers:
+            self.hashers[active_block_dim] = ModuleHasher(self)
 
         # check if executable module is already loaded and not stale
-        exec = self.execs.get(device.context)
+        exec = self.execs.get((device.context, active_block_dim))
         if exec is not None:
-            if exec.module_hash == self.…
+            if exec.module_hash == self.hashers[active_block_dim].get_module_hash():
                 return exec
 
         # quietly avoid repeated build attempts to reduce error spew
@@ -1994,10 +2041,11 @@ class Module:
             return None
 
         module_name = "wp_" + self.name
-        module_hash = self.…
+        module_hash = self.hashers[active_block_dim].get_module_hash()
 
         # use a unique module path using the module short hash
-        …
+        module_name_short = f"{module_name}_{module_hash.hex()[:7]}"
+        module_dir = os.path.join(warp.config.kernel_cache_dir, module_name_short)
 
         with warp.ScopedTimer(
             f"Module {self.name} {module_hash.hex()[:7]} load on device '{device}'", active=not warp.config.quiet
@@ -2005,7 +2053,7 @@ class Module:
             # -----------------------------------------------------------
             # determine output paths
             if device.is_cpu:
-                output_name = "…
+                output_name = f"{module_name_short}.o"
                 output_arch = None
 
             elif device.is_cuda:
@@ -2025,10 +2073,10 @@ class Module:
 
                 if use_ptx:
                     output_arch = min(device.arch, warp.config.ptx_target_arch)
-                    output_name = f"…
+                    output_name = f"{module_name_short}.sm{output_arch}.ptx"
                 else:
                     output_arch = device.arch
-                    output_name = f"…
+                    output_name = f"{module_name_short}.sm{output_arch}.cubin"
 
             # final object binary path
             binary_path = os.path.join(module_dir, output_name)
@@ -2050,7 +2098,7 @@ class Module:
                 # Some of the Tile codegen, such as cuFFTDx and cuBLASDx, requires knowledge of the target arch
                 "output_arch": output_arch,
             }
-            builder = ModuleBuilder(self, builder_options, hasher=self.…
+            builder = ModuleBuilder(self, builder_options, hasher=self.hashers[active_block_dim])
 
             # create a temporary (process unique) dir for build outputs before moving to the binary dir
             build_dir = os.path.join(
@@ -2066,7 +2114,7 @@ class Module:
             if device.is_cpu:
                 # build
                 try:
-                    source_code_path = os.path.join(build_dir, "…
+                    source_code_path = os.path.join(build_dir, f"{module_name_short}.cpp")
 
                     # write cpp sources
                     cpp_source = builder.codegen("cpu")
@@ -2084,6 +2132,7 @@ class Module:
                         mode=self.options["mode"],
                         fast_math=self.options["fast_math"],
                         verify_fp=warp.config.verify_fp,
+                        fuse_fp=self.options["fuse_fp"],
                     )
 
                 except Exception as e:
@@ -2094,7 +2143,7 @@ class Module:
             elif device.is_cuda:
                 # build
                 try:
-                    source_code_path = os.path.join(build_dir, "…
+                    source_code_path = os.path.join(build_dir, f"{module_name_short}.cu")
 
                     # write cuda sources
                     cu_source = builder.codegen("cuda")
@@ -2111,9 +2160,12 @@ class Module:
                         output_arch,
                         output_path,
                         config=self.options["mode"],
-                        fast_math=self.options["fast_math"],
                         verify_fp=warp.config.verify_fp,
+                        fast_math=self.options["fast_math"],
+                        fuse_fp=self.options["fuse_fp"],
+                        lineinfo=self.options["lineinfo"],
                         ltoirs=builder.ltoirs.values(),
+                        fatbins=builder.fatbins.values(),
                     )
 
                 except Exception as e:
@@ -2125,7 +2177,7 @@ class Module:
             # build meta data
 
             meta = builder.build_meta()
-            meta_path = os.path.join(build_dir, "…
+            meta_path = os.path.join(build_dir, f"{module_name_short}.meta")
 
             with open(meta_path, "w") as meta_file:
                 json.dump(meta, meta_file)
@@ -2133,12 +2185,34 @@ class Module:
            # -----------------------------------------------------------
            # update cache
 
-            …
-            …
-            …
-            …
-            …
-            …
+            def safe_rename(src, dst, attempts=5, delay=0.1):
+                for i in range(attempts):
+                    try:
+                        os.rename(src, dst)
+                        return
+                    except FileExistsError:
+                        return
+                    except OSError as e:
+                        if e.errno == errno.ENOTEMPTY:
+                            # if directory exists we assume another process
+                            # got there first, in which case we will copy
+                            # our output to the directory manually in second step
+                            return
+                        else:
+                            # otherwise assume directory creation failed e.g.: access denied
+                            # on Windows we see occasional failures to rename directories due to
+                            # some process holding a lock on a file to be moved to workaround
+                            # this we make multiple attempts to rename with some delay
+                            if i < attempts - 1:
+                                time.sleep(delay)
+                            else:
+                                print(
+                                    f"Could not update Warp cache with module binaries, trying to rename {build_dir} to {module_dir}, error {e}"
+                                )
+                                raise e
+
+            # try to move process outputs to cache
+            safe_rename(build_dir, module_dir)
 
             if os.path.exists(module_dir):
                 if not os.path.exists(binary_path):
@@ -2167,7 +2241,7 @@ class Module:
             # -----------------------------------------------------------
             # Load CPU or CUDA binary
 
-            meta_path = os.path.join(module_dir, "…
+            meta_path = os.path.join(module_dir, f"{module_name_short}.meta")
             with open(meta_path, "r") as meta_file:
                 meta = json.load(meta_file)
 
@@ -2177,13 +2251,13 @@ class Module:
                 self.cpu_exec_id += 1
                 runtime.llvm.load_obj(binary_path.encode("utf-8"), module_handle.encode("utf-8"))
                 module_exec = ModuleExec(module_handle, module_hash, device, meta)
-                self.execs[None] = module_exec
+                self.execs[(None, active_block_dim)] = module_exec
 
             elif device.is_cuda:
                 cuda_module = warp.build.load_cuda(binary_path, device)
                 if cuda_module is not None:
                     module_exec = ModuleExec(cuda_module, module_hash, device, meta)
-                    self.execs[device.context] = module_exec
+                    self.execs[(device.context, active_block_dim)] = module_exec
                 else:
                     module_load_timer.extra_msg = " (error)"
                     raise Exception(f"Failed to load CUDA module '{self.name}'")
@@ -2205,14 +2279,14 @@ class Module:
 
     def mark_modified(self):
         # clear hash data
-        self.…
+        self.hashers = {}
 
         # clear build failures
         self.failed_builds = set()
 
     # lookup kernel entry points based on name, called after compilation / module load
     def get_kernel_hooks(self, kernel, device):
-        module_exec = self.execs.get(device.context)
+        module_exec = self.execs.get((device.context, self.options["block_dim"]))
         if module_exec is not None:
            return module_exec.get_kernel_hooks(kernel)
        else:
```
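Because executables and hashes are now keyed by `(device.context, block_dim)`, a module can keep binaries for several block sizes resident at once instead of being unloaded and rebuilt on every switch. A minimal sketch of what this enables, assuming `wp.launch` forwards a `block_dim` argument as the launch-path changes later in this diff suggest:

```python
import warp as wp


@wp.kernel
def scale(x: wp.array(dtype=float), s: float):
    i = wp.tid()
    x[i] = x[i] * s


x = wp.zeros(1024, dtype=float, device="cuda:0")

# Alternating between block sizes no longer forces a module unload/rebuild each
# time; both executables stay cached after their first compilation.
wp.launch(scale, dim=1024, inputs=[x, 2.0], block_dim=256)
wp.launch(scale, dim=1024, inputs=[x, 0.5], block_dim=64)
```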
```diff
@@ -2331,6 +2405,7 @@ class Event:
         DEFAULT = 0x0
         BLOCKING_SYNC = 0x1
         DISABLE_TIMING = 0x2
+        INTERPROCESS = 0x4
 
     def __new__(cls, *args, **kwargs):
         """Creates a new event instance."""
@@ -2338,7 +2413,9 @@ class Event:
         instance.owner = False
         return instance
 
-    def __init__(…
+    def __init__(
+        self, device: "Devicelike" = None, cuda_event=None, enable_timing: bool = False, interprocess: bool = False
+    ):
         """Initializes the event on a CUDA device.
 
         Args:
@@ -2350,6 +2427,12 @@ class Event:
                 :func:`~warp.get_event_elapsed_time` can be used to measure the
                 time between two events created with ``enable_timing=True`` and
                 recorded onto streams.
+            interprocess: If ``True`` this event may be used as an interprocess event.
+
+        Raises:
+            RuntimeError: The event could not be created.
+            ValueError: The combination of ``enable_timing=True`` and
+                ``interprocess=True`` is not allowed.
         """
 
         device = get_device(device)
@@ -2364,11 +2447,48 @@ class Event:
         flags = Event.Flags.DEFAULT
         if not enable_timing:
             flags |= Event.Flags.DISABLE_TIMING
+        if interprocess:
+            if enable_timing:
+                raise ValueError("The combination of 'enable_timing=True' and 'interprocess=True' is not allowed.")
+            flags |= Event.Flags.INTERPROCESS
+
         self.cuda_event = runtime.core.cuda_event_create(device.context, flags)
         if not self.cuda_event:
             raise RuntimeError(f"Failed to create event on device {device}")
         self.owner = True
 
+    def ipc_handle(self) -> bytes:
+        """Return a CUDA IPC handle of the event as a 64-byte ``bytes`` object.
+
+        The event must have been created with ``interprocess=True`` in order to
+        obtain a valid interprocess handle.
+
+        IPC is currently only supported on Linux.
+
+        Example:
+            Create an event and get its IPC handle::
+
+                e1 = wp.Event(interprocess=True)
+                event_handle = e1.ipc_handle()
+
+        Raises:
+            RuntimeError: Device does not support IPC.
+        """
+
+        if self.device.is_ipc_supported is not False:
+            # Allocate a buffer for the data (64-element char array)
+            ipc_handle_buffer = (ctypes.c_char * 64)()
+
+            warp.context.runtime.core.cuda_ipc_get_event_handle(self.device.context, self.cuda_event, ipc_handle_buffer)
+
+            if ipc_handle_buffer.raw == bytes(64):
+                warp.utils.warn("IPC event handle appears to be invalid. Was interprocess=True used?")
+
+            return ipc_handle_buffer.raw
+
+        else:
+            raise RuntimeError(f"Device {self.device} does not support IPC.")
+
     def __del__(self):
         if not self.owner:
             return
```
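The producer side of the new interprocess event support follows directly from the `ipc_handle()` docstring above. A minimal sketch (Linux only; the transport used to hand the raw bytes to the other process is not shown):

```python
import warp as wp

wp.init()

# Producer process: create an interprocess-capable event and export its handle.
# enable_timing must stay False, since the timing and interprocess flags are exclusive.
e1 = wp.Event(device="cuda:0", interprocess=True)
handle = e1.ipc_handle()  # 64-byte bytes object

# `handle` can now be sent to another process, e.g. over a multiprocessing.Pipe
# or POSIX shared memory.
```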
```diff
@@ -2516,23 +2636,27 @@ class Device:
     """A device to allocate Warp arrays and to launch kernels on.
 
     Attributes:
-        ordinal: A Warp-specific …
-        name: A …
+        ordinal (int): A Warp-specific label for the device. ``-1`` for CPU devices.
+        name (str): A label for the device. By default, CPU devices will be named according to the processor name,
             or ``"CPU"`` if the processor name cannot be determined.
-        arch: …
-            `…
-        is_uva: …
+        arch (int): The compute capability version number calculated as ``10 * major + minor``.
+            ``0`` for CPU devices.
+        is_uva (bool): Indicates whether the device supports unified addressing.
             ``False`` for CPU devices.
-        is_cubin_supported: …
+        is_cubin_supported (bool): Indicates whether Warp's version of NVRTC can directly
             generate CUDA binary files (cubin) for this device's architecture. ``False`` for CPU devices.
-        is_mempool_supported: …
-            `…
-        …
-        …
-        …
-        …
-        …
-        …
+        is_mempool_supported (bool): Indicates whether the device supports using the ``cuMemAllocAsync`` and
+            ``cuMemPool`` family of APIs for stream-ordered memory allocations. ``False`` for CPU devices.
+        is_ipc_supported (Optional[bool]): Indicates whether the device supports IPC.
+
+            - ``True`` if supported.
+            - ``False`` if not supported.
+            - ``None`` if IPC support could not be determined (e.g. CUDA 11).
+
+        is_primary (bool): Indicates whether this device's CUDA context is also the device's primary context.
+        uuid (str): The UUID of the CUDA device. The UUID is in the same format used by ``nvidia-smi -L``.
+            ``None`` for CPU devices.
+        pci_bus_id (str): An identifier for the CUDA device in the format ``[domain]:[bus]:[device]``, in which
             ``domain``, ``bus``, and ``device`` are all hexadecimal values. ``None`` for CPU devices.
     """
 
@@ -2565,6 +2689,7 @@ class Device:
             self.is_uva = False
             self.is_mempool_supported = False
             self.is_mempool_enabled = False
+            self.is_ipc_supported = False  # TODO: Support IPC for CPU arrays
             self.is_cubin_supported = False
             self.uuid = None
             self.pci_bus_id = None
@@ -2580,8 +2705,14 @@ class Device:
             # CUDA device
             self.name = runtime.core.cuda_device_get_name(ordinal).decode()
             self.arch = runtime.core.cuda_device_get_arch(ordinal)
-            self.is_uva = runtime.core.cuda_device_is_uva(ordinal)
-            self.is_mempool_supported = runtime.core.cuda_device_is_mempool_supported(ordinal)
+            self.is_uva = runtime.core.cuda_device_is_uva(ordinal) > 0
+            self.is_mempool_supported = runtime.core.cuda_device_is_mempool_supported(ordinal) > 0
+            if platform.system() == "Linux":
+                # Use None when IPC support cannot be determined
+                ipc_support_api_query = runtime.core.cuda_device_is_ipc_supported(ordinal)
+                self.is_ipc_supported = bool(ipc_support_api_query) if ipc_support_api_query >= 0 else None
+            else:
+                self.is_ipc_supported = False
             if warp.config.enable_mempools_at_init:
                 # enable if supported
                 self.is_mempool_enabled = self.is_mempool_supported
```
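Since `is_ipc_supported` is now a tri-state attribute (`True`, `False`, or `None` when the query is unavailable, e.g. on CUDA 11), callers that depend on IPC can probe it before exporting handles. A short sketch:

```python
import warp as wp

wp.init()
device = wp.get_device("cuda:0")

if device.is_ipc_supported:
    print(f"{device} supports CUDA IPC")
elif device.is_ipc_supported is None:
    print(f"IPC support on {device} could not be determined")
else:
    # Non-Linux platforms and CPU devices report False.
    print(f"{device} does not support CUDA IPC")
```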
```diff
@@ -3062,6 +3193,9 @@ class Runtime:
         self.core.radix_sort_pairs_int_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
         self.core.radix_sort_pairs_int_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
 
+        self.core.radix_sort_pairs_float_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
+        self.core.radix_sort_pairs_float_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
+
         self.core.runlength_encode_int_host.argtypes = [
             ctypes.c_uint64,
             ctypes.c_uint64,
@@ -3078,10 +3212,16 @@ class Runtime:
         ]
 
         self.core.bvh_create_host.restype = ctypes.c_uint64
-        self.core.bvh_create_host.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
+        self.core.bvh_create_host.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int, ctypes.c_int]
 
         self.core.bvh_create_device.restype = ctypes.c_uint64
-        self.core.bvh_create_device.argtypes = […
+        self.core.bvh_create_device.argtypes = [
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.c_int,
+            ctypes.c_int,
+        ]
 
         self.core.bvh_destroy_host.argtypes = [ctypes.c_uint64]
         self.core.bvh_destroy_device.argtypes = [ctypes.c_uint64]
@@ -3097,6 +3237,7 @@ class Runtime:
             ctypes.c_int,
             ctypes.c_int,
             ctypes.c_int,
+            ctypes.c_int,
         ]
 
         self.core.mesh_create_device.restype = ctypes.c_uint64
@@ -3108,6 +3249,7 @@ class Runtime:
             ctypes.c_int,
             ctypes.c_int,
             ctypes.c_int,
+            ctypes.c_int,
         ]
 
         self.core.mesh_destroy_host.argtypes = [ctypes.c_uint64]
@@ -3345,6 +3487,8 @@ class Runtime:
         self.core.cuda_device_is_uva.restype = ctypes.c_int
         self.core.cuda_device_is_mempool_supported.argtypes = [ctypes.c_int]
         self.core.cuda_device_is_mempool_supported.restype = ctypes.c_int
+        self.core.cuda_device_is_ipc_supported.argtypes = [ctypes.c_int]
+        self.core.cuda_device_is_ipc_supported.restype = ctypes.c_int
         self.core.cuda_device_set_mempool_release_threshold.argtypes = [ctypes.c_int, ctypes.c_uint64]
         self.core.cuda_device_set_mempool_release_threshold.restype = ctypes.c_int
         self.core.cuda_device_get_mempool_release_threshold.argtypes = [ctypes.c_int]
@@ -3398,6 +3542,22 @@ class Runtime:
         self.core.cuda_set_mempool_access_enabled.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
         self.core.cuda_set_mempool_access_enabled.restype = ctypes.c_int
 
+        # inter-process communication
+        self.core.cuda_ipc_get_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
+        self.core.cuda_ipc_get_mem_handle.restype = None
+        self.core.cuda_ipc_open_mem_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
+        self.core.cuda_ipc_open_mem_handle.restype = ctypes.c_void_p
+        self.core.cuda_ipc_close_mem_handle.argtypes = [ctypes.c_void_p]
+        self.core.cuda_ipc_close_mem_handle.restype = None
+        self.core.cuda_ipc_get_event_handle.argtypes = [
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.POINTER(ctypes.c_char),
+        ]
+        self.core.cuda_ipc_get_event_handle.restype = None
+        self.core.cuda_ipc_open_event_handle.argtypes = [ctypes.c_void_p, ctypes.POINTER(ctypes.c_char)]
+        self.core.cuda_ipc_open_event_handle.restype = ctypes.c_void_p
+
         self.core.cuda_stream_create.argtypes = [ctypes.c_void_p, ctypes.c_int]
         self.core.cuda_stream_create.restype = ctypes.c_void_p
         self.core.cuda_stream_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
@@ -3445,6 +3605,7 @@ class Runtime:
 
         self.core.cuda_compile_program.argtypes = [
             ctypes.c_char_p,  # cuda_src
+            ctypes.c_char_p,  # program name
             ctypes.c_int,  # arch
             ctypes.c_char_p,  # include_dir
             ctypes.c_int,  # num_cuda_include_dirs
@@ -3453,10 +3614,13 @@ class Runtime:
             ctypes.c_bool,  # verbose
             ctypes.c_bool,  # verify_fp
             ctypes.c_bool,  # fast_math
+            ctypes.c_bool,  # fuse_fp
+            ctypes.c_bool,  # lineinfo
             ctypes.c_char_p,  # output_path
             ctypes.c_size_t,  # num_ltoirs
             ctypes.POINTER(ctypes.c_char_p),  # ltoirs
             ctypes.POINTER(ctypes.c_size_t),  # ltoir_sizes
+            ctypes.POINTER(ctypes.c_int),  # ltoir_input_types, each of type nvJitLinkInputType
         ]
         self.core.cuda_compile_program.restype = ctypes.c_size_t
 
@@ -3496,6 +3660,22 @@ class Runtime:
         ]
         self.core.cuda_compile_dot.restype = ctypes.c_bool
 
+        self.core.cuda_compile_solver.argtypes = [
+            ctypes.c_char_p,  # universal fatbin
+            ctypes.c_char_p,  # lto
+            ctypes.c_char_p,  # function name
+            ctypes.c_int,  # num include dirs
+            ctypes.POINTER(ctypes.c_char_p),  # include dirs
+            ctypes.c_char_p,  # mathdx include dir
+            ctypes.c_int,  # arch
+            ctypes.c_int,  # M
+            ctypes.c_int,  # N
+            ctypes.c_int,  # precision
+            ctypes.c_int,  # fill_mode
+            ctypes.c_int,  # num threads
+        ]
+        self.core.cuda_compile_fft.restype = ctypes.c_bool
+
         self.core.cuda_load_module.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
         self.core.cuda_load_module.restype = ctypes.c_void_p
 
@@ -4074,7 +4254,7 @@ def set_mempool_enabled(device: Devicelike, enable: bool) -> None:
     They should generally be enabled, but there is a rare caveat. Copying data between different GPUs
     may fail during graph capture if the memory was allocated using pooled allocators and memory pool
     access is not enabled between the two GPUs. This is an internal CUDA limitation that is not related
-    to Warp. The preferred solution is to enable memory pool access using …
+    to Warp. The preferred solution is to enable memory pool access using :func:`set_mempool_access_enabled`.
     If peer access is not supported, then the default CUDA allocators must be used to pre-allocate the memory
     prior to graph capture.
     """
@@ -4846,6 +5026,40 @@ def from_numpy(
     )
 
 
+def event_from_ipc_handle(handle, device: "Devicelike" = None) -> Event:
+    """Create an event from an IPC handle.
+
+    Args:
+        handle: The interprocess event handle for an existing CUDA event.
+        device (Devicelike): Device to associate with the array.
+
+    Returns:
+        An event created from the interprocess event handle ``handle``.
+
+    Raises:
+        RuntimeError: IPC is not supported on ``device``.
+    """
+
+    try:
+        # Performance note: try first, ask questions later
+        device = warp.context.runtime.get_device(device)
+    except Exception:
+        # Fallback to using the public API for retrieving the device,
+        # which takes take of initializing Warp if needed.
+        device = warp.context.get_device(device)
+
+    if device.is_ipc_supported is False:
+        raise RuntimeError(f"IPC is not supported on device {device}.")
+
+    event = Event(
+        device=device, cuda_event=warp.context.runtime.core.cuda_ipc_open_event_handle(device.context, handle)
+    )
+    # Events created from IPC handles must be freed with cuEventDestroy
+    event.owner = True
+
+    return event
+
+
 # given a kernel destination argument type and a value convert
 # to a c-type that can be passed to a kernel
 def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
```
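On the consumer side, `event_from_ipc_handle` rebuilds an `Event` from the 64-byte handle exported by `Event.ipc_handle()` in another process. A sketch, assuming the helper is re-exported at the package top level (the `warp/__init__.py` change in this release suggests new exports, but that is an assumption; `warp.context.event_from_ipc_handle` works either way):

```python
import warp as wp


def consumer(event_handle: bytes):
    # Rebuild the event in this process and make the local stream wait on it.
    ev = wp.event_from_ipc_handle(event_handle, device="cuda:0")
    stream = wp.get_stream("cuda:0")
    stream.wait_event(ev)
```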
```diff
@@ -4927,6 +5141,9 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
 
     # try to convert to a value type (vec3, mat33, etc)
     elif issubclass(arg_type, ctypes.Array):
+        # simple value types don't have gradient arrays, but native built-in signatures still expect a non-null adjoint value of the correct type
+        if value is None and adjoint:
+            return arg_type(0)
         if warp.types.types_equal(type(value), arg_type):
             return value
         else:
@@ -4936,9 +5153,6 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
             except Exception as e:
                 raise ValueError(f"Failed to convert argument for param {arg_name} to {type_str(arg_type)}") from e
 
-    elif isinstance(value, bool):
-        return ctypes.c_bool(value)
-
     elif isinstance(value, arg_type):
         try:
             # try to pack as a scalar type
@@ -4953,6 +5167,9 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
             ) from e
 
     else:
+        # scalar args don't have gradient arrays, but native built-in signatures still expect a non-null scalar adjoint
+        if value is None and adjoint:
+            return arg_type._type_(0)
         try:
             # try to pack as a scalar type
             if arg_type is warp.types.float16:
@@ -5272,6 +5489,8 @@ def launch(
             params_addr=kernel_params,
             bounds=bounds,
             device=device,
+            max_blocks=max_blocks,
+            block_dim=block_dim,
         )
         return launch
 
@@ -5355,7 +5574,7 @@ def launch_tiled(*args, **kwargs):
         kwargs["dim"] = dim + [kwargs["block_dim"]]
 
         # forward to original launch method
-        launch(*args, **kwargs)
+        return launch(*args, **kwargs)
 
 
 def synchronize():
@@ -6010,14 +6229,19 @@ def export_functions_rst(file):  # pragma: no cover
     # build dictionary of all functions by group
     groups = {}
 
-    …
+    functions = list(builtin_functions.values())
+
+    for f in functions:
         # build dict of groups
         if f.group not in groups:
             groups[f.group] = []
 
-        …
-        …
-        …
+        if hasattr(f, "overloads"):
+            # append all overloads to the group
+            for o in f.overloads:
+                groups[f.group].append(o)
+        else:
+            groups[f.group].append(f)
 
     # Keep track of what function and query types have been written
     written_functions = set()
@@ -6037,6 +6261,10 @@ def export_functions_rst(file):  # pragma: no cover
         print("---------------", file=file)
 
         for f in g:
+            if f.func:
+                # f is a Warp function written in Python, we can use autofunction
+                print(f".. autofunction:: {f.func.__module__}.{f.key}", file=file)
+                continue
             for f_prefix, query_type in query_types:
                 if f.key.startswith(f_prefix) and query_type not in written_query_types:
                     print(f".. autoclass:: {query_type}", file=file)
@@ -6094,24 +6322,32 @@ def export_stubs(file):  # pragma: no cover
     print(header, file=file)
     print(file=file)
 
-    …
-    …
-        args = ", ".join(f"{k}: {type_str(v)}" for k, v in f.input_types.items())
+    def add_stub(f):
+        args = ", ".join(f"{k}: {type_str(v)}" for k, v in f.input_types.items())
 
-        …
+        return_str = ""
 
-        …
-        …
+        if f.hidden:  # or f.generic:
+            return
 
+        return_type = f.value_type
+        if f.value_func:
             return_type = f.value_func(None, None)
-        …
-        …
-        …
-        …
-        …
-        …
-        …
-        …
+        if return_type:
+            return_str = " -> " + type_str(return_type)
+
+        print("@over", file=file)
+        print(f"def {f.key}({args}){return_str}:", file=file)
+        print(f' """{f.doc}', file=file)
+        print(' """', file=file)
+        print(" ...\n\n", file=file)
+
+    for g in builtin_functions.values():
+        if hasattr(g, "overloads"):
+            for f in g.overloads:
+                add_stub(f)
+        else:
+            add_stub(g)
 
 
 def export_builtins(file: io.TextIOBase):  # pragma: no cover
@@ -6137,6 +6373,8 @@ def export_builtins(file: io.TextIOBase):  # pragma: no cover
     file.write('extern "C" {\n\n')
 
     for k, g in builtin_functions.items():
+        if not hasattr(g, "overloads"):
+            continue
         for f in g.overloads:
             if not f.export or f.generic:
                 continue
```