warp-lang 1.3.3__py3-none-manylinux2014_aarch64.whl → 1.4.1__py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (110) hide show
  1. warp/__init__.py +6 -0
  2. warp/autograd.py +59 -6
  3. warp/bin/warp.so +0 -0
  4. warp/build_dll.py +8 -10
  5. warp/builtins.py +103 -3
  6. warp/codegen.py +447 -53
  7. warp/config.py +1 -1
  8. warp/context.py +682 -405
  9. warp/dlpack.py +2 -0
  10. warp/examples/benchmarks/benchmark_cloth.py +10 -0
  11. warp/examples/core/example_render_opengl.py +12 -10
  12. warp/examples/fem/example_adaptive_grid.py +251 -0
  13. warp/examples/fem/example_apic_fluid.py +1 -1
  14. warp/examples/fem/example_diffusion_3d.py +2 -2
  15. warp/examples/fem/example_magnetostatics.py +1 -1
  16. warp/examples/fem/example_streamlines.py +1 -0
  17. warp/examples/fem/utils.py +25 -5
  18. warp/examples/sim/example_cloth.py +50 -6
  19. warp/fem/__init__.py +2 -0
  20. warp/fem/adaptivity.py +493 -0
  21. warp/fem/field/field.py +2 -1
  22. warp/fem/field/nodal_field.py +18 -26
  23. warp/fem/field/test.py +4 -4
  24. warp/fem/field/trial.py +4 -4
  25. warp/fem/geometry/__init__.py +1 -0
  26. warp/fem/geometry/adaptive_nanogrid.py +843 -0
  27. warp/fem/geometry/nanogrid.py +55 -28
  28. warp/fem/space/__init__.py +1 -1
  29. warp/fem/space/nanogrid_function_space.py +69 -35
  30. warp/fem/utils.py +118 -107
  31. warp/jax_experimental.py +28 -15
  32. warp/native/array.h +0 -1
  33. warp/native/builtin.h +103 -6
  34. warp/native/bvh.cu +4 -2
  35. warp/native/cuda_util.cpp +14 -0
  36. warp/native/cuda_util.h +2 -0
  37. warp/native/error.cpp +4 -2
  38. warp/native/exports.h +99 -0
  39. warp/native/mat.h +97 -0
  40. warp/native/mesh.cpp +36 -0
  41. warp/native/mesh.cu +52 -1
  42. warp/native/mesh.h +1 -0
  43. warp/native/quat.h +43 -0
  44. warp/native/range.h +11 -2
  45. warp/native/spatial.h +6 -0
  46. warp/native/vec.h +74 -0
  47. warp/native/warp.cpp +2 -1
  48. warp/native/warp.cu +10 -3
  49. warp/native/warp.h +8 -1
  50. warp/paddle.py +382 -0
  51. warp/sim/__init__.py +1 -0
  52. warp/sim/collide.py +519 -0
  53. warp/sim/integrator_euler.py +18 -5
  54. warp/sim/integrator_featherstone.py +5 -5
  55. warp/sim/integrator_vbd.py +1026 -0
  56. warp/sim/integrator_xpbd.py +2 -6
  57. warp/sim/model.py +50 -25
  58. warp/sparse.py +9 -7
  59. warp/stubs.py +459 -0
  60. warp/tape.py +2 -0
  61. warp/tests/aux_test_dependent.py +1 -0
  62. warp/tests/aux_test_name_clash1.py +32 -0
  63. warp/tests/aux_test_name_clash2.py +32 -0
  64. warp/tests/aux_test_square.py +1 -0
  65. warp/tests/test_array.py +188 -0
  66. warp/tests/test_async.py +3 -3
  67. warp/tests/test_atomic.py +6 -0
  68. warp/tests/test_closest_point_edge_edge.py +93 -1
  69. warp/tests/test_codegen.py +93 -15
  70. warp/tests/test_codegen_instancing.py +1457 -0
  71. warp/tests/test_collision.py +486 -0
  72. warp/tests/test_compile_consts.py +3 -28
  73. warp/tests/test_dlpack.py +170 -0
  74. warp/tests/test_examples.py +22 -8
  75. warp/tests/test_fast_math.py +10 -4
  76. warp/tests/test_fem.py +81 -1
  77. warp/tests/test_func.py +46 -0
  78. warp/tests/test_implicit_init.py +49 -0
  79. warp/tests/test_jax.py +58 -0
  80. warp/tests/test_mat.py +84 -0
  81. warp/tests/test_mesh_query_point.py +188 -0
  82. warp/tests/test_model.py +13 -0
  83. warp/tests/test_module_hashing.py +40 -0
  84. warp/tests/test_multigpu.py +3 -3
  85. warp/tests/test_overwrite.py +8 -0
  86. warp/tests/test_paddle.py +852 -0
  87. warp/tests/test_print.py +89 -0
  88. warp/tests/test_quat.py +111 -0
  89. warp/tests/test_reload.py +31 -1
  90. warp/tests/test_scalar_ops.py +2 -0
  91. warp/tests/test_static.py +568 -0
  92. warp/tests/test_streams.py +64 -3
  93. warp/tests/test_struct.py +4 -4
  94. warp/tests/test_torch.py +24 -0
  95. warp/tests/test_triangle_closest_point.py +137 -0
  96. warp/tests/test_types.py +1 -1
  97. warp/tests/test_vbd.py +386 -0
  98. warp/tests/test_vec.py +143 -0
  99. warp/tests/test_vec_scalar_ops.py +139 -0
  100. warp/tests/unittest_suites.py +12 -0
  101. warp/tests/unittest_utils.py +9 -5
  102. warp/thirdparty/dlpack.py +3 -1
  103. warp/types.py +167 -36
  104. warp/utils.py +37 -14
  105. {warp_lang-1.3.3.dist-info → warp_lang-1.4.1.dist-info}/METADATA +10 -8
  106. {warp_lang-1.3.3.dist-info → warp_lang-1.4.1.dist-info}/RECORD +109 -97
  107. warp/tests/test_point_triangle_closest_point.py +0 -143
  108. {warp_lang-1.3.3.dist-info → warp_lang-1.4.1.dist-info}/LICENSE.md +0 -0
  109. {warp_lang-1.3.3.dist-info → warp_lang-1.4.1.dist-info}/WHEEL +0 -0
  110. {warp_lang-1.3.3.dist-info → warp_lang-1.4.1.dist-info}/top_level.txt +0 -0
warp/context.py CHANGED
@@ -6,7 +6,6 @@
6
6
  # license agreement from NVIDIA CORPORATION is strictly prohibited.
7
7
 
8
8
  import ast
9
- import builtins
10
9
  import ctypes
11
10
  import functools
12
11
  import hashlib
@@ -19,9 +18,9 @@ import platform
19
18
  import sys
20
19
  import types
21
20
  import typing
21
+ import weakref
22
22
  from copy import copy as shallowcopy
23
23
  from pathlib import Path
24
- from struct import pack as struct_pack
25
24
  from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union
26
25
 
27
26
  import numpy as np
@@ -36,6 +35,10 @@ import warp.config
36
35
 
37
36
  def create_value_func(type):
38
37
  def value_func(arg_types, arg_values):
38
+ hint_origin = getattr(type, "__origin__", None)
39
+ if hint_origin is not None and issubclass(hint_origin, typing.Tuple):
40
+ return type.__args__
41
+
39
42
  return type
40
43
 
41
44
  return value_func
@@ -54,6 +57,38 @@ def get_function_args(func):
54
57
  complex_type_hints = (Any, Callable, Tuple)
55
58
  sequence_types = (list, tuple)
56
59
 
60
+ function_key_counts = {}
61
+
62
+
63
+ def generate_unique_function_identifier(key):
64
+ # Generate unique identifiers for user-defined functions in native code.
65
+ # - Prevents conflicts when a function is redefined and old versions are still in use.
66
+ # - Prevents conflicts between multiple closures returned from the same function.
67
+ # - Prevents conflicts between identically named functions from different modules.
68
+ #
69
+ # Currently, we generate a unique id when a new Function is created, which produces
70
+ # globally unique identifiers.
71
+ #
72
+ # NOTE:
73
+ # We could move this to the Module class for generating unique identifiers at module scope,
74
+ # but then we need another solution for preventing conflicts across modules (e.g., different namespaces).
75
+ # That would require more Python code, generate more native code, and would be slightly slower
76
+ # with no clear advantages over globally-unique identifiers (non-global shared state is still shared state).
77
+ #
78
+ # TODO:
79
+ # Kernels and structs use unique identifiers based on their hash. Using hash-based identifiers
80
+ # for functions would allow filtering out duplicate identical functions during codegen,
81
+ # like we do with kernels and structs. This is worth investigating further, but might require
82
+ # additional refactoring. For example, the code that deals with custom gradient and replay functions
83
+ # requires matching function names, but these special functions get created before the hash
84
+ # for the parent function can be computed. In addition to these complications, computing hashes
85
+ # for all function instances would increase the cost of module hashing when generic functions
86
+ # are involved (currently we only hash the generic templates, which is sufficient).
87
+
88
+ unique_id = function_key_counts.get(key, 0)
89
+ function_key_counts[key] = unique_id + 1
90
+ return f"{key}_{unique_id}"
91
+
57
92
 
58
93
  class Function:
59
94
  def __init__(
@@ -90,6 +125,7 @@ class Function:
90
125
  code_transformers=None,
91
126
  skip_adding_overload=False,
92
127
  require_original_output_arg=False,
128
+ scope_locals=None, # the locals() where the function is defined, used for overload management
93
129
  ):
94
130
  if code_transformers is None:
95
131
  code_transformers = []
@@ -115,6 +151,7 @@ class Function:
115
151
  self.replay_snippet = replay_snippet
116
152
  self.custom_grad_func = None
117
153
  self.require_original_output_arg = require_original_output_arg
154
+ self.generic_parent = None # generic function that was used to instantiate this overload
118
155
 
119
156
  if initializer_list_func is None:
120
157
  self.initializer_list_func = lambda x, y: False
@@ -124,14 +161,19 @@ class Function:
124
161
  )
125
162
  self.hidden = hidden # function will not be listed in docs
126
163
  self.skip_replay = (
127
- skip_replay # whether or not operation will be performed during the forward replay in the backward pass
164
+ skip_replay # whether operation will be performed during the forward replay in the backward pass
128
165
  )
129
- self.missing_grad = missing_grad # whether or not builtin is missing a corresponding adjoint
166
+ self.missing_grad = missing_grad # whether builtin is missing a corresponding adjoint
130
167
  self.generic = generic
131
168
 
132
- # allow registering builtin functions with a different name in Python from the native code
169
+ # allow registering functions with a different name in Python and native code
133
170
  if native_func is None:
134
- self.native_func = key
171
+ if func is None:
172
+ # builtin function
173
+ self.native_func = key
174
+ else:
175
+ # user functions need unique identifiers to avoid conflicts
176
+ self.native_func = generate_unique_function_identifier(key)
135
177
  else:
136
178
  self.native_func = native_func
137
179
 
@@ -162,6 +204,11 @@ class Function:
162
204
  else:
163
205
  self.input_types[name] = type
164
206
 
207
+ # Record any default parameter values.
208
+ if not self.defaults:
209
+ signature = inspect.signature(func)
210
+ self.defaults = {k: v.default for k, v in signature.parameters.items() if v.default is not v.empty}
211
+
165
212
  else:
166
213
  # builtin function
167
214
 
@@ -210,9 +257,13 @@ class Function:
210
257
  signature_params.append(param)
211
258
  self.signature = inspect.Signature(signature_params)
212
259
 
260
+ # scope for resolving overloads
261
+ if scope_locals is None:
262
+ scope_locals = inspect.currentframe().f_back.f_locals
263
+
213
264
  # add to current module
214
265
  if module:
215
- module.register_function(self, skip_adding_overload)
266
+ module.register_function(self, scope_locals, skip_adding_overload)
216
267
 
217
268
  def __call__(self, *args, **kwargs):
218
269
  # handles calling a builtin (native) function
@@ -323,57 +374,52 @@ class Function:
323
374
 
324
375
  # check if generic
325
376
  if warp.types.is_generic_signature(sig):
326
- if sig in self.user_templates:
327
- raise RuntimeError(
328
- f"Duplicate generic function overload {self.key} with arguments {f.input_types.values()}"
329
- )
330
377
  self.user_templates[sig] = f
331
378
  else:
332
- if sig in self.user_overloads:
333
- raise RuntimeError(
334
- f"Duplicate function overload {self.key} with arguments {f.input_types.values()}"
335
- )
336
379
  self.user_overloads[sig] = f
337
380
 
338
381
  def get_overload(self, arg_types, kwarg_types):
339
382
  assert not self.is_builtin()
340
383
 
341
- sig = warp.types.get_signature(arg_types, func_name=self.key)
384
+ for f in self.user_overloads.values():
385
+ if warp.codegen.func_match_args(f, arg_types, kwarg_types):
386
+ return f
342
387
 
343
- f = self.user_overloads.get(sig)
344
- if f is not None:
345
- return f
346
- else:
347
- for f in self.user_templates.values():
348
- if len(f.input_types) != len(arg_types):
349
- continue
388
+ for f in self.user_templates.values():
389
+ if not warp.codegen.func_match_args(f, arg_types, kwarg_types):
390
+ continue
391
+
392
+ if len(f.input_types) != len(arg_types):
393
+ continue
350
394
 
351
- # try to match the given types to the function template types
352
- template_types = list(f.input_types.values())
353
- args_matched = True
395
+ # try to match the given types to the function template types
396
+ template_types = list(f.input_types.values())
397
+ args_matched = True
354
398
 
355
- for i in range(len(arg_types)):
356
- if not warp.types.type_matches_template(arg_types[i], template_types[i]):
357
- args_matched = False
358
- break
399
+ for i in range(len(arg_types)):
400
+ if not warp.types.type_matches_template(arg_types[i], template_types[i]):
401
+ args_matched = False
402
+ break
359
403
 
360
- if args_matched:
361
- # instantiate this function with the specified argument types
404
+ if args_matched:
405
+ # instantiate this function with the specified argument types
362
406
 
363
- arg_names = f.input_types.keys()
364
- overload_annotations = dict(zip(arg_names, arg_types))
407
+ arg_names = f.input_types.keys()
408
+ overload_annotations = dict(zip(arg_names, arg_types))
365
409
 
366
- ovl = shallowcopy(f)
367
- ovl.adj = warp.codegen.Adjoint(f.func, overload_annotations)
368
- ovl.input_types = overload_annotations
369
- ovl.value_func = None
410
+ ovl = shallowcopy(f)
411
+ ovl.adj = warp.codegen.Adjoint(f.func, overload_annotations)
412
+ ovl.input_types = overload_annotations
413
+ ovl.value_func = None
414
+ ovl.generic_parent = f
370
415
 
371
- self.user_overloads[sig] = ovl
416
+ sig = warp.types.get_signature(arg_types, func_name=self.key)
417
+ self.user_overloads[sig] = ovl
372
418
 
373
- return ovl
419
+ return ovl
374
420
 
375
- # failed to find overload
376
- return None
421
+ # failed to find overload
422
+ return None
377
423
 
378
424
  def __repr__(self):
379
425
  inputs_str = ", ".join([f"{k}: {warp.types.type_repr(v)}" for k, v in self.input_types.items()])
@@ -589,8 +635,7 @@ class Kernel:
589
635
  self.module = module
590
636
 
591
637
  if key is None:
592
- unique_key = self.module.generate_unique_kernel_key(func.__name__)
593
- self.key = unique_key
638
+ self.key = warp.codegen.make_full_qualified_name(func)
594
639
  else:
595
640
  self.key = key
596
641
 
@@ -614,9 +659,15 @@ class Kernel:
614
659
  # known overloads for generic kernels, indexed by type signature
615
660
  self.overloads = {}
616
661
 
662
+ # generic kernel that was used to instantiate this overload
663
+ self.generic_parent = None
664
+
617
665
  # argument indices by name
618
666
  self.arg_indices = {a.label: i for i, a in enumerate(self.adj.args)}
619
667
 
668
+ # hash will be computed when the module is built
669
+ self.hash = None
670
+
620
671
  if self.module:
621
672
  self.module.register_kernel(self)
622
673
 
@@ -664,10 +715,11 @@ class Kernel:
664
715
  ovl.is_generic = False
665
716
  ovl.overloads = {}
666
717
  ovl.sig = sig
718
+ ovl.generic_parent = self
667
719
 
668
720
  self.overloads[sig] = ovl
669
721
 
670
- self.module.unload()
722
+ self.module.mark_modified()
671
723
 
672
724
  return ovl
673
725
 
@@ -676,10 +728,13 @@ class Kernel:
676
728
  return self.overloads.get(sig)
677
729
 
678
730
  def get_mangled_name(self):
679
- if self.sig:
680
- return f"{self.key}_{self.sig}"
681
- else:
682
- return self.key
731
+ if self.hash is None:
732
+ raise RuntimeError(f"Missing hash for kernel {self.key} in module {self.module.name}")
733
+
734
+ # TODO: allow customizing the number of hash characters used
735
+ hash_suffix = self.hash.hex()[:8]
736
+
737
+ return f"{self.key}_{hash_suffix}"
683
738
 
684
739
 
685
740
  # ----------------------
@@ -689,9 +744,11 @@ class Kernel:
689
744
  def func(f):
690
745
  name = warp.codegen.make_full_qualified_name(f)
691
746
 
747
+ scope_locals = inspect.currentframe().f_back.f_locals
748
+
692
749
  m = get_module(f.__module__)
693
750
  Function(
694
- func=f, key=name, namespace="", module=m, value_func=None
751
+ func=f, key=name, namespace="", module=m, value_func=None, scope_locals=scope_locals
695
752
  ) # value_type not known yet, will be inferred during Adjoint.build()
696
753
 
697
754
  # use the top of the list of overloads for this key
@@ -705,6 +762,8 @@ def func_native(snippet, adj_snippet=None, replay_snippet=None):
705
762
  Decorator to register native code snippet, @func_native
706
763
  """
707
764
 
765
+ scope_locals = inspect.currentframe().f_back.f_locals
766
+
708
767
  def snippet_func(f):
709
768
  name = warp.codegen.make_full_qualified_name(f)
710
769
 
@@ -717,6 +776,7 @@ def func_native(snippet, adj_snippet=None, replay_snippet=None):
717
776
  native_snippet=snippet,
718
777
  adj_native_snippet=adj_snippet,
719
778
  replay_snippet=replay_snippet,
779
+ scope_locals=scope_locals,
720
780
  ) # value_type not known yet, will be inferred during Adjoint.build()
721
781
  g = m.functions[name]
722
782
  # copy over the function attributes, including docstring
@@ -783,6 +843,7 @@ def func_grad(forward_fn):
783
843
  f.custom_grad_func = Function(
784
844
  grad_fn,
785
845
  key=f.key,
846
+ native_func=f.native_func,
786
847
  namespace=f.namespace,
787
848
  input_types=reverse_args,
788
849
  value_func=None,
@@ -941,7 +1002,7 @@ def overload(kernel, arg_types=None):
941
1002
  # ensure this function name corresponds to a kernel
942
1003
  fn = kernel
943
1004
  module = get_module(fn.__module__)
944
- kernel = module.kernels.get(fn.__name__)
1005
+ kernel = module.find_kernel(fn)
945
1006
  if kernel is None:
946
1007
  raise RuntimeError(f"Failed to find a kernel named '{fn.__name__}' in module {fn.__module__}")
947
1008
 
@@ -1277,7 +1338,6 @@ def get_module(name):
1277
1338
  # clear out old kernels, funcs, struct definitions
1278
1339
  old_module.kernels = {}
1279
1340
  old_module.functions = {}
1280
- old_module.constants = {}
1281
1341
  old_module.structs = {}
1282
1342
  old_module.loader = parent_loader
1283
1343
 
@@ -1289,30 +1349,206 @@ def get_module(name):
1289
1349
  return user_modules[name]
1290
1350
 
1291
1351
 
1352
+ # ModuleHasher computes the module hash based on all the kernels, module options,
1353
+ # and build configuration. For each kernel, it computes a deep hash by recursively
1354
+ # hashing all referenced functions, structs, and constants, even those defined in
1355
+ # other modules. The module hash is computed in the constructor and can be retrieved
1356
+ # using get_module_hash(). In addition, the ModuleHasher takes care of filtering out
1357
+ # duplicate kernels for codegen (see get_unique_kernels()).
1358
+ class ModuleHasher:
1359
+ def __init__(self, module):
1360
+ # cache function hashes to avoid hashing multiple times
1361
+ self.function_hashes = {} # (function: hash)
1362
+
1363
+ # avoid recursive spiral of doom (e.g., function calling an overload of itself)
1364
+ self.functions_in_progress = set()
1365
+
1366
+ # all unique kernels for codegen, filtered by hash
1367
+ self.unique_kernels = {} # (hash: kernel)
1368
+
1369
+ # start hashing the module
1370
+ ch = hashlib.sha256()
1371
+
1372
+ # hash all non-generic kernels
1373
+ for kernel in module.live_kernels:
1374
+ if kernel.is_generic:
1375
+ for ovl in kernel.overloads.values():
1376
+ if not ovl.adj.skip_build:
1377
+ ovl.hash = self.hash_kernel(ovl)
1378
+ else:
1379
+ if not kernel.adj.skip_build:
1380
+ kernel.hash = self.hash_kernel(kernel)
1381
+
1382
+ # include all unique kernels in the module hash
1383
+ for kernel_hash in sorted(self.unique_kernels.keys()):
1384
+ ch.update(kernel_hash)
1385
+
1386
+ # configuration parameters
1387
+ for opt in sorted(module.options.keys()):
1388
+ s = f"{opt}:{module.options[opt]}"
1389
+ ch.update(bytes(s, "utf-8"))
1390
+
1391
+ # ensure to trigger recompilation if flags affecting kernel compilation are changed
1392
+ if warp.config.verify_fp:
1393
+ ch.update(bytes("verify_fp", "utf-8"))
1394
+
1395
+ # build config
1396
+ ch.update(bytes(warp.config.mode, "utf-8"))
1397
+
1398
+ # save the module hash
1399
+ self.module_hash = ch.digest()
1400
+
1401
+ def hash_kernel(self, kernel):
1402
+ # NOTE: We only hash non-generic kernels, so we don't traverse kernel overloads here.
1403
+
1404
+ ch = hashlib.sha256()
1405
+
1406
+ ch.update(bytes(kernel.key, "utf-8"))
1407
+ ch.update(self.hash_adjoint(kernel.adj))
1408
+
1409
+ h = ch.digest()
1410
+
1411
+ self.unique_kernels[h] = kernel
1412
+
1413
+ return h
1414
+
1415
+ def hash_function(self, func):
1416
+ # NOTE: This method hashes all possible overloads that a function call could resolve to.
1417
+ # The exact overload will be resolved at build time, when the argument types are known.
1418
+
1419
+ h = self.function_hashes.get(func)
1420
+ if h is not None:
1421
+ return h
1422
+
1423
+ self.functions_in_progress.add(func)
1424
+
1425
+ ch = hashlib.sha256()
1426
+
1427
+ ch.update(bytes(func.key, "utf-8"))
1428
+
1429
+ # include all concrete and generic overloads
1430
+ overloads = {**func.user_overloads, **func.user_templates}
1431
+ for sig in sorted(overloads.keys()):
1432
+ ovl = overloads[sig]
1433
+
1434
+ # skip instantiations of generic functions
1435
+ if ovl.generic_parent is not None:
1436
+ continue
1437
+
1438
+ # adjoint
1439
+ ch.update(self.hash_adjoint(ovl.adj))
1440
+
1441
+ # custom bits
1442
+ if ovl.custom_grad_func:
1443
+ ch.update(self.hash_adjoint(ovl.custom_grad_func.adj))
1444
+ if ovl.custom_replay_func:
1445
+ ch.update(self.hash_adjoint(ovl.custom_replay_func.adj))
1446
+ if ovl.replay_snippet:
1447
+ ch.update(bytes(ovl.replay_snippet, "utf-8"))
1448
+ if ovl.native_snippet:
1449
+ ch.update(bytes(ovl.native_snippet, "utf-8"))
1450
+ if ovl.adj_native_snippet:
1451
+ ch.update(bytes(ovl.adj_native_snippet, "utf-8"))
1452
+
1453
+ h = ch.digest()
1454
+
1455
+ self.function_hashes[func] = h
1456
+
1457
+ self.functions_in_progress.remove(func)
1458
+
1459
+ return h
1460
+
1461
+ def hash_adjoint(self, adj):
1462
+ # NOTE: We don't cache adjoint hashes, because adjoints are always unique.
1463
+ # Even instances of generic kernels and functions have unique adjoints with
1464
+ # different argument types.
1465
+
1466
+ ch = hashlib.sha256()
1467
+
1468
+ # source
1469
+ ch.update(bytes(adj.source, "utf-8"))
1470
+
1471
+ # args
1472
+ for arg, arg_type in adj.arg_types.items():
1473
+ s = f"{arg}:{warp.types.get_type_code(arg_type)}"
1474
+ ch.update(bytes(s, "utf-8"))
1475
+
1476
+ # hash struct types
1477
+ if isinstance(arg_type, warp.codegen.Struct):
1478
+ ch.update(arg_type.hash)
1479
+ elif warp.types.is_array(arg_type) and isinstance(arg_type.dtype, warp.codegen.Struct):
1480
+ ch.update(arg_type.dtype.hash)
1481
+
1482
+ # find referenced constants, types, and functions
1483
+ constants, types, functions = adj.get_references()
1484
+
1485
+ # hash referenced constants
1486
+ for name, value in constants.items():
1487
+ ch.update(bytes(name, "utf-8"))
1488
+ ch.update(self.get_constant_bytes(value))
1489
+
1490
+ # hash wp.static() expressions that were evaluated at declaration time
1491
+ for k, v in adj.static_expressions.items():
1492
+ ch.update(bytes(k, "utf-8"))
1493
+ if isinstance(v, Function):
1494
+ if v not in self.functions_in_progress:
1495
+ ch.update(self.hash_function(v))
1496
+ else:
1497
+ ch.update(self.get_constant_bytes(v))
1498
+
1499
+ # hash referenced types
1500
+ for t in types.keys():
1501
+ ch.update(bytes(warp.types.get_type_code(t), "utf-8"))
1502
+
1503
+ # hash referenced functions
1504
+ for f in functions.keys():
1505
+ if f not in self.functions_in_progress:
1506
+ ch.update(self.hash_function(f))
1507
+
1508
+ return ch.digest()
1509
+
1510
+ def get_constant_bytes(self, value):
1511
+ if isinstance(value, int):
1512
+ # this also handles builtins.bool
1513
+ return bytes(ctypes.c_int(value))
1514
+ elif isinstance(value, float):
1515
+ return bytes(ctypes.c_float(value))
1516
+ elif isinstance(value, warp.types.float16):
1517
+ # float16 is a special case
1518
+ return bytes(ctypes.c_float(value.value))
1519
+ elif isinstance(value, tuple(warp.types.scalar_and_bool_types)):
1520
+ return bytes(value._type_(value.value))
1521
+ elif hasattr(value, "_wp_scalar_type_"):
1522
+ return bytes(value)
1523
+ elif isinstance(value, warp.codegen.StructInstance):
1524
+ return bytes(value._ctype)
1525
+ else:
1526
+ raise TypeError(f"Invalid constant type: {type(value)}")
1527
+
1528
+ def get_module_hash(self):
1529
+ return self.module_hash
1530
+
1531
+ def get_unique_kernels(self):
1532
+ return self.unique_kernels.values()
1533
+
1534
+
1292
1535
  class ModuleBuilder:
1293
- def __init__(self, module, options):
1536
+ def __init__(self, module, options, hasher=None):
1294
1537
  self.functions = {}
1295
1538
  self.structs = {}
1296
1539
  self.options = options
1297
1540
  self.module = module
1298
1541
  self.deferred_functions = []
1299
1542
 
1300
- # build all functions declared in the module
1301
- for func in module.functions.values():
1302
- for f in func.user_overloads.values():
1303
- self.build_function(f)
1304
- if f.custom_replay_func is not None:
1305
- self.build_function(f.custom_replay_func)
1306
-
1307
- # build all kernel entry points
1308
- for kernel in module.kernels.values():
1309
- if not kernel.is_generic:
1310
- self.build_kernel(kernel)
1311
- else:
1312
- for k in kernel.overloads.values():
1313
- self.build_kernel(k)
1543
+ if hasher is None:
1544
+ hasher = ModuleHasher(module)
1314
1545
 
1315
- # build all functions outside this module which are called from functions or kernels in this module
1546
+ # build all unique kernels
1547
+ self.kernels = hasher.get_unique_kernels()
1548
+ for kernel in self.kernels:
1549
+ self.build_kernel(kernel)
1550
+
1551
+ # build deferred functions
1316
1552
  for func in self.deferred_functions:
1317
1553
  self.build_function(func)
1318
1554
 
@@ -1328,7 +1564,7 @@ class ModuleBuilder:
1328
1564
  for var in s.vars.values():
1329
1565
  if isinstance(var.type, warp.codegen.Struct):
1330
1566
  stack.append(var.type)
1331
- elif isinstance(var.type, warp.types.array) and isinstance(var.type.dtype, warp.codegen.Struct):
1567
+ elif warp.types.is_array(var.type) and isinstance(var.type.dtype, warp.codegen.Struct):
1332
1568
  stack.append(var.type.dtype)
1333
1569
 
1334
1570
  # Build them in reverse to generate a correct dependency order.
@@ -1374,8 +1610,12 @@ class ModuleBuilder:
1374
1610
  source = ""
1375
1611
 
1376
1612
  # code-gen structs
1613
+ visited_structs = set()
1377
1614
  for struct in self.structs.keys():
1378
- source += warp.codegen.codegen_struct(struct)
1615
+ # avoid emitting duplicates
1616
+ if struct.hash not in visited_structs:
1617
+ source += warp.codegen.codegen_struct(struct)
1618
+ visited_structs.add(struct.hash)
1379
1619
 
1380
1620
  # code-gen all imported functions
1381
1621
  for func in self.functions.keys():
@@ -1386,21 +1626,15 @@ class ModuleBuilder:
1386
1626
  else:
1387
1627
  source += warp.codegen.codegen_snippet(
1388
1628
  func.adj,
1389
- name=func.key,
1629
+ name=func.native_func,
1390
1630
  snippet=func.native_snippet,
1391
1631
  adj_snippet=func.adj_native_snippet,
1392
1632
  replay_snippet=func.replay_snippet,
1393
1633
  )
1394
1634
 
1395
- for kernel in self.module.kernels.values():
1396
- # each kernel gets an entry point in the module
1397
- if not kernel.is_generic:
1398
- source += warp.codegen.codegen_kernel(kernel, device=device, options=self.options)
1399
- source += warp.codegen.codegen_module(kernel, device=device)
1400
- else:
1401
- for k in kernel.overloads.values():
1402
- source += warp.codegen.codegen_kernel(k, device=device, options=self.options)
1403
- source += warp.codegen.codegen_module(k, device=device)
1635
+ for kernel in self.kernels:
1636
+ source += warp.codegen.codegen_kernel(kernel, device=device, options=self.options)
1637
+ source += warp.codegen.codegen_module(kernel, device=device)
1404
1638
 
1405
1639
  # add headers
1406
1640
  if device == "cpu":
@@ -1425,8 +1659,9 @@ class ModuleExec:
1425
1659
  instance.handle = None
1426
1660
  return instance
1427
1661
 
1428
- def __init__(self, handle, device):
1662
+ def __init__(self, handle, module_hash, device):
1429
1663
  self.handle = handle
1664
+ self.module_hash = module_hash
1430
1665
  self.device = device
1431
1666
  self.kernel_hooks = {}
1432
1667
 
@@ -1457,8 +1692,12 @@ class ModuleExec:
1457
1692
  )
1458
1693
  else:
1459
1694
  func = ctypes.CFUNCTYPE(None)
1460
- forward = func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_forward").encode("utf-8")))
1461
- backward = func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8")))
1695
+ forward = (
1696
+ func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_forward").encode("utf-8"))) or None
1697
+ )
1698
+ backward = (
1699
+ func(runtime.llvm.lookup(self.handle.encode("utf-8"), (name + "_cpu_backward").encode("utf-8"))) or None
1700
+ )
1462
1701
 
1463
1702
  hooks = KernelHooks(forward, backward)
1464
1703
  self.kernel_hooks[kernel] = hooks
@@ -1475,16 +1714,33 @@ class Module:
1475
1714
  self.name = name
1476
1715
  self.loader = loader
1477
1716
 
1478
- self.kernels = {}
1479
- self.functions = {}
1480
- self.constants = {} # Any constants referenced in this module including those defined in other modules
1481
- self.structs = {}
1717
+ # lookup the latest versions of kernels, functions, and structs by key
1718
+ self.kernels = {} # (key: kernel)
1719
+ self.functions = {} # (key: function)
1720
+ self.structs = {} # (key: struct)
1721
+
1722
+ # Set of all "live" kernels in this module.
1723
+ # The difference between `live_kernels` and `kernels` is that `live_kernels` may contain
1724
+ # multiple kernels with the same key (which is essential to support closures), while `kernels`
1725
+ # only holds the latest kernel for each key. When the module is built, we compute the hash
1726
+ # of each kernel in `live_kernels` and filter out duplicates for codegen.
1727
+ self.live_kernels = weakref.WeakSet()
1482
1728
 
1483
- self.cpu_exec = None # executable CPU module
1484
- self.cuda_execs = {} # executable CUDA module lookup by CUDA context
1729
+ # executable modules currently loaded
1730
+ self.execs = {} # (device.context: ModuleExec)
1485
1731
 
1486
- self.cpu_build_failed = False
1487
- self.cuda_build_failed = False
1732
+ # set of device contexts where the build has failed
1733
+ self.failed_builds = set()
1734
+
1735
+ # hash data, including the module hash
1736
+ self.hasher = None
1737
+
1738
+ # LLVM executable modules are identified using strings. Since it's possible for multiple
1739
+ # executable versions to be loaded at the same time, we need a way to ensure uniqueness.
1740
+ # A unique handle is created from the module name and this auto-incremented integer id.
1741
+ # NOTE: The module hash is not sufficient for uniqueness in rare cases where a module
1742
+ # is retained and later reloaded with the same hash.
1743
+ self.cpu_exec_id = 0
1488
1744
 
1489
1745
  self.options = {
1490
1746
  "max_unroll": warp.config.max_unroll,
@@ -1497,11 +1753,6 @@ class Module:
1497
1753
  # Module dependencies are determined by scanning each function
1498
1754
  # and kernel for references to external functions and structs.
1499
1755
  #
1500
- # When a referenced module is modified, all of its dependents need to be reloaded
1501
- # on the next launch. To detect this, a module's hash recursively includes
1502
- # all of its references.
1503
- # -> See ``Module.hash_module()``
1504
- #
1505
1756
  # The dependency mechanism works for both static and dynamic (runtime) modifications.
1506
1757
  # When a module is reloaded at runtime, we recursively unload all of its
1507
1758
  # dependents, so that they will be re-hashed and reloaded on the next launch.
@@ -1510,40 +1761,39 @@ class Module:
1510
1761
  self.references = set() # modules whose content we depend on
1511
1762
  self.dependents = set() # modules that depend on our content
1512
1763
 
1513
- # Since module hashing is recursive, we improve performance by caching the hash of the
1514
- # module contents (kernel source, function source, and struct source).
1515
- # After all kernels, functions, and structs are added to the module (usually at import time),
1516
- # the content hash doesn't change.
1517
- # -> See ``Module.hash_module_recursive()``
1518
-
1519
- self.content_hash = None
1520
-
1521
- # number of times module auto-generates kernel key for user
1522
- # used to ensure unique kernel keys
1523
- self.count = 0
1524
-
1525
1764
  def register_struct(self, struct):
1526
1765
  self.structs[struct.key] = struct
1527
1766
 
1528
1767
  # for a reload of module on next launch
1529
- self.unload()
1768
+ self.mark_modified()
1530
1769
 
1531
1770
  def register_kernel(self, kernel):
1771
+ # keep a reference to the latest version
1532
1772
  self.kernels[kernel.key] = kernel
1533
1773
 
1774
+ # track all kernel objects, even if they are duplicates
1775
+ self.live_kernels.add(kernel)
1776
+
1534
1777
  self.find_references(kernel.adj)
1535
1778
 
1536
1779
  # for a reload of module on next launch
1537
- self.unload()
1780
+ self.mark_modified()
1538
1781
 
1539
- def register_function(self, func, skip_adding_overload=False):
1540
- if func.key not in self.functions:
1541
- self.functions[func.key] = func
1782
+ def register_function(self, func, scope_locals, skip_adding_overload=False):
1783
+ # check for another Function with the same name in the same scope
1784
+ obj = scope_locals.get(func.func.__name__)
1785
+ if isinstance(obj, Function):
1786
+ func_existing = obj
1542
1787
  else:
1788
+ func_existing = None
1789
+
1790
+ # keep a reference to the latest version
1791
+ self.functions[func.key] = func_existing or func
1792
+
1793
+ if func_existing:
1543
1794
  # Check whether the new function's signature match any that has
1544
1795
  # already been registered. If so, then we simply override it, as
1545
1796
  # Python would do it, otherwise we register it as a new overload.
1546
- func_existing = self.functions[func.key]
1547
1797
  sig = warp.types.get_signature(
1548
1798
  func.input_types.values(),
1549
1799
  func_name=func.key,
@@ -1555,19 +1805,43 @@ class Module:
1555
1805
  arg_names=list(func_existing.input_types.keys()),
1556
1806
  )
1557
1807
  if sig == sig_existing:
1808
+ # replace the top-level function, but keep existing overloads
1809
+
1810
+ # copy generic overloads
1811
+ func.user_templates = func_existing.user_templates.copy()
1812
+
1813
+ # copy concrete overloads
1814
+ if warp.types.is_generic_signature(sig):
1815
+ # skip overloads that were instantiated from the function being replaced
1816
+ for k, v in func_existing.user_overloads.items():
1817
+ if v.generic_parent != func_existing:
1818
+ func.user_overloads[k] = v
1819
+ func.user_templates[sig] = func
1820
+ else:
1821
+ func.user_overloads = func_existing.user_overloads.copy()
1822
+ func.user_overloads[sig] = func
1823
+
1558
1824
  self.functions[func.key] = func
1559
1825
  elif not skip_adding_overload:
1826
+ # check if this is a generic overload that replaces an existing one
1827
+ if warp.types.is_generic_signature(sig):
1828
+ old_generic = func_existing.user_templates.get(sig)
1829
+ if old_generic is not None:
1830
+ # purge any concrete overloads that were instantiated from the old one
1831
+ for k, v in list(func_existing.user_overloads.items()):
1832
+ if v.generic_parent == old_generic:
1833
+ del func_existing.user_overloads[k]
1560
1834
  func_existing.add_overload(func)
1561
1835
 
1562
1836
  self.find_references(func.adj)
1563
1837
 
1564
1838
  # for a reload of module on next launch
1565
- self.unload()
1839
+ self.mark_modified()
1566
1840
 
1567
- def generate_unique_kernel_key(self, key):
1568
- unique_key = f"{key}_{self.count}"
1569
- self.count += 1
1570
- return unique_key
1841
+ # find kernel corresponding to a Python function
1842
+ def find_kernel(self, func):
1843
+ qualname = warp.codegen.make_full_qualified_name(func)
1844
+ return self.kernels.get(qualname)
1571
1845
 
1572
1846
  # collect all referenced functions / structs
1573
1847
  # given the AST of a function or kernel
@@ -1599,165 +1873,30 @@ class Module:
1599
1873
  if isinstance(arg.type, warp.codegen.Struct) and arg.type.module is not None:
1600
1874
  add_ref(arg.type.module)
1601
1875
 
1602
- def hash_module(self, recompute_content_hash=False):
1603
- """Recursively compute and return a hash for the module.
1604
-
1605
- If ``recompute_content_hash`` is False, each module's previously
1606
- computed ``content_hash`` will be used.
1607
- """
1608
-
1609
- def get_type_name(type_hint) -> str:
1610
- if isinstance(type_hint, warp.codegen.Struct):
1611
- return get_type_name(type_hint.cls)
1612
- elif isinstance(type_hint, warp.array) and isinstance(type_hint.dtype, warp.codegen.Struct):
1613
- return f"array{get_type_name(type_hint.dtype)}"
1614
-
1615
- return str(type_hint)
1616
-
1617
- def hash_recursive(module, visited):
1618
- # Hash this module, including all referenced modules recursively.
1619
- # The visited set tracks modules already visited to avoid circular references.
1620
-
1621
- # check if we need to update the content hash
1622
- if not module.content_hash or recompute_content_hash:
1623
- # recompute content hash
1624
- ch = hashlib.sha256()
1625
-
1626
- # Start with an empty constants dictionary in case any have been removed
1627
- module.constants = {}
1628
-
1629
- # struct source
1630
- for struct in module.structs.values():
1631
- s = ",".join(
1632
- "{}: {}".format(name, get_type_name(type_hint))
1633
- for name, type_hint in warp.codegen.get_annotations(struct.cls).items()
1634
- )
1635
- ch.update(bytes(s, "utf-8"))
1636
-
1637
- # functions source
1638
- for function in module.functions.values():
1639
- # include all concrete and generic overloads
1640
- overloads = itertools.chain(function.user_overloads.items(), function.user_templates.items())
1641
- for sig, func in overloads:
1642
- # signature
1643
- ch.update(bytes(sig, "utf-8"))
1644
-
1645
- # source
1646
- ch.update(bytes(func.adj.source, "utf-8"))
1647
-
1648
- if func.custom_grad_func:
1649
- ch.update(bytes(func.custom_grad_func.adj.source, "utf-8"))
1650
- if func.custom_replay_func:
1651
- ch.update(bytes(func.custom_replay_func.adj.source, "utf-8"))
1652
- if func.replay_snippet:
1653
- ch.update(bytes(func.replay_snippet, "utf-8"))
1654
- if func.native_snippet:
1655
- ch.update(bytes(func.native_snippet, "utf-8"))
1656
- if func.adj_native_snippet:
1657
- ch.update(bytes(func.adj_native_snippet, "utf-8"))
1658
-
1659
- # Populate constants referenced in this function
1660
- if func.adj:
1661
- module.constants.update(func.adj.get_constant_references())
1662
-
1663
- # kernel source
1664
- for kernel in module.kernels.values():
1665
- ch.update(bytes(kernel.key, "utf-8"))
1666
- ch.update(bytes(kernel.adj.source, "utf-8"))
1667
- # cache kernel arg types
1668
- for arg, arg_type in kernel.adj.arg_types.items():
1669
- s = f"{arg}: {get_type_name(arg_type)}"
1670
- ch.update(bytes(s, "utf-8"))
1671
- # for generic kernels the Python source is always the same,
1672
- # but we hash the type signatures of all the overloads
1673
- if kernel.is_generic:
1674
- for sig in sorted(kernel.overloads.keys()):
1675
- ch.update(bytes(sig, "utf-8"))
1676
-
1677
- # Populate constants referenced in this kernel
1678
- module.constants.update(kernel.adj.get_constant_references())
1679
-
1680
- # constants referenced in this module
1681
- for constant_name, constant_value in module.constants.items():
1682
- ch.update(bytes(constant_name, "utf-8"))
1683
-
1684
- # hash the constant value
1685
- if isinstance(constant_value, builtins.bool):
1686
- # This needs to come before the check for `int` since all boolean
1687
- # values are also instances of `int`.
1688
- ch.update(struct_pack("?", constant_value))
1689
- elif isinstance(constant_value, int):
1690
- ch.update(struct_pack("<q", constant_value))
1691
- elif isinstance(constant_value, float):
1692
- ch.update(struct_pack("<d", constant_value))
1693
- elif isinstance(constant_value, warp.types.float16):
1694
- # float16 is a special case
1695
- p = ctypes.pointer(ctypes.c_float(constant_value.value))
1696
- ch.update(p.contents)
1697
- elif isinstance(constant_value, tuple(warp.types.scalar_types)):
1698
- p = ctypes.pointer(constant_value._type_(constant_value.value))
1699
- ch.update(p.contents)
1700
- elif isinstance(constant_value, ctypes.Array):
1701
- ch.update(bytes(constant_value))
1702
- else:
1703
- raise RuntimeError(f"Invalid constant type: {type(constant_value)}")
1704
-
1705
- module.content_hash = ch.digest()
1706
-
1707
- h = hashlib.sha256()
1708
-
1709
- # content hash
1710
- h.update(module.content_hash)
1711
-
1712
- # configuration parameters
1713
- for k in sorted(module.options.keys()):
1714
- s = f"{k}={module.options[k]}"
1715
- h.update(bytes(s, "utf-8"))
1716
-
1717
- # ensure to trigger recompilation if flags affecting kernel compilation are changed
1718
- if warp.config.verify_fp:
1719
- h.update(bytes("verify_fp", "utf-8"))
1720
-
1721
- h.update(bytes(warp.config.mode, "utf-8"))
1722
-
1723
- # recurse on references
1724
- visited.add(module)
1725
-
1726
- sorted_deps = sorted(module.references, key=lambda m: m.name)
1727
- for dep in sorted_deps:
1728
- if dep not in visited:
1729
- dep_hash = hash_recursive(dep, visited)
1730
- h.update(dep_hash)
1731
-
1732
- return h.digest()
1733
-
1734
- return hash_recursive(self, visited=set())
1876
+ def hash_module(self):
1877
+ # compute latest hash
1878
+ self.hasher = ModuleHasher(self)
1879
+ return self.hasher.get_module_hash()
1735
1880
 
1736
1881
  def load(self, device) -> ModuleExec:
1737
1882
  device = runtime.get_device(device)
1738
1883
 
1739
- if device.is_cpu:
1740
- # check if already loaded
1741
- if self.cpu_exec:
1742
- return self.cpu_exec
1743
- # avoid repeated build attempts
1744
- if self.cpu_build_failed:
1745
- return None
1746
- if not warp.is_cpu_available():
1747
- raise RuntimeError("Failed to build CPU module because no CPU buildchain was found")
1748
- else:
1749
- # check if already loaded
1750
- cuda_exec = self.cuda_execs.get(device.context)
1751
- if cuda_exec is not None:
1752
- return cuda_exec
1753
- # avoid repeated build attempts
1754
- if self.cuda_build_failed:
1755
- return None
1756
- if not warp.is_cuda_available():
1757
- raise RuntimeError("Failed to build CUDA module because CUDA is not available")
1884
+ # compute the hash if needed
1885
+ if self.hasher is None:
1886
+ self.hasher = ModuleHasher(self)
1887
+
1888
+ # check if executable module is already loaded and not stale
1889
+ exec = self.execs.get(device.context)
1890
+ if exec is not None:
1891
+ if exec.module_hash == self.hasher.module_hash:
1892
+ return exec
1893
+
1894
+ # quietly avoid repeated build attempts to reduce error spew
1895
+ if device.context in self.failed_builds:
1896
+ return None
1758
1897
 
1759
1898
  module_name = "wp_" + self.name
1760
- module_hash = self.hash_module()
1899
+ module_hash = self.hasher.module_hash
1761
1900
 
1762
1901
  # use a unique module path using the module short hash
1763
1902
  module_dir = os.path.join(warp.config.kernel_cache_dir, f"{module_name}_{module_hash.hex()[:7]}")
@@ -1807,7 +1946,7 @@ class Module:
1807
1946
  or not warp.config.cache_kernels
1808
1947
  or warp.config.verify_autograd_array_access
1809
1948
  ):
1810
- builder = ModuleBuilder(self, self.options)
1949
+ builder = ModuleBuilder(self, self.options, hasher=self.hasher)
1811
1950
 
1812
1951
  # create a temporary (process unique) dir for build outputs before moving to the binary dir
1813
1952
  build_dir = os.path.join(
@@ -1844,7 +1983,7 @@ class Module:
1844
1983
  )
1845
1984
 
1846
1985
  except Exception as e:
1847
- self.cpu_build_failed = True
1986
+ self.failed_builds.add(None)
1848
1987
  module_load_timer.extra_msg = " (error)"
1849
1988
  raise (e)
1850
1989
 
@@ -1873,7 +2012,7 @@ class Module:
1873
2012
  )
1874
2013
 
1875
2014
  except Exception as e:
1876
- self.cuda_build_failed = True
2015
+ self.failed_builds.add(device.context)
1877
2016
  module_load_timer.extra_msg = " (error)"
1878
2017
  raise (e)
1879
2018
 
@@ -1914,15 +2053,18 @@ class Module:
1914
2053
  # -----------------------------------------------------------
1915
2054
  # Load CPU or CUDA binary
1916
2055
  if device.is_cpu:
1917
- runtime.llvm.load_obj(binary_path.encode("utf-8"), module_name.encode("utf-8"))
1918
- module_exec = ModuleExec(module_name, device)
1919
- self.cpu_exec = module_exec
2056
+ # LLVM modules are identified using strings, so we need to ensure uniqueness
2057
+ module_handle = f"{module_name}_{self.cpu_exec_id}"
2058
+ self.cpu_exec_id += 1
2059
+ runtime.llvm.load_obj(binary_path.encode("utf-8"), module_handle.encode("utf-8"))
2060
+ module_exec = ModuleExec(module_handle, module_hash, device)
2061
+ self.execs[None] = module_exec
1920
2062
 
1921
2063
  elif device.is_cuda:
1922
2064
  cuda_module = warp.build.load_cuda(binary_path, device)
1923
2065
  if cuda_module is not None:
1924
- module_exec = ModuleExec(cuda_module, device)
1925
- self.cuda_execs[device.context] = module_exec
2066
+ module_exec = ModuleExec(cuda_module, module_hash, device)
2067
+ self.execs[device.context] = module_exec
1926
2068
  else:
1927
2069
  module_load_timer.extra_msg = " (error)"
1928
2070
  raise Exception(f"Failed to load CUDA module '{self.name}'")
@@ -1936,20 +2078,22 @@ class Module:
1936
2078
  return module_exec
1937
2079
 
1938
2080
  def unload(self):
2081
+ # force rehashing on next load
2082
+ self.mark_modified()
2083
+
1939
2084
  # clear loaded modules
1940
- self.cpu_exec = None
1941
- self.cuda_execs = {}
2085
+ self.execs = {}
1942
2086
 
1943
- # clear content hash
1944
- self.content_hash = None
2087
+ def mark_modified(self):
2088
+ # clear hash data
2089
+ self.hasher = None
2090
+
2091
+ # clear build failures
2092
+ self.failed_builds = set()
1945
2093
 
1946
2094
  # lookup kernel entry points based on name, called after compilation / module load
1947
2095
  def get_kernel_hooks(self, kernel, device):
1948
- if device.is_cuda:
1949
- module_exec = self.cuda_execs.get(device.context)
1950
- else:
1951
- module_exec = self.cpu_exec
1952
-
2096
+ module_exec = self.execs.get(device.context)
1953
2097
  if module_exec is not None:
1954
2098
  return module_exec.get_kernel_hooks(kernel)
1955
2099
  else:
@@ -2056,6 +2200,63 @@ class ContextGuard:
2056
2200
  runtime.core.cuda_context_set_current(self.saved_context)
2057
2201
 
2058
2202
 
2203
+ class Event:
2204
+ """A CUDA event that can be recorded onto a stream.
2205
+
2206
+ Events can be used for device-side synchronization, which do not block
2207
+ the host thread.
2208
+ """
2209
+
2210
+ # event creation flags
2211
+ class Flags:
2212
+ DEFAULT = 0x0
2213
+ BLOCKING_SYNC = 0x1
2214
+ DISABLE_TIMING = 0x2
2215
+
2216
+ def __new__(cls, *args, **kwargs):
2217
+ """Creates a new event instance."""
2218
+ instance = super(Event, cls).__new__(cls)
2219
+ instance.owner = False
2220
+ return instance
2221
+
2222
+ def __init__(self, device: "Devicelike" = None, cuda_event=None, enable_timing: bool = False):
2223
+ """Initializes the event on a CUDA device.
2224
+
2225
+ Args:
2226
+ device: The CUDA device whose streams this event may be recorded onto.
2227
+ If ``None``, then the current default device will be used.
2228
+ cuda_event: A pointer to a previously allocated CUDA event. If
2229
+ `None`, then a new event will be allocated on the associated device.
2230
+ enable_timing: If ``True`` this event will record timing data.
2231
+ :func:`~warp.get_event_elapsed_time` can be used to measure the
2232
+ time between two events created with ``enable_timing=True`` and
2233
+ recorded onto streams.
2234
+ """
2235
+
2236
+ device = get_device(device)
2237
+ if not device.is_cuda:
2238
+ raise RuntimeError(f"Device {device} is not a CUDA device")
2239
+
2240
+ self.device = device
2241
+
2242
+ if cuda_event is not None:
2243
+ self.cuda_event = cuda_event
2244
+ else:
2245
+ flags = Event.Flags.DEFAULT
2246
+ if not enable_timing:
2247
+ flags |= Event.Flags.DISABLE_TIMING
2248
+ self.cuda_event = runtime.core.cuda_event_create(device.context, flags)
2249
+ if not self.cuda_event:
2250
+ raise RuntimeError(f"Failed to create event on device {device}")
2251
+ self.owner = True
2252
+
2253
+ def __del__(self):
2254
+ if not self.owner:
2255
+ return
2256
+
2257
+ runtime.core.cuda_event_destroy(self.cuda_event)
2258
+
2259
+
2059
2260
  class Stream:
2060
2261
  def __new__(cls, *args, **kwargs):
2061
2262
  instance = super(Stream, cls).__new__(cls)
@@ -2063,7 +2264,27 @@ class Stream:
2063
2264
  instance.owner = False
2064
2265
  return instance
2065
2266
 
2066
- def __init__(self, device=None, **kwargs):
2267
+ def __init__(self, device: Optional[Union["Device", str]] = None, priority: int = 0, **kwargs):
2268
+ """Initialize the stream on a device with an optional specified priority.
2269
+
2270
+ Args:
2271
+ device: The CUDA device on which this stream will be created.
2272
+ priority: An optional integer specifying the requested stream priority.
2273
+ Can be -1 (high priority) or 0 (low/default priority).
2274
+ Values outside this range will be clamped.
2275
+ cuda_stream (int): A optional external stream handle passed as an
2276
+ integer. The caller is responsible for ensuring that the external
2277
+ stream does not get destroyed while it is referenced by this
2278
+ object.
2279
+
2280
+ Raises:
2281
+ RuntimeError: If function is called before Warp has completed
2282
+ initialization with a ``device`` that is not an instance of
2283
+ :class:`Device``.
2284
+ RuntimeError: ``device`` is not a CUDA Device.
2285
+ RuntimeError: The stream could not be created on the device.
2286
+ TypeError: The requested stream priority is not an integer.
2287
+ """
2067
2288
  # event used internally for synchronization (cached to avoid creating temporary events)
2068
2289
  self._cached_event = None
2069
2290
 
@@ -2072,7 +2293,7 @@ class Stream:
2072
2293
  device = runtime.get_device(device)
2073
2294
  elif not isinstance(device, Device):
2074
2295
  raise RuntimeError(
2075
- "A device object is required when creating a stream before or during Warp initialization"
2296
+ "A Device object is required when creating a stream before or during Warp initialization"
2076
2297
  )
2077
2298
 
2078
2299
  if not device.is_cuda:
@@ -2085,7 +2306,11 @@ class Stream:
2085
2306
  self.cuda_stream = kwargs["cuda_stream"]
2086
2307
  device.runtime.core.cuda_stream_register(device.context, self.cuda_stream)
2087
2308
  else:
2088
- self.cuda_stream = device.runtime.core.cuda_stream_create(device.context)
2309
+ if not isinstance(priority, int):
2310
+ raise TypeError("Stream priority must be an integer.")
2311
+ clamped_priority = max(-1, min(priority, 0)) # Only support two priority levels
2312
+ self.cuda_stream = device.runtime.core.cuda_stream_create(device.context, clamped_priority)
2313
+
2089
2314
  if not self.cuda_stream:
2090
2315
  raise RuntimeError(f"Failed to create stream on device {device}")
2091
2316
  self.owner = True
@@ -2100,12 +2325,22 @@ class Stream:
2100
2325
  runtime.core.cuda_stream_unregister(self.device.context, self.cuda_stream)
2101
2326
 
2102
2327
  @property
2103
- def cached_event(self):
2328
+ def cached_event(self) -> Event:
2104
2329
  if self._cached_event is None:
2105
2330
  self._cached_event = Event(self.device)
2106
2331
  return self._cached_event
2107
2332
 
2108
- def record_event(self, event=None):
2333
+ def record_event(self, event: Optional[Event] = None) -> Event:
2334
+ """Record an event onto the stream.
2335
+
2336
+ Args:
2337
+ event: A warp.Event instance to be recorded onto the stream. If not
2338
+ provided, an :class:`~warp.Event` on the same device will be created.
2339
+
2340
+ Raises:
2341
+ RuntimeError: The provided :class:`~warp.Event` is from a different device than
2342
+ the recording stream.
2343
+ """
2109
2344
  if event is None:
2110
2345
  event = Event(self.device)
2111
2346
  elif event.device != self.device:
@@ -2117,56 +2352,45 @@ class Stream:
2117
2352
 
2118
2353
  return event
2119
2354
 
2120
- def wait_event(self, event):
2355
+ def wait_event(self, event: Event):
2356
+ """Makes all future work in this stream wait until `event` has completed.
2357
+
2358
+ This function does not block the host thread.
2359
+ """
2121
2360
  runtime.core.cuda_stream_wait_event(self.cuda_stream, event.cuda_event)
2122
2361
 
2123
- def wait_stream(self, other_stream, event=None):
2362
+ def wait_stream(self, other_stream: "Stream", event: Optional[Event] = None):
2363
+ """Records an event on `other_stream` and makes this stream wait on it.
2364
+
2365
+ All work added to this stream after this function has been called will
2366
+ delay their execution until all preceding commands in `other_stream`
2367
+ have completed.
2368
+
2369
+ This function does not block the host thread.
2370
+
2371
+ Args:
2372
+ other_stream: The stream on which the calling stream will wait for
2373
+ previously issued commands to complete before executing subsequent
2374
+ commands.
2375
+ event: An optional :class:`Event` instance that will be used to
2376
+ record an event onto ``other_stream``. If ``None``, an internally
2377
+ managed :class:`Event` instance will be used.
2378
+ """
2379
+
2124
2380
  if event is None:
2125
2381
  event = other_stream.cached_event
2126
2382
 
2127
2383
  runtime.core.cuda_stream_wait_stream(self.cuda_stream, other_stream.cuda_stream, event.cuda_event)
2128
2384
 
2129
- # whether a graph capture is currently ongoing on this stream
2130
2385
  @property
2131
- def is_capturing(self):
2386
+ def is_capturing(self) -> bool:
2387
+ """A boolean indicating whether a graph capture is currently ongoing on this stream."""
2132
2388
  return bool(runtime.core.cuda_stream_is_capturing(self.cuda_stream))
2133
2389
 
2134
-
2135
- class Event:
2136
- # event creation flags
2137
- class Flags:
2138
- DEFAULT = 0x0
2139
- BLOCKING_SYNC = 0x1
2140
- DISABLE_TIMING = 0x2
2141
-
2142
- def __new__(cls, *args, **kwargs):
2143
- instance = super(Event, cls).__new__(cls)
2144
- instance.owner = False
2145
- return instance
2146
-
2147
- def __init__(self, device=None, cuda_event=None, enable_timing=False):
2148
- device = get_device(device)
2149
- if not device.is_cuda:
2150
- raise RuntimeError(f"Device {device} is not a CUDA device")
2151
-
2152
- self.device = device
2153
-
2154
- if cuda_event is not None:
2155
- self.cuda_event = cuda_event
2156
- else:
2157
- flags = Event.Flags.DEFAULT
2158
- if not enable_timing:
2159
- flags |= Event.Flags.DISABLE_TIMING
2160
- self.cuda_event = runtime.core.cuda_event_create(device.context, flags)
2161
- if not self.cuda_event:
2162
- raise RuntimeError(f"Failed to create event on device {device}")
2163
- self.owner = True
2164
-
2165
- def __del__(self):
2166
- if not self.owner:
2167
- return
2168
-
2169
- runtime.core.cuda_event_destroy(self.cuda_event)
2390
+ @property
2391
+ def priority(self) -> int:
2392
+ """An integer representing the priority of the stream."""
2393
+ return runtime.core.cuda_stream_get_priority(self.cuda_stream)
2170
2394
 
2171
2395
 
2172
2396
  class Device:
@@ -2178,14 +2402,14 @@ class Device:
2178
2402
  or ``"CPU"`` if the processor name cannot be determined.
2179
2403
  arch: An integer representing the compute capability version number calculated as
2180
2404
  ``10 * major + minor``. ``0`` for CPU devices.
2181
- is_uva: A boolean indicating whether or not the device supports unified addressing.
2405
+ is_uva: A boolean indicating whether the device supports unified addressing.
2182
2406
  ``False`` for CPU devices.
2183
- is_cubin_supported: A boolean indicating whether or not Warp's version of NVRTC can directly
2407
+ is_cubin_supported: A boolean indicating whether Warp's version of NVRTC can directly
2184
2408
  generate CUDA binary files (cubin) for this device's architecture. ``False`` for CPU devices.
2185
- is_mempool_supported: A boolean indicating whether or not the device supports using the
2409
+ is_mempool_supported: A boolean indicating whether the device supports using the
2186
2410
  ``cuMemAllocAsync`` and ``cuMemPool`` family of APIs for stream-ordered memory allocations. ``False`` for
2187
2411
  CPU devices.
2188
- is_primary: A boolean indicating whether or not this device's CUDA context is also the
2412
+ is_primary: A boolean indicating whether this device's CUDA context is also the
2189
2413
  device's primary context.
2190
2414
  uuid: A string representing the UUID of the CUDA device. The UUID is in the same format used by
2191
2415
  ``nvidia-smi -L``. ``None`` for CPU devices.
@@ -2274,7 +2498,7 @@ class Device:
2274
2498
 
2275
2499
  # initialize streams unless context acquisition is postponed
2276
2500
  if self._context is not None:
2277
- self.init_streams()
2501
+ self._init_streams()
2278
2502
 
2279
2503
  # TODO: add more device-specific dispatch functions
2280
2504
  self.memset = lambda ptr, value, size: runtime.core.memset_device(self.context, ptr, value, size)
@@ -2285,7 +2509,13 @@ class Device:
2285
2509
  else:
2286
2510
  raise RuntimeError(f"Invalid device ordinal ({ordinal})'")
2287
2511
 
2288
- def get_allocator(self, pinned=False):
2512
+ def get_allocator(self, pinned: bool = False):
2513
+ """Get the memory allocator for this device.
2514
+
2515
+ Args:
2516
+ pinned: If ``True``, an allocator for pinned memory will be
2517
+ returned. Only applicable when this device is a CPU device.
2518
+ """
2289
2519
  if self.is_cuda:
2290
2520
  return self.current_allocator
2291
2521
  else:
@@ -2294,7 +2524,8 @@ class Device:
2294
2524
  else:
2295
2525
  return self.default_allocator
2296
2526
 
2297
- def init_streams(self):
2527
+ def _init_streams(self):
2528
+ """Initializes the device's current stream and the device's null stream."""
2298
2529
  # create a stream for asynchronous work
2299
2530
  self.set_stream(Stream(self))
2300
2531
 
@@ -2302,17 +2533,18 @@ class Device:
2302
2533
  self.null_stream = Stream(self, cuda_stream=None)
2303
2534
 
2304
2535
  @property
2305
- def is_cpu(self):
2306
- """A boolean indicating whether or not the device is a CPU device."""
2536
+ def is_cpu(self) -> bool:
2537
+ """A boolean indicating whether the device is a CPU device."""
2307
2538
  return self.ordinal < 0
2308
2539
 
2309
2540
  @property
2310
- def is_cuda(self):
2311
- """A boolean indicating whether or not the device is a CUDA device."""
2541
+ def is_cuda(self) -> bool:
2542
+ """A boolean indicating whether the device is a CUDA device."""
2312
2543
  return self.ordinal >= 0
2313
2544
 
2314
2545
  @property
2315
- def is_capturing(self):
2546
+ def is_capturing(self) -> bool:
2547
+ """A boolean indicating whether this device's default stream is currently capturing a graph."""
2316
2548
  if self.is_cuda and self.stream is not None:
2317
2549
  # There is no CUDA API to check if graph capture was started on a device, so we
2318
2550
  # can't tell if a capture was started by external code on a different stream.
@@ -2336,17 +2568,17 @@ class Device:
2336
2568
  raise RuntimeError(f"Failed to acquire primary context for device {self}")
2337
2569
  self.runtime.context_map[self._context] = self
2338
2570
  # initialize streams
2339
- self.init_streams()
2571
+ self._init_streams()
2340
2572
  runtime.core.cuda_context_set_current(prev_context)
2341
2573
  return self._context
2342
2574
 
2343
2575
  @property
2344
- def has_context(self):
2345
- """A boolean indicating whether or not the device has a CUDA context associated with it."""
2576
+ def has_context(self) -> bool:
2577
+ """A boolean indicating whether the device has a CUDA context associated with it."""
2346
2578
  return self._context is not None
2347
2579
 
2348
2580
  @property
2349
- def stream(self):
2581
+ def stream(self) -> Stream:
2350
2582
  """The stream associated with a CUDA device.
2351
2583
 
2352
2584
  Raises:
@@ -2361,7 +2593,22 @@ class Device:
2361
2593
  def stream(self, stream):
2362
2594
  self.set_stream(stream)
2363
2595
 
2364
- def set_stream(self, stream, sync=True):
2596
+ def set_stream(self, stream: Stream, sync: bool = True) -> None:
2597
+ """Set the current stream for this CUDA device.
2598
+
2599
+ The current stream will be used by default for all kernel launches and
2600
+ memory operations on this device.
2601
+
2602
+ If this is an external stream, the caller is responsible for
2603
+ guaranteeing the lifetime of the stream.
2604
+
2605
+ Consider using :class:`warp.ScopedStream` instead.
2606
+
2607
+ Args:
2608
+ stream: The stream to set as this device's current stream.
2609
+ sync: If ``True``, then ``stream`` will perform a device-side
2610
+ synchronization with the device's previous current stream.
2611
+ """
2365
2612
  if self.is_cuda:
2366
2613
  if stream.device != self:
2367
2614
  raise RuntimeError(f"Stream from device {stream.device} cannot be used on device {self}")
@@ -2372,12 +2619,12 @@ class Device:
2372
2619
  raise RuntimeError(f"Device {self} is not a CUDA device")
2373
2620
 
2374
2621
  @property
2375
- def has_stream(self):
2376
- """A boolean indicating whether or not the device has a stream associated with it."""
2622
+ def has_stream(self) -> bool:
2623
+ """A boolean indicating whether the device has a stream associated with it."""
2377
2624
  return self._stream is not None
2378
2625
 
2379
2626
  @property
2380
- def total_memory(self):
2627
+ def total_memory(self) -> int:
2381
2628
  """The total amount of device memory available in bytes.
2382
2629
 
2383
2630
  This function is currently only implemented for CUDA devices. 0 will be returned if called on a CPU device.
@@ -2391,7 +2638,7 @@ class Device:
2391
2638
  return 0
2392
2639
 
2393
2640
  @property
2394
- def free_memory(self):
2641
+ def free_memory(self) -> int:
2395
2642
  """The amount of memory on the device that is free according to the OS in bytes.
2396
2643
 
2397
2644
  This function is currently only implemented for CUDA devices. 0 will be returned if called on a CPU device.
@@ -2755,6 +3002,12 @@ class Runtime:
2755
3002
  self.core.mesh_refit_host.argtypes = [ctypes.c_uint64]
2756
3003
  self.core.mesh_refit_device.argtypes = [ctypes.c_uint64]
2757
3004
 
3005
+ self.core.mesh_set_points_host.argtypes = [ctypes.c_uint64, warp.types.array_t]
3006
+ self.core.mesh_set_points_device.argtypes = [ctypes.c_uint64, warp.types.array_t]
3007
+
3008
+ self.core.mesh_set_velocities_host.argtypes = [ctypes.c_uint64, warp.types.array_t]
3009
+ self.core.mesh_set_velocities_device.argtypes = [ctypes.c_uint64, warp.types.array_t]
3010
+
2758
3011
  self.core.hash_grid_create_host.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
2759
3012
  self.core.hash_grid_create_host.restype = ctypes.c_uint64
2760
3013
  self.core.hash_grid_destroy_host.argtypes = [ctypes.c_uint64]
@@ -3029,7 +3282,7 @@ class Runtime:
3029
3282
  self.core.cuda_set_mempool_access_enabled.argtypes = [ctypes.c_int, ctypes.c_int, ctypes.c_int]
3030
3283
  self.core.cuda_set_mempool_access_enabled.restype = ctypes.c_int
3031
3284
 
3032
- self.core.cuda_stream_create.argtypes = [ctypes.c_void_p]
3285
+ self.core.cuda_stream_create.argtypes = [ctypes.c_void_p, ctypes.c_int]
3033
3286
  self.core.cuda_stream_create.restype = ctypes.c_void_p
3034
3287
  self.core.cuda_stream_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
3035
3288
  self.core.cuda_stream_destroy.restype = None
@@ -3047,6 +3300,8 @@ class Runtime:
3047
3300
  self.core.cuda_stream_is_capturing.restype = ctypes.c_int
3048
3301
  self.core.cuda_stream_get_capture_id.argtypes = [ctypes.c_void_p]
3049
3302
  self.core.cuda_stream_get_capture_id.restype = ctypes.c_uint64
3303
+ self.core.cuda_stream_get_priority.argtypes = [ctypes.c_void_p]
3304
+ self.core.cuda_stream_get_priority.restype = ctypes.c_int
3050
3305
 
3051
3306
  self.core.cuda_event_create.argtypes = [ctypes.c_void_p, ctypes.c_uint]
3052
3307
  self.core.cuda_event_create.restype = ctypes.c_void_p
@@ -3382,10 +3637,10 @@ class Runtime:
3382
3637
 
3383
3638
  raise ValueError(f"Invalid device identifier: {ident}")
3384
3639
 
3385
- def set_default_device(self, ident: Devicelike):
3640
+ def set_default_device(self, ident: Devicelike) -> None:
3386
3641
  self.default_device = self.get_device(ident)
3387
3642
 
3388
- def get_current_cuda_device(self):
3643
+ def get_current_cuda_device(self) -> Device:
3389
3644
  current_context = self.core.cuda_context_get_current()
3390
3645
  if current_context is not None:
3391
3646
  current_device = self.context_map.get(current_context)
@@ -3415,7 +3670,7 @@ class Runtime:
3415
3670
  else:
3416
3671
  raise RuntimeError('"cuda" device requested but CUDA is not supported by the hardware or driver')
3417
3672
 
3418
- def rename_device(self, device, alias):
3673
+ def rename_device(self, device, alias) -> Device:
3419
3674
  del self.device_map[device.alias]
3420
3675
  device.alias = alias
3421
3676
  self.device_map[alias] = device
@@ -3462,7 +3717,7 @@ class Runtime:
3462
3717
 
3463
3718
  return device
3464
3719
 
3465
- def unmap_cuda_device(self, alias):
3720
+ def unmap_cuda_device(self, alias) -> None:
3466
3721
  device = self.device_map.get(alias)
3467
3722
 
3468
3723
  # make sure the alias refers to a CUDA device
@@ -3473,7 +3728,7 @@ class Runtime:
3473
3728
  del self.context_map[device.context]
3474
3729
  self.cuda_devices.remove(device)
3475
3730
 
3476
- def verify_cuda_device(self, device: Devicelike = None):
3731
+ def verify_cuda_device(self, device: Devicelike = None) -> None:
3477
3732
  if warp.config.verify_cuda:
3478
3733
  device = runtime.get_device(device)
3479
3734
  if not device.is_cuda:
@@ -3485,13 +3740,13 @@ class Runtime:
3485
3740
 
3486
3741
 
3487
3742
  # global entry points
3488
- def is_cpu_available():
3743
+ def is_cpu_available() -> bool:
3489
3744
  init()
3490
3745
 
3491
- return runtime.llvm
3746
+ return runtime.llvm is not None
3492
3747
 
3493
3748
 
3494
- def is_cuda_available():
3749
+ def is_cuda_available() -> bool:
3495
3750
  return get_cuda_device_count() > 0
3496
3751
 
3497
3752
 
@@ -3575,8 +3830,8 @@ def get_device(ident: Devicelike = None) -> Device:
3575
3830
  return runtime.get_device(ident)
3576
3831
 
3577
3832
 
3578
- def set_device(ident: Devicelike):
3579
- """Sets the target device identified by the argument."""
3833
+ def set_device(ident: Devicelike) -> None:
3834
+ """Sets the default device identified by the argument."""
3580
3835
 
3581
3836
  init()
3582
3837
 
@@ -3604,7 +3859,7 @@ def map_cuda_device(alias: str, context: ctypes.c_void_p = None) -> Device:
3604
3859
  return runtime.map_cuda_device(alias, context)
3605
3860
 
3606
3861
 
3607
- def unmap_cuda_device(alias: str):
3862
+ def unmap_cuda_device(alias: str) -> None:
3608
3863
  """Remove a CUDA device with the given alias."""
3609
3864
 
3610
3865
  init()
@@ -3612,7 +3867,7 @@ def unmap_cuda_device(alias: str):
3612
3867
  runtime.unmap_cuda_device(alias)
3613
3868
 
3614
3869
 
3615
- def is_mempool_supported(device: Devicelike):
3870
+ def is_mempool_supported(device: Devicelike) -> bool:
3616
3871
  """Check if CUDA memory pool allocators are available on the device."""
3617
3872
 
3618
3873
  init()
@@ -3622,7 +3877,7 @@ def is_mempool_supported(device: Devicelike):
3622
3877
  return device.is_mempool_supported
3623
3878
 
3624
3879
 
3625
- def is_mempool_enabled(device: Devicelike):
3880
+ def is_mempool_enabled(device: Devicelike) -> bool:
3626
3881
  """Check if CUDA memory pool allocators are enabled on the device."""
3627
3882
 
3628
3883
  init()
@@ -3632,7 +3887,7 @@ def is_mempool_enabled(device: Devicelike):
3632
3887
  return device.is_mempool_enabled
3633
3888
 
3634
3889
 
3635
- def set_mempool_enabled(device: Devicelike, enable: bool):
3890
+ def set_mempool_enabled(device: Devicelike, enable: bool) -> None:
3636
3891
  """Enable or disable CUDA memory pool allocators on the device.
3637
3892
 
3638
3893
  Pooled allocators are typically faster and allow allocating memory during graph capture.
@@ -3663,7 +3918,7 @@ def set_mempool_enabled(device: Devicelike, enable: bool):
3663
3918
  raise ValueError("Memory pools are only supported on CUDA devices")
3664
3919
 
3665
3920
 
3666
- def set_mempool_release_threshold(device: Devicelike, threshold: Union[int, float]):
3921
+ def set_mempool_release_threshold(device: Devicelike, threshold: Union[int, float]) -> None:
3667
3922
  """Set the CUDA memory pool release threshold on the device.
3668
3923
 
3669
3924
  This is the amount of reserved memory to hold onto before trying to release memory back to the OS.
@@ -3694,7 +3949,7 @@ def set_mempool_release_threshold(device: Devicelike, threshold: Union[int, floa
3694
3949
  raise RuntimeError(f"Failed to set memory pool release threshold for device {device}")
3695
3950
 
3696
3951
 
3697
- def get_mempool_release_threshold(device: Devicelike):
3952
+ def get_mempool_release_threshold(device: Devicelike) -> int:
3698
3953
  """Get the CUDA memory pool release threshold on the device."""
3699
3954
 
3700
3955
  init()
@@ -3710,7 +3965,7 @@ def get_mempool_release_threshold(device: Devicelike):
3710
3965
  return runtime.core.cuda_device_get_mempool_release_threshold(device.ordinal)
3711
3966
 
3712
3967
 
3713
- def is_peer_access_supported(target_device: Devicelike, peer_device: Devicelike):
3968
+ def is_peer_access_supported(target_device: Devicelike, peer_device: Devicelike) -> bool:
3714
3969
  """Check if `peer_device` can directly access the memory of `target_device` on this system.
3715
3970
 
3716
3971
  This applies to memory allocated using default CUDA allocators. For memory allocated using
@@ -3731,7 +3986,7 @@ def is_peer_access_supported(target_device: Devicelike, peer_device: Devicelike)
3731
3986
  return bool(runtime.core.cuda_is_peer_access_supported(target_device.ordinal, peer_device.ordinal))
3732
3987
 
3733
3988
 
3734
- def is_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike):
3989
+ def is_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike) -> bool:
3735
3990
  """Check if `peer_device` can currently access the memory of `target_device`.
3736
3991
 
3737
3992
  This applies to memory allocated using default CUDA allocators. For memory allocated using
@@ -3752,7 +4007,7 @@ def is_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike):
3752
4007
  return bool(runtime.core.cuda_is_peer_access_enabled(target_device.context, peer_device.context))
3753
4008
 
3754
4009
 
3755
- def set_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike, enable: bool):
4010
+ def set_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike, enable: bool) -> None:
3756
4011
  """Enable or disable direct access from `peer_device` to the memory of `target_device`.
3757
4012
 
3758
4013
  Enabling peer access can improve the speed of peer-to-peer memory transfers, but can have
@@ -3784,7 +4039,7 @@ def set_peer_access_enabled(target_device: Devicelike, peer_device: Devicelike,
3784
4039
  raise RuntimeError(f"Failed to {action} peer access from device {peer_device} to device {target_device}")
3785
4040
 
3786
4041
 
3787
- def is_mempool_access_supported(target_device: Devicelike, peer_device: Devicelike):
4042
+ def is_mempool_access_supported(target_device: Devicelike, peer_device: Devicelike) -> bool:
3788
4043
  """Check if `peer_device` can directly access the memory pool of `target_device`.
3789
4044
 
3790
4045
  If mempool access is possible, it can be managed using `set_mempool_access_enabled()` and `is_mempool_access_enabled()`.
@@ -3801,7 +4056,7 @@ def is_mempool_access_supported(target_device: Devicelike, peer_device: Deviceli
3801
4056
  return target_device.is_mempool_supported and is_peer_access_supported(target_device, peer_device)
3802
4057
 
3803
4058
 
3804
- def is_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelike):
4059
+ def is_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelike) -> bool:
3805
4060
  """Check if `peer_device` can currently access the memory pool of `target_device`.
3806
4061
 
3807
4062
  This applies to memory allocated using CUDA pooled allocators. For memory allocated using
@@ -3822,7 +4077,7 @@ def is_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelike
3822
4077
  return bool(runtime.core.cuda_is_mempool_access_enabled(target_device.ordinal, peer_device.ordinal))
3823
4078
 
3824
4079
 
3825
- def set_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelike, enable: bool):
4080
+ def set_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelike, enable: bool) -> None:
3826
4081
  """Enable or disable access from `peer_device` to the memory pool of `target_device`.
3827
4082
 
3828
4083
  This applies to memory allocated using CUDA pooled allocators. For memory allocated using
@@ -3858,26 +4113,41 @@ def set_mempool_access_enabled(target_device: Devicelike, peer_device: Devicelik
3858
4113
 
3859
4114
 
3860
4115
  def get_stream(device: Devicelike = None) -> Stream:
3861
- """Return the stream currently used by the given device"""
4116
+ """Return the stream currently used by the given device.
4117
+
4118
+ Args:
4119
+ device: An optional :class:`Device` instance or device alias
4120
+ (e.g. "cuda:0") for which the current stream will be returned.
4121
+ If ``None``, the default device will be used.
4122
+
4123
+ Raises:
4124
+ RuntimeError: The device is not a CUDA device.
4125
+ """
3862
4126
 
3863
4127
  return get_device(device).stream
3864
4128
 
3865
4129
 
3866
- def set_stream(stream, device: Devicelike = None, sync: bool = False):
3867
- """Set the stream to be used by the given device.
4130
+ def set_stream(stream: Stream, device: Devicelike = None, sync: bool = False) -> None:
4131
+ """Convenience function for calling :meth:`Device.set_stream` on the given ``device``.
3868
4132
 
3869
- If this is an external stream, caller is responsible for guaranteeing the lifetime of the stream.
3870
- Consider using wp.ScopedStream instead.
4133
+ Args:
4134
+ device: An optional :class:`Device` instance or device alias
4135
+ (e.g. "cuda:0") for which the current stream is to be replaced with
4136
+ ``stream``. If ``None``, the default device will be used.
4137
+ stream: The stream to set as this device's current stream.
4138
+ sync: If ``True``, then ``stream`` will perform a device-side
4139
+ synchronization with the device's previous current stream.
3871
4140
  """
3872
4141
 
3873
4142
  get_device(device).set_stream(stream, sync=sync)
3874
4143
 
3875
4144
 
3876
- def record_event(event: Event = None):
3877
- """Record a CUDA event on the current stream.
4145
+ def record_event(event: Optional[Event] = None):
4146
+ """Convenience function for calling :meth:`Stream.record_event` on the current stream.
3878
4147
 
3879
4148
  Args:
3880
- event: Event to record. If None, a new Event will be created.
4149
+ event: :class:`Event` instance to record. If ``None``, a new :class:`Event`
4150
+ instance will be created.
3881
4151
 
3882
4152
  Returns:
3883
4153
  The recorded event.
@@ -3887,29 +4157,31 @@ def record_event(event: Event = None):
3887
4157
 
3888
4158
 
3889
4159
  def wait_event(event: Event):
3890
- """Make the current stream wait for a CUDA event.
4160
+ """Convenience function for calling :meth:`Stream.wait_event` on the current stream.
3891
4161
 
3892
4162
  Args:
3893
- event: Event to wait for.
4163
+ event: :class:`Event` instance to wait for.
3894
4164
  """
3895
4165
 
3896
4166
  get_stream().wait_event(event)
3897
4167
 
3898
4168
 
3899
- def get_event_elapsed_time(start_event: Event, end_event: Event, synchronize: bool = True):
4169
+ def get_event_elapsed_time(start_event: Event, end_event: Event, synchronize: Optional[bool] = True):
3900
4170
  """Get the elapsed time between two recorded events.
3901
4171
 
3902
- The result is in milliseconds with a resolution of about 0.5 microsecond.
3903
-
3904
- Both events must have been previously recorded with ``wp.record_event()`` or ``wp.Stream.record_event()``.
4172
+ Both events must have been previously recorded with
4173
+ :func:`~warp.record_event()` or :meth:`warp.Stream.record_event()`.
3905
4174
 
3906
4175
  If ``synchronize`` is False, the caller must ensure that device execution has reached ``end_event``
3907
4176
  prior to calling ``get_event_elapsed_time()``.
3908
4177
 
3909
4178
  Args:
3910
- start_event (Event): The start event.
3911
- end_event (Event): The end event.
3912
- synchronize (bool, optional): Whether Warp should synchronize on the ``end_event``.
4179
+ start_event: The start event.
4180
+ end_event: The end event.
4181
+ synchronize: Whether Warp should synchronize on the ``end_event``.
4182
+
4183
+ Returns:
4184
+ The elapsed time in milliseconds with a resolution about 0.5 ms.
3913
4185
  """
3914
4186
 
3915
4187
  # ensure the end_event is reached
@@ -3919,14 +4191,19 @@ def get_event_elapsed_time(start_event: Event, end_event: Event, synchronize: bo
3919
4191
  return runtime.core.cuda_event_elapsed_time(start_event.cuda_event, end_event.cuda_event)
3920
4192
 
3921
4193
 
3922
- def wait_stream(stream: Stream, event: Event = None):
3923
- """Make the current stream wait for another CUDA stream to complete its work.
4194
+ def wait_stream(other_stream: Stream, event: Event = None):
4195
+ """Convenience function for calling :meth:`Stream.wait_stream` on the current stream.
3924
4196
 
3925
4197
  Args:
3926
- event: Event to be used. If None, a new Event will be created.
4198
+ other_stream: The stream on which the calling stream will wait for
4199
+ previously issued commands to complete before executing subsequent
4200
+ commands.
4201
+ event: An optional :class:`Event` instance that will be used to
4202
+ record an event onto ``other_stream``. If ``None``, an internally
4203
+ managed :class:`Event` instance will be used.
3927
4204
  """
3928
4205
 
3929
- get_stream().wait_stream(stream, event=event)
4206
+ get_stream().wait_stream(other_stream, event=event)
3930
4207
 
3931
4208
 
3932
4209
  class RegisteredGLBuffer:
@@ -4362,7 +4639,7 @@ def from_numpy(
4362
4639
  dtype: The data type of the new Warp array. If this is not provided, the data type will be inferred.
4363
4640
  shape: The shape of the Warp array.
4364
4641
  device: The device on which the Warp array will be constructed.
4365
- requires_grad: Whether or not gradients will be tracked for this array.
4642
+ requires_grad: Whether gradients will be tracked for this array.
4366
4643
 
4367
4644
  Raises:
4368
4645
  RuntimeError: The data type of the NumPy array is not supported.
@@ -4398,7 +4675,7 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
4398
4675
  return arg_type.__ctype__()
4399
4676
 
4400
4677
  elif isinstance(value, warp.types.array_t):
4401
- # accept array descriptors verbatum
4678
+ # accept array descriptors verbatim
4402
4679
  return value
4403
4680
 
4404
4681
  else:
@@ -4865,7 +5142,7 @@ def synchronize_device(device: Devicelike = None):
4865
5142
  runtime.core.cuda_context_synchronize(device.context)
4866
5143
 
4867
5144
 
4868
- def synchronize_stream(stream_or_device=None):
5145
+ def synchronize_stream(stream_or_device: Union[Stream, Devicelike, None] = None):
4869
5146
  """Synchronize the calling CPU thread with any outstanding CUDA work on the specified stream.
4870
5147
 
4871
5148
  This function allows the host application code to ensure that all kernel launches
@@ -4989,7 +5266,7 @@ def set_module_options(options: Dict[str, Any], module: Optional[Any] = None):
4989
5266
  m = module
4990
5267
 
4991
5268
  get_module(m.__name__).options.update(options)
4992
- get_module(m.__name__).unload()
5269
+ get_module(m.__name__).mark_modified()
4993
5270
 
4994
5271
 
4995
5272
  def get_module_options(module: Optional[Any] = None) -> Dict[str, Any]:
@@ -5016,7 +5293,7 @@ def capture_begin(device: Devicelike = None, stream=None, force_module_load=None
5016
5293
  Args:
5017
5294
  device: The CUDA device to capture on
5018
5295
  stream: The CUDA stream to capture on
5019
- force_module_load: Whether or not to force loading of all kernels before capture.
5296
+ force_module_load: Whether to force loading of all kernels before capture.
5020
5297
  In general it is better to use :func:`~warp.load_module()` to selectively load kernels.
5021
5298
  When running with CUDA drivers that support CUDA 12.3 or newer, this option is not recommended to be set to
5022
5299
  ``True`` because kernels can be loaded during graph capture on more recent drivers. If this argument is
@@ -5574,7 +5851,7 @@ def export_stubs(file): # pragma: no cover
5574
5851
 
5575
5852
  return_str = ""
5576
5853
 
5577
- if not f.export or f.hidden: # or f.generic:
5854
+ if f.hidden: # or f.generic:
5578
5855
  continue
5579
5856
 
5580
5857
  return_type = f.value_func(None, None)