warp-lang 1.6.2-py3-none-win_amd64.whl → 1.7.1-py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/__init__.py +7 -1
- warp/autograd.py +12 -2
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +410 -0
- warp/build_dll.py +6 -14
- warp/builtins.py +463 -372
- warp/codegen.py +196 -124
- warp/config.py +42 -6
- warp/context.py +496 -271
- warp/dlpack.py +8 -6
- warp/examples/assets/nonuniform.usd +0 -0
- warp/examples/assets/nvidia_logo.png +0 -0
- warp/examples/benchmarks/benchmark_cloth.py +1 -1
- warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
- warp/examples/core/example_sample_mesh.py +300 -0
- warp/examples/distributed/example_jacobi_mpi.py +507 -0
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +2 -2
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_magnetostatics.py +6 -6
- warp/examples/fem/utils.py +9 -3
- warp/examples/interop/example_jax_callable.py +116 -0
- warp/examples/interop/example_jax_ffi_callback.py +132 -0
- warp/examples/interop/example_jax_kernel.py +205 -0
- warp/examples/optim/example_fluid_checkpoint.py +497 -0
- warp/examples/tile/example_tile_matmul.py +2 -4
- warp/fem/__init__.py +11 -1
- warp/fem/adaptivity.py +4 -4
- warp/fem/field/field.py +11 -1
- warp/fem/field/nodal_field.py +56 -88
- warp/fem/field/virtual.py +62 -23
- warp/fem/geometry/adaptive_nanogrid.py +16 -13
- warp/fem/geometry/closest_point.py +1 -1
- warp/fem/geometry/deformed_geometry.py +5 -2
- warp/fem/geometry/geometry.py +5 -0
- warp/fem/geometry/grid_2d.py +12 -12
- warp/fem/geometry/grid_3d.py +12 -15
- warp/fem/geometry/hexmesh.py +5 -7
- warp/fem/geometry/nanogrid.py +9 -11
- warp/fem/geometry/quadmesh.py +13 -13
- warp/fem/geometry/tetmesh.py +3 -4
- warp/fem/geometry/trimesh.py +7 -20
- warp/fem/integrate.py +262 -93
- warp/fem/linalg.py +5 -5
- warp/fem/quadrature/pic_quadrature.py +37 -22
- warp/fem/quadrature/quadrature.py +194 -25
- warp/fem/space/__init__.py +1 -1
- warp/fem/space/basis_function_space.py +4 -2
- warp/fem/space/basis_space.py +25 -18
- warp/fem/space/hexmesh_function_space.py +2 -2
- warp/fem/space/partition.py +6 -2
- warp/fem/space/quadmesh_function_space.py +8 -8
- warp/fem/space/shape/cube_shape_function.py +23 -23
- warp/fem/space/shape/square_shape_function.py +12 -12
- warp/fem/space/shape/triangle_shape_function.py +1 -1
- warp/fem/space/tetmesh_function_space.py +3 -3
- warp/fem/space/trimesh_function_space.py +2 -2
- warp/fem/utils.py +12 -6
- warp/jax.py +14 -1
- warp/jax_experimental/__init__.py +16 -0
- warp/{jax_experimental.py → jax_experimental/custom_call.py} +28 -29
- warp/jax_experimental/ffi.py +702 -0
- warp/jax_experimental/xla_ffi.py +602 -0
- warp/math.py +89 -0
- warp/native/array.h +13 -0
- warp/native/builtin.h +29 -3
- warp/native/bvh.cpp +3 -1
- warp/native/bvh.cu +42 -14
- warp/native/bvh.h +2 -1
- warp/native/clang/clang.cpp +30 -3
- warp/native/cuda_util.cpp +14 -0
- warp/native/cuda_util.h +2 -0
- warp/native/exports.h +68 -63
- warp/native/intersect.h +26 -26
- warp/native/intersect_adj.h +33 -33
- warp/native/marching.cu +1 -1
- warp/native/mat.h +513 -9
- warp/native/mesh.h +10 -10
- warp/native/quat.h +99 -11
- warp/native/rand.h +6 -0
- warp/native/sort.cpp +122 -59
- warp/native/sort.cu +152 -15
- warp/native/sort.h +8 -1
- warp/native/sparse.cpp +43 -22
- warp/native/sparse.cu +52 -17
- warp/native/svd.h +116 -0
- warp/native/tile.h +312 -116
- warp/native/tile_reduce.h +46 -3
- warp/native/vec.h +68 -7
- warp/native/volume.cpp +85 -113
- warp/native/volume_builder.cu +25 -10
- warp/native/volume_builder.h +6 -0
- warp/native/warp.cpp +5 -6
- warp/native/warp.cu +100 -11
- warp/native/warp.h +19 -10
- warp/optim/linear.py +10 -10
- warp/render/render_opengl.py +19 -17
- warp/render/render_usd.py +93 -3
- warp/sim/articulation.py +4 -4
- warp/sim/collide.py +32 -19
- warp/sim/import_mjcf.py +449 -155
- warp/sim/import_urdf.py +32 -12
- warp/sim/inertia.py +189 -156
- warp/sim/integrator_euler.py +8 -5
- warp/sim/integrator_featherstone.py +3 -10
- warp/sim/integrator_vbd.py +207 -2
- warp/sim/integrator_xpbd.py +8 -5
- warp/sim/model.py +71 -25
- warp/sim/render.py +4 -0
- warp/sim/utils.py +2 -2
- warp/sparse.py +642 -555
- warp/stubs.py +217 -20
- warp/tests/__main__.py +0 -15
- warp/tests/assets/torus.usda +1 -1
- warp/tests/cuda/__init__.py +0 -0
- warp/tests/{test_mempool.py → cuda/test_mempool.py} +39 -0
- warp/tests/{test_streams.py → cuda/test_streams.py} +71 -0
- warp/tests/geometry/__init__.py +0 -0
- warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +66 -63
- warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +1 -1
- warp/tests/{test_volume.py → geometry/test_volume.py} +41 -6
- warp/tests/interop/__init__.py +0 -0
- warp/tests/{test_dlpack.py → interop/test_dlpack.py} +28 -5
- warp/tests/sim/__init__.py +0 -0
- warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +9 -10
- warp/tests/{test_collision.py → sim/test_collision.py} +236 -205
- warp/tests/sim/test_inertia.py +161 -0
- warp/tests/{test_model.py → sim/test_model.py} +40 -0
- warp/tests/{flaky_test_sim_grad.py → sim/test_sim_grad.py} +4 -0
- warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +2 -1
- warp/tests/sim/test_vbd.py +597 -0
- warp/tests/sim/test_xpbd.py +399 -0
- warp/tests/test_bool.py +1 -1
- warp/tests/test_codegen.py +24 -3
- warp/tests/test_examples.py +40 -38
- warp/tests/test_fem.py +98 -14
- warp/tests/test_linear_solvers.py +0 -11
- warp/tests/test_mat.py +577 -156
- warp/tests/test_mat_scalar_ops.py +4 -4
- warp/tests/test_overwrite.py +0 -60
- warp/tests/test_quat.py +356 -151
- warp/tests/test_rand.py +44 -37
- warp/tests/test_sparse.py +47 -6
- warp/tests/test_spatial.py +75 -0
- warp/tests/test_static.py +1 -1
- warp/tests/test_utils.py +84 -4
- warp/tests/test_vec.py +336 -178
- warp/tests/tile/__init__.py +0 -0
- warp/tests/{test_tile.py → tile/test_tile.py} +136 -51
- warp/tests/{test_tile_load.py → tile/test_tile_load.py} +98 -1
- warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +9 -6
- warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +25 -14
- warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +60 -1
- warp/tests/{test_tile_view.py → tile/test_tile_view.py} +1 -1
- warp/tests/unittest_serial.py +1 -0
- warp/tests/unittest_suites.py +45 -62
- warp/tests/unittest_utils.py +2 -1
- warp/thirdparty/unittest_parallel.py +3 -1
- warp/types.py +175 -666
- warp/utils.py +137 -72
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/METADATA +46 -12
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/RECORD +184 -171
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/WHEEL +1 -1
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info/licenses}/LICENSE.md +0 -26
- warp/examples/optim/example_walker.py +0 -317
- warp/native/cutlass_gemm.cpp +0 -43
- warp/native/cutlass_gemm.cu +0 -382
- warp/tests/test_matmul.py +0 -511
- warp/tests/test_matmul_lite.py +0 -411
- warp/tests/test_vbd.py +0 -386
- warp/tests/unused_test_misc.py +0 -77
- /warp/tests/{test_async.py → cuda/test_async.py} +0 -0
- /warp/tests/{test_ipc.py → cuda/test_ipc.py} +0 -0
- /warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +0 -0
- /warp/tests/{test_peer.py → cuda/test_peer.py} +0 -0
- /warp/tests/{test_pinned.py → cuda/test_pinned.py} +0 -0
- /warp/tests/{test_bvh.py → geometry/test_bvh.py} +0 -0
- /warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +0 -0
- /warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +0 -0
- /warp/tests/{test_mesh.py → geometry/test_mesh.py} +0 -0
- /warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +0 -0
- /warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +0 -0
- /warp/tests/{test_jax.py → interop/test_jax.py} +0 -0
- /warp/tests/{test_paddle.py → interop/test_paddle.py} +0 -0
- /warp/tests/{test_torch.py → interop/test_torch.py} +0 -0
- /warp/tests/{test_coloring.py → sim/test_coloring.py} +0 -0
- /warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +0 -0
- /warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +0 -0
- {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/top_level.txt +0 -0
warp/context.py
CHANGED
@@ -17,7 +17,6 @@ from __future__ import annotations
 
 import ast
 import ctypes
-import errno
 import functools
 import hashlib
 import inspect
@@ -28,13 +27,27 @@ import operator
 import os
 import platform
 import sys
-import time
 import types
 import typing
 import weakref
 from copy import copy as shallowcopy
 from pathlib import Path
-from typing import
+from typing import (
+    Any,
+    Callable,
+    Dict,
+    List,
+    Literal,
+    Mapping,
+    Optional,
+    Sequence,
+    Set,
+    Tuple,
+    TypeVar,
+    Union,
+    get_args,
+    get_origin,
+)
 
 import numpy as np
 
@@ -42,7 +55,7 @@ import warp
 import warp.build
 import warp.codegen
 import warp.config
-from warp.types import launch_bounds_t
+from warp.types import Array, launch_bounds_t
 
 # represents either a built-in or user-defined function
 
@@ -71,10 +84,10 @@ def get_function_args(func):
 complex_type_hints = (Any, Callable, Tuple)
 sequence_types = (list, tuple)
 
-function_key_counts = {}
+function_key_counts: Dict[str, int] = {}
 
 
-def generate_unique_function_identifier(key):
+def generate_unique_function_identifier(key: str) -> str:
     # Generate unique identifiers for user-defined functions in native code.
     # - Prevents conflicts when a function is redefined and old versions are still in use.
     # - Prevents conflicts between multiple closures returned from the same function.
@@ -107,40 +120,40 @@ def generate_unique_function_identifier(key):
 class Function:
     def __init__(
         self,
-        func,
-        key,
-        namespace,
-        input_types=None,
-        value_type=None,
-        value_func=None,
-        export_func=None,
-        dispatch_func=None,
-        lto_dispatch_func=None,
-        module=None,
-        variadic=False,
-        initializer_list_func=None,
-        export=False,
-        doc="",
-        group="",
-        hidden=False,
-        skip_replay=False,
-        missing_grad=False,
-        generic=False,
-        native_func=None,
-        defaults=None,
-        custom_replay_func=None,
-        native_snippet=None,
-        adj_native_snippet=None,
-        replay_snippet=None,
-        skip_forward_codegen=False,
-        skip_reverse_codegen=False,
-        custom_reverse_num_input_args
-        custom_reverse_mode=False,
-        overloaded_annotations=None,
-        code_transformers=None,
-        skip_adding_overload=False,
-        require_original_output_arg=False,
-        scope_locals
+        func: Optional[Callable],
+        key: str,
+        namespace: str,
+        input_types: Optional[Dict[str, Union[type, TypeVar]]] = None,
+        value_type: Optional[type] = None,
+        value_func: Optional[Callable[[Mapping[str, type], Mapping[str, Any]], type]] = None,
+        export_func: Optional[Callable[[Dict[str, type]], Dict[str, type]]] = None,
+        dispatch_func: Optional[Callable] = None,
+        lto_dispatch_func: Optional[Callable] = None,
+        module: Optional[Module] = None,
+        variadic: bool = False,
+        initializer_list_func: Optional[Callable[[Dict[str, Any], type], bool]] = None,
+        export: bool = False,
+        doc: str = "",
+        group: str = "",
+        hidden: bool = False,
+        skip_replay: bool = False,
+        missing_grad: bool = False,
+        generic: bool = False,
+        native_func: Optional[str] = None,
+        defaults: Optional[Dict[str, Any]] = None,
+        custom_replay_func: Optional[Function] = None,
+        native_snippet: Optional[str] = None,
+        adj_native_snippet: Optional[str] = None,
+        replay_snippet: Optional[str] = None,
+        skip_forward_codegen: bool = False,
+        skip_reverse_codegen: bool = False,
+        custom_reverse_num_input_args: int = -1,
+        custom_reverse_mode: bool = False,
+        overloaded_annotations: Optional[Dict[str, type]] = None,
+        code_transformers: Optional[List[ast.NodeTransformer]] = None,
+        skip_adding_overload: bool = False,
+        require_original_output_arg: bool = False,
+        scope_locals: Optional[Dict[str, Any]] = None,
     ):
         if code_transformers is None:
             code_transformers = []
@@ -165,7 +178,7 @@ class Function:
         self.native_snippet = native_snippet
         self.adj_native_snippet = adj_native_snippet
         self.replay_snippet = replay_snippet
-        self.custom_grad_func = None
+        self.custom_grad_func: Optional[Function] = None
         self.require_original_output_arg = require_original_output_arg
         self.generic_parent = None  # generic function that was used to instantiate this overload
 
@@ -181,6 +194,7 @@ class Function:
         )
         self.missing_grad = missing_grad  # whether builtin is missing a corresponding adjoint
         self.generic = generic
+        self.mangled_name: Optional[str] = None
 
         # allow registering functions with a different name in Python and native code
        if native_func is None:
@@ -197,8 +211,8 @@ class Function:
             # user-defined function
 
             # generic and concrete overload lookups by type signature
-            self.user_templates = {}
-            self.user_overloads = {}
+            self.user_templates: Dict[str, Function] = {}
+            self.user_overloads: Dict[str, Function] = {}
 
             # user defined (Python) function
             self.adj = warp.codegen.Adjoint(
@@ -229,19 +243,17 @@ class Function:
             # builtin function
 
             # embedded linked list of all overloads
-            # the builtin_functions dictionary holds
-
-            self.overloads = []
+            # the builtin_functions dictionary holds the list head for a given key (func name)
+            self.overloads: List[Function] = []
 
             # builtin (native) function, canonicalize argument types
-
-
+            if input_types is not None:
+                for k, v in input_types.items():
+                    self.input_types[k] = warp.types.type_to_warp(v)
 
         # cache mangled name
         if self.export and self.is_simple():
             self.mangled_name = self.mangle()
-        else:
-            self.mangled_name = None
 
         if not skip_adding_overload:
             self.add_overload(self)
@@ -272,7 +284,7 @@ class Function:
             signature_params.append(param)
         self.signature = inspect.Signature(signature_params)
 
-        # scope for resolving overloads
+        # scope for resolving overloads, the locals() where the function is defined
        if scope_locals is None:
            scope_locals = inspect.currentframe().f_back.f_locals
 
@@ -334,10 +346,10 @@ class Function:
            # this function has no overloads, call it like a plain Python function
            return self.func(*args, **kwargs)
 
-    def is_builtin(self):
+    def is_builtin(self) -> bool:
        return self.func is None
 
-    def is_simple(self):
+    def is_simple(self) -> bool:
        if self.variadic:
            return False
 
@@ -351,9 +363,8 @@ class Function:
 
        return True
 
-    def mangle(self):
-
-        # function, e.g.: builtin_normalize_vec3()
+    def mangle(self) -> str:
+        """Build a mangled name for the C-exported function, e.g.: `builtin_normalize_vec3()`."""
 
        name = "builtin_" + self.key
 
@@ -369,7 +380,7 @@ class Function:
 
        return "_".join([name, *types])
 
-    def add_overload(self, f):
+    def add_overload(self, f: Function) -> None:
        if self.is_builtin():
            # todo: note that it is an error to add two functions
            # with the exact same signature as this would cause compile
@@ -384,7 +395,7 @@ class Function:
        else:
            # get function signature based on the input types
            sig = warp.types.get_signature(
-                f.input_types.values(), func_name=f.key, arg_names=list(f.input_types.keys())
+                list(f.input_types.values()), func_name=f.key, arg_names=list(f.input_types.keys())
            )
 
            # check if generic
@@ -393,7 +404,7 @@ class Function:
            else:
                self.user_overloads[sig] = f
 
-    def get_overload(self, arg_types, kwarg_types):
+    def get_overload(self, arg_types: List[type], kwarg_types: Mapping[str, type]) -> Optional[Function]:
        assert not self.is_builtin()
 
        for f in self.user_overloads.values():
@@ -446,7 +457,7 @@ class Function:
        return f"<Function {self.key}({inputs_str})>"
 
 
-def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
+def call_builtin(func: Function, *params: Any) -> Tuple[bool, Any]:
    uses_non_warp_array_type = False
 
    init()
@@ -758,42 +769,62 @@ class Kernel:
 
        return f"{self.key}_{hash_suffix}"
 
+    def __call__(self, *args, **kwargs):
+        # we implement this function only to ensure Kernel is a callable object
+        # so that we can document Warp kernels in the same way as Python functions
+        # annotated by @wp.kernel (see functools.update_wrapper())
+        raise NotImplementedError("Kernel.__call__() is not implemented, please use wp.launch() instead")
+
 
 # ----------------------
 
 
 # decorator to register function, @func
-def func(f):
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+def func(f: Optional[Callable] = None, *, name: Optional[str] = None):
+    def wrapper(f, *args, **kwargs):
+        if name is None:
+            key = warp.codegen.make_full_qualified_name(f)
+        else:
+            key = name
+
+        scope_locals = inspect.currentframe().f_back.f_back.f_locals
+
+        m = get_module(f.__module__)
+        doc = getattr(f, "__doc__", "") or ""
+        Function(
+            func=f,
+            key=key,
+            namespace="",
+            module=m,
+            value_func=None,
+            scope_locals=scope_locals,
+            doc=doc.strip(),
+        )  # value_type not known yet, will be inferred during Adjoint.build()
+
+        # use the top of the list of overloads for this key
+        g = m.functions[key]
+        # copy over the function attributes, including docstring
+        return functools.update_wrapper(g, f)
+
+    if f is None:
+        # Arguments were passed to the decorator.
+        return wrapper
+
+    return wrapper(f)
+
+
+def func_native(snippet: str, adj_snippet: Optional[str] = None, replay_snippet: Optional[str] = None):
    """
    Decorator to register native code snippet, @func_native
    """
 
-
+    frame = inspect.currentframe()
+    if frame is None or frame.f_back is None:
+        scope_locals = {}
+    else:
+        scope_locals = frame.f_back.f_locals
 
-    def snippet_func(f):
+    def snippet_func(f: Callable) -> Callable:
        name = warp.codegen.make_full_qualified_name(f)
 
        m = get_module(f.__module__)
@@ -965,22 +996,71 @@ def func_replay(forward_fn):
    return wrapper
 
 
-
-
-
+def kernel(
+    f: Optional[Callable] = None,
+    *,
+    enable_backward: Optional[bool] = None,
+    module: Optional[Union[Module, Literal["unique"]]] = None,
+):
+    """
+    Decorator to register a Warp kernel from a Python function.
+    The function must be defined with type annotations for all arguments.
+    The function must not return anything.
+
+    Example::
+
+        @wp.kernel
+        def my_kernel(a: wp.array(dtype=float), b: wp.array(dtype=float)):
+            tid = wp.tid()
+            b[tid] = a[tid] + 1.0
+
+
+        @wp.kernel(enable_backward=False)
+        def my_kernel_no_backward(a: wp.array(dtype=float, ndim=2), x: float):
+            # the backward pass will not be generated
+            i, j = wp.tid()
+            a[i, j] = x
+
+
+        @wp.kernel(module="unique")
+        def my_kernel_unique_module(a: wp.array(dtype=float), b: wp.array(dtype=float)):
+            # the kernel will be registered in new unique module created just for this
+            # kernel and its dependent functions and structs
+            tid = wp.tid()
+            b[tid] = a[tid] + 1.0
+
+    Args:
+        f: The function to be registered as a kernel.
+        enable_backward: If False, the backward pass will not be generated.
+        module: The :class:`warp.context.Module` to which the kernel belongs. Alternatively, if a string `"unique"` is provided, the kernel is assigned to a new module named after the kernel name and hash. If None, the module is inferred from the function's module.
+
+    Returns:
+        The registered kernel.
+    """
+
    def wrapper(f, *args, **kwargs):
        options = {}
 
        if enable_backward is not None:
            options["enable_backward"] = enable_backward
 
-
+        if module is None:
+            m = get_module(f.__module__)
+        elif module == "unique":
+            m = Module(f.__name__, None)
+        else:
+            m = module
        k = Kernel(
            func=f,
            key=warp.codegen.make_full_qualified_name(f),
            module=m,
            options=options,
        )
+        if module == "unique":
+            # add the hash to the module name
+            hasher = warp.context.ModuleHasher(m)
+            k.module.name = f"{k.key}_{hasher.module_hash.hex()[:8]}"
+
        k = functools.update_wrapper(k, f)
        return k
 
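Taken together, the new `Kernel.__call__()` stub makes explicit that kernels are dispatched through `wp.launch()` rather than called directly, and `@wp.func` gains an optional `name` keyword that overrides the generated key. A minimal usage sketch (function names, sizes, and values are illustrative only, not taken from this release)::

    import warp as wp

    @wp.func(name="scale_by")  # optional explicit key for the generated native function
    def scale(x: float, s: float):
        return x * s

    @wp.kernel
    def scale_kernel(a: wp.array(dtype=float), s: float):
        tid = wp.tid()
        a[tid] = scale(a[tid], s)

    a = wp.ones(16, dtype=float)
    # kernels are not called like plain functions; wp.launch() dispatches them
    wp.launch(scale_kernel, dim=a.shape, inputs=[a, 2.0])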
@@ -992,7 +1072,7 @@ def kernel(f=None, *, enable_backward=None):
 
 
 # decorator to register struct, @struct
-def struct(c):
+def struct(c: type):
    m = get_module(c.__module__)
    s = warp.codegen.Struct(cls=c, key=warp.codegen.make_full_qualified_name(c), module=m)
    s = functools.update_wrapper(s, c)
@@ -1105,47 +1185,47 @@ scalar_types.update({x: x._wp_scalar_type_ for x in warp.types.vector_types})
 
 
 def add_builtin(
-    key,
-    input_types=None,
-    constraint=None,
-    value_type=None,
-    value_func=None,
-    export_func=None,
-    dispatch_func=None,
-    lto_dispatch_func=None,
-    doc="",
-    namespace="wp::",
-    variadic=False,
+    key: str,
+    input_types: Optional[Dict[str, Union[type, TypeVar]]] = None,
+    constraint: Optional[Callable[[Mapping[str, type]], bool]] = None,
+    value_type: Optional[type] = None,
+    value_func: Optional[Callable] = None,
+    export_func: Optional[Callable] = None,
+    dispatch_func: Optional[Callable] = None,
+    lto_dispatch_func: Optional[Callable] = None,
+    doc: str = "",
+    namespace: str = "wp::",
+    variadic: bool = False,
    initializer_list_func=None,
-    export=True,
-    group="Other",
-    hidden=False,
-    skip_replay=False,
-    missing_grad=False,
-    native_func=None,
-    defaults=None,
-    require_original_output_arg=False,
+    export: bool = True,
+    group: str = "Other",
+    hidden: bool = False,
+    skip_replay: bool = False,
+    missing_grad: bool = False,
+    native_func: Optional[str] = None,
+    defaults: Optional[Dict[str, Any]] = None,
+    require_original_output_arg: bool = False,
 ):
    """Main entry point to register a new built-in function.
 
    Args:
-        key
+        key: Function name. Multiple overloaded functions can be registered
            under the same name as long as their signature differ.
-        input_types
+        input_types: Signature of the user-facing function.
            Variadic arguments are supported by prefixing the parameter names
            with asterisks as in `*args` and `**kwargs`. Generic arguments are
            supported with types such as `Any`, `Float`, `Scalar`, etc.
-        constraint
+        constraint: For functions that define generic arguments and
            are to be exported, this callback is used to specify whether some
            combination of inferred arguments are valid or not.
-        value_type
-        value_func
+        value_type: Type returned by the function.
+        value_func: Callback used to specify the return type when
            `value_type` isn't enough.
-        export_func
+        export_func: Callback used during the context stage to specify
            the signature of the underlying C++ function, not accounting for
            the template parameters.
            If not provided, `input_types` is used.
-        dispatch_func
+        dispatch_func: Callback used during the codegen stage to specify
            the runtime and template arguments to be passed to the underlying C++
            function. In other words, this allows defining a mapping between
            the signatures of the user-facing and the C++ functions, and even to
@@ -1153,27 +1233,26 @@ def add_builtin(
            The arguments returned must be of type `codegen.Var`.
            If not provided, all arguments passed by the users when calling
            the built-in are passed as-is as runtime arguments to the C++ function.
-        lto_dispatch_func
+        lto_dispatch_func: Same as dispatch_func, but takes an 'option' dict
            as extra argument (indicating tile_size and target architecture) and returns
            an LTO-IR buffer as extra return value
-        doc
+        doc: Used to generate the Python's docstring and the HTML documentation.
        namespace: Namespace for the underlying C++ function.
-        variadic
-        initializer_list_func
-            when passing the arguments to the underlying
-
+        variadic: Whether the function declares variadic arguments.
+        initializer_list_func: Callback to determine whether to use the
+            initializer list syntax when passing the arguments to the underlying
+            C++ function.
+        export: Whether the function is to be exposed to the Python
            interpreter so that it becomes available from within the `warp`
            module.
-        group
-        hidden
-        skip_replay
+        group: Classification used for the documentation.
+        hidden: Whether to add that function into the documentation.
+        skip_replay: Whether operation will be performed during
            the forward replay in the backward pass.
-        missing_grad
-
-
-
-            in `input_types`.
-        require_original_output_arg (bool): Used during the codegen stage to
+        missing_grad: Whether the function is missing a corresponding adjoint.
+        native_func: Name of the underlying C++ function.
+        defaults: Default values for the parameters defined in `input_types`.
+        require_original_output_arg: Used during the codegen stage to
            specify whether an adjoint parameter corresponding to the return
            value should be included in the signature of the backward function.
    """
@@ -1355,19 +1434,14 @@ def add_builtin(
 def register_api_function(
    function: Function,
    group: str = "Other",
-    hidden=False,
+    hidden: bool = False,
 ):
    """Main entry point to register a Warp Python function to be part of the Warp API and appear in the documentation.
 
    Args:
-        function
-        group
-
-            Variadic arguments are supported by prefixing the parameter names
-            with asterisks as in `*args` and `**kwargs`. Generic arguments are
-            supported with types such as `Any`, `Float`, `Scalar`, etc.
-        value_type (Any): Type returned by the function.
-        hidden (bool): Whether to add that function into the documentation.
+        function: Warp function to be registered.
+        group: Classification used for the documentation.
+        hidden: Whether to add that function into the documentation.
    """
    function.group = group
    function.hidden = hidden
@@ -1375,10 +1449,10 @@ def register_api_function(
 
 
 # global dictionary of modules
-user_modules = {}
+user_modules: Dict[str, Module] = {}
 
 
-def get_module(name):
+def get_module(name: str) -> Module:
    # some modules might be manually imported using `importlib` without being
    # registered into `sys.modules`
    parent = sys.modules.get(name, None)
@@ -1460,13 +1534,16 @@ class ModuleHasher:
        if warp.config.verify_fp:
            ch.update(bytes("verify_fp", "utf-8"))
 
+        # line directives, e.g. for Nsight Compute
+        ch.update(bytes(ctypes.c_int(warp.config.line_directives)))
+
        # build config
        ch.update(bytes(warp.config.mode, "utf-8"))
 
        # save the module hash
        self.module_hash = ch.digest()
 
-    def hash_kernel(self, kernel):
+    def hash_kernel(self, kernel: Kernel) -> bytes:
        # NOTE: We only hash non-generic kernels, so we don't traverse kernel overloads here.
 
        ch = hashlib.sha256()
@@ -1480,7 +1557,7 @@ class ModuleHasher:
 
        return h
 
-    def hash_function(self, func):
+    def hash_function(self, func: Function) -> bytes:
        # NOTE: This method hashes all possible overloads that a function call could resolve to.
        # The exact overload will be resolved at build time, when the argument types are known.
 
@@ -1495,7 +1572,7 @@ class ModuleHasher:
        ch.update(bytes(func.key, "utf-8"))
 
        # include all concrete and generic overloads
-        overloads = {**func.user_overloads, **func.user_templates}
+        overloads: Dict[str, Function] = {**func.user_overloads, **func.user_templates}
        for sig in sorted(overloads.keys()):
            ovl = overloads[sig]
 
@@ -1526,7 +1603,7 @@ class ModuleHasher:
 
        return h
 
-    def hash_adjoint(self, adj):
+    def hash_adjoint(self, adj: warp.codegen.Adjoint) -> bytes:
        # NOTE: We don't cache adjoint hashes, because adjoints are always unique.
        # Even instances of generic kernels and functions have unique adjoints with
        # different argument types.
@@ -1575,7 +1652,7 @@ class ModuleHasher:
 
        return ch.digest()
 
-    def get_constant_bytes(self, value):
+    def get_constant_bytes(self, value) -> bytes:
        if isinstance(value, int):
            # this also handles builtins.bool
            return bytes(ctypes.c_int(value))
@@ -1593,7 +1670,7 @@ class ModuleHasher:
        else:
            raise TypeError(f"Invalid constant type: {type(value)}")
 
-    def get_module_hash(self):
+    def get_module_hash(self) -> bytes:
        return self.module_hash
 
    def get_unique_kernels(self):
@@ -1610,6 +1687,7 @@ class ModuleBuilder:
        self.fatbins = {}  # map from <some identifier> to fatbins, to add at link time
        self.ltoirs = {}  # map from lto symbol to lto binary
        self.ltoirs_decl = {}  # map from lto symbol to lto forward declaration
+        self.shared_memory_bytes = {}  # map from lto symbol to shared memory requirements
 
        if hasher is None:
            hasher = ModuleHasher(module)
@@ -1726,9 +1804,9 @@ class ModuleBuilder:
 
        # add headers
        if device == "cpu":
-            source = warp.codegen.cpu_module_header.format(
+            source = warp.codegen.cpu_module_header.format(block_dim=self.options["block_dim"]) + source
        else:
-            source = warp.codegen.cuda_module_header.format(
+            source = warp.codegen.cuda_module_header.format(block_dim=self.options["block_dim"]) + source
 
        return source
 
@@ -1765,7 +1843,7 @@ class ModuleExec:
            runtime.llvm.unload_obj(self.handle.encode("utf-8"))
 
    # lookup and cache kernel entry points
-    def get_kernel_hooks(self, kernel):
+    def get_kernel_hooks(self, kernel) -> KernelHooks:
        # Use kernel.adj as a unique key for cache lookups instead of the kernel itself.
        # This avoids holding a reference to the kernel and is faster than using
        # a WeakKeyDictionary with kernels as keys.
@@ -1838,7 +1916,7 @@ class ModuleExec:
 # creates a hash of the function to use for checking
 # build cache
 class Module:
-    def __init__(self, name, loader):
+    def __init__(self, name: Optional[str], loader=None):
        self.name = name if name is not None else "None"
 
        self.loader = loader
@@ -1878,7 +1956,7 @@ class Module:
            "enable_backward": warp.config.enable_backward,
            "fast_math": False,
            "fuse_fp": True,
-            "lineinfo":
+            "lineinfo": warp.config.lineinfo,
            "cuda_output": None,  # supported values: "ptx", "cubin", or None (automatic)
            "mode": warp.config.mode,
            "block_dim": 256,
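Both `warp.config.lineinfo` (now the default for the per-module `"lineinfo"` option above) and `warp.config.line_directives` (folded into the module hash earlier in this diff) are global flags. A sketch of the usual pattern of setting them before any module is built, e.g. for source-level correlation in profilers such as Nsight Compute (the flag values are illustrative)::

    import warp as wp

    # embed source line information into generated CUDA code
    wp.config.lineinfo = True
    wp.config.line_directives = True

    wp.init()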
@@ -2081,7 +2159,11 @@ class Module:
                use_ptx = True
 
            if use_ptx:
-
+                # use the default PTX arch if the device supports it
+                if warp.config.ptx_target_arch is not None:
+                    output_arch = min(device.arch, warp.config.ptx_target_arch)
+                else:
+                    output_arch = min(device.arch, runtime.default_ptx_arch)
                output_name = f"{module_name_short}.sm{output_arch}.ptx"
            else:
                output_arch = device.arch
@@ -2194,34 +2276,8 @@ class Module:
            # -----------------------------------------------------------
            # update cache
 
-            def safe_rename(src, dst, attempts=5, delay=0.1):
-                for i in range(attempts):
-                    try:
-                        os.rename(src, dst)
-                        return
-                    except FileExistsError:
-                        return
-                    except OSError as e:
-                        if e.errno == errno.ENOTEMPTY:
-                            # if directory exists we assume another process
-                            # got there first, in which case we will copy
-                            # our output to the directory manually in second step
-                            return
-                        else:
-                            # otherwise assume directory creation failed e.g.: access denied
-                            # on Windows we see occasional failures to rename directories due to
-                            # some process holding a lock on a file to be moved to workaround
-                            # this we make multiple attempts to rename with some delay
-                            if i < attempts - 1:
-                                time.sleep(delay)
-                            else:
-                                print(
-                                    f"Could not update Warp cache with module binaries, trying to rename {build_dir} to {module_dir}, error {e}"
-                                )
-                                raise e
-
            # try to move process outputs to cache
-            safe_rename(build_dir, module_dir)
+            warp.build.safe_rename(build_dir, module_dir)
 
            if os.path.exists(module_dir):
                if not os.path.exists(binary_path):
@@ -2294,7 +2350,7 @@ class Module:
        self.failed_builds = set()
 
    # lookup kernel entry points based on name, called after compilation / module load
-    def get_kernel_hooks(self, kernel, device):
+    def get_kernel_hooks(self, kernel, device: Device) -> KernelHooks:
        module_exec = self.execs.get((device.context, self.options["block_dim"]))
        if module_exec is not None:
            return module_exec.get_kernel_hooks(kernel)
@@ -2449,6 +2505,7 @@ class Event:
            raise RuntimeError(f"Device {device} is not a CUDA device")
 
        self.device = device
+        self.enable_timing = enable_timing
 
        if cuda_event is not None:
            self.cuda_event = cuda_event
@@ -2498,6 +2555,17 @@ class Event:
        else:
            raise RuntimeError(f"Device {self.device} does not support IPC.")
 
+    @property
+    def is_complete(self) -> bool:
+        """A boolean indicating whether all work on the stream when the event was recorded has completed.
+
+        This property may not be accessed during a graph capture on any stream.
+        """
+
+        result_code = runtime.core.cuda_event_query(self.cuda_event)
+
+        return result_code == 0
+
    def __del__(self):
        if not self.owner:
            return
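`Event` now remembers `enable_timing` (forwarded to `cuda_event_record` later in this diff) and exposes a non-blocking `is_complete` property backed by the new `cuda_event_query` binding. A sketch of using both together with the existing `wp.get_event_elapsed_time()` helper (assumes a CUDA device at `"cuda:0"`; the kernel and array size are illustrative)::

    import warp as wp

    @wp.kernel
    def inc(a: wp.array(dtype=float)):
        tid = wp.tid()
        a[tid] = a[tid] + 1.0

    a = wp.zeros(1 << 20, dtype=float, device="cuda:0")
    stream = wp.get_stream("cuda:0")

    start = wp.Event("cuda:0", enable_timing=True)
    stop = wp.Event("cuda:0", enable_timing=True)

    stream.record_event(start)
    wp.launch(inc, dim=a.shape, inputs=[a], stream=stream)
    stream.record_event(stop)

    while not stop.is_complete:  # non-blocking poll; other host work could go here
        pass

    print(wp.get_event_elapsed_time(start, stop), "ms")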
@@ -2512,7 +2580,7 @@ class Stream:
        instance.owner = False
        return instance
 
-    def __init__(self, device:
+    def __init__(self, device: Union["Device", str, None] = None, priority: int = 0, **kwargs):
        """Initialize the stream on a device with an optional specified priority.
 
        Args:
@@ -2528,7 +2596,7 @@ class Stream:
        Raises:
            RuntimeError: If function is called before Warp has completed
                initialization with a ``device`` that is not an instance of
-                :class:`Device
+                :class:`Device <warp.context.Device>`.
            RuntimeError: ``device`` is not a CUDA Device.
            RuntimeError: The stream could not be created on the device.
            TypeError: The requested stream priority is not an integer.
@@ -2596,7 +2664,7 @@ class Stream:
                f"Event from device {event.device} cannot be recorded on stream from device {self.device}"
            )
 
-        runtime.core.cuda_event_record(event.cuda_event, self.cuda_stream)
+        runtime.core.cuda_event_record(event.cuda_event, self.cuda_stream, event.enable_timing)
 
        return event
 
@@ -2630,6 +2698,17 @@ class Stream:
 
        runtime.core.cuda_stream_wait_stream(self.cuda_stream, other_stream.cuda_stream, event.cuda_event)
 
+    @property
+    def is_complete(self) -> bool:
+        """A boolean indicating whether all work on the stream has completed.
+
+        This property may not be accessed during a graph capture on any stream.
+        """
+
+        result_code = runtime.core.cuda_stream_query(self.cuda_stream)
+
+        return result_code == 0
+
    @property
    def is_capturing(self) -> bool:
        """A boolean indicating whether a graph capture is currently ongoing on this stream."""
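`Stream.is_complete` wraps the new `cuda_stream_query` binding in the same way, letting the host check whether all submitted work has drained without blocking. A sketch (the kernel and sizes are illustrative)::

    import warp as wp

    @wp.kernel
    def fill(a: wp.array(dtype=float), value: float):
        tid = wp.tid()
        a[tid] = value

    s = wp.Stream("cuda:0")
    a = wp.zeros(1 << 20, dtype=float, device="cuda:0")

    wp.launch(fill, dim=a.shape, inputs=[a, 3.0], stream=s)

    if not s.is_complete:  # non-blocking; not valid during graph capture
        wp.synchronize_stream(s)  # fall back to a blocking wait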
@@ -2952,18 +3031,14 @@ Devicelike = Union[Device, str, None]
 
 
 class Graph:
-    def __new__(cls, *args, **kwargs):
-        instance = super(Graph, cls).__new__(cls)
-        instance.graph_exec = None
-        return instance
-
    def __init__(self, device: Device, capture_id: int):
        self.device = device
        self.capture_id = capture_id
-        self.module_execs = set()
+        self.module_execs: Set[ModuleExec] = set()
+        self.graph_exec: Optional[ctypes.c_void_p] = None
 
    def __del__(self):
-        if not self.graph_exec:
+        if not hasattr(self, "graph_exec") or not hasattr(self, "device") or not self.graph_exec:
            return
 
        # use CUDA context guard to avoid side effects during garbage collection
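`Graph` now initializes `graph_exec` and its module set in `__init__` and guards `__del__` against partially constructed instances; graphs themselves are still produced through the capture API, e.g. (a sketch assuming a CUDA-capable device; kernel and sizes are illustrative)::

    import warp as wp

    @wp.kernel
    def inc(a: wp.array(dtype=float)):
        tid = wp.tid()
        a[tid] = a[tid] + 1.0

    a = wp.zeros(1024, dtype=float, device="cuda:0")

    with wp.ScopedCapture(device="cuda:0") as capture:
        wp.launch(inc, dim=a.shape, inputs=[a], device="cuda:0")

    # capture.graph is a warp.context.Graph; wp.capture_launch() replays it
    wp.capture_launch(capture.graph)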
@@ -3205,6 +3280,43 @@ class Runtime:
        self.core.radix_sort_pairs_float_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
        self.core.radix_sort_pairs_float_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
 
+        self.core.radix_sort_pairs_int64_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
+        self.core.radix_sort_pairs_int64_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
+
+        self.core.segmented_sort_pairs_int_host.argtypes = [
+            ctypes.c_uint64,
+            ctypes.c_uint64,
+            ctypes.c_int,
+            ctypes.c_uint64,
+            ctypes.c_uint64,
+            ctypes.c_int,
+        ]
+        self.core.segmented_sort_pairs_int_device.argtypes = [
+            ctypes.c_uint64,
+            ctypes.c_uint64,
+            ctypes.c_int,
+            ctypes.c_uint64,
+            ctypes.c_uint64,
+            ctypes.c_int,
+        ]
+
+        self.core.segmented_sort_pairs_float_host.argtypes = [
+            ctypes.c_uint64,
+            ctypes.c_uint64,
+            ctypes.c_int,
+            ctypes.c_uint64,
+            ctypes.c_uint64,
+            ctypes.c_int,
+        ]
+        self.core.segmented_sort_pairs_float_device.argtypes = [
+            ctypes.c_uint64,
+            ctypes.c_uint64,
+            ctypes.c_int,
+            ctypes.c_uint64,
+            ctypes.c_uint64,
+            ctypes.c_int,
+        ]
+
        self.core.runlength_encode_int_host.argtypes = [
            ctypes.c_uint64,
            ctypes.c_uint64,
@@ -3285,26 +3397,6 @@ class Runtime:
        self.core.hash_grid_update_device.argtypes = [ctypes.c_uint64, ctypes.c_float, ctypes.c_void_p]
        self.core.hash_grid_reserve_device.argtypes = [ctypes.c_uint64, ctypes.c_int]
 
-        self.core.cutlass_gemm.argtypes = [
-            ctypes.c_void_p,
-            ctypes.c_int,
-            ctypes.c_int,
-            ctypes.c_int,
-            ctypes.c_int,
-            ctypes.c_char_p,
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_float,
-            ctypes.c_float,
-            ctypes.c_bool,
-            ctypes.c_bool,
-            ctypes.c_bool,
-            ctypes.c_int,
-        ]
-        self.core.cutlass_gemm.restype = ctypes.c_bool
-
        self.core.volume_create_host.argtypes = [ctypes.c_void_p, ctypes.c_uint64, ctypes.c_bool, ctypes.c_bool]
        self.core.volume_create_host.restype = ctypes.c_uint64
        self.core.volume_get_tiles_host.argtypes = [
@@ -3335,36 +3427,18 @@ class Runtime:
        ]
        self.core.volume_destroy_device.argtypes = [ctypes.c_uint64]
 
-        self.core.
+        self.core.volume_from_tiles_device.argtypes = [
            ctypes.c_void_p,
            ctypes.c_void_p,
            ctypes.c_int,
            ctypes.c_float * 9,
            ctypes.c_float * 3,
            ctypes.c_bool,
-            ctypes.c_float,
-        ]
-        self.core.volume_f_from_tiles_device.restype = ctypes.c_uint64
-        self.core.volume_v_from_tiles_device.argtypes = [
            ctypes.c_void_p,
-            ctypes.
-            ctypes.
-            ctypes.c_float * 9,
-            ctypes.c_float * 3,
-            ctypes.c_bool,
-            ctypes.c_float * 3,
-        ]
-        self.core.volume_v_from_tiles_device.restype = ctypes.c_uint64
-        self.core.volume_i_from_tiles_device.argtypes = [
-            ctypes.c_void_p,
-            ctypes.c_void_p,
-            ctypes.c_int,
-            ctypes.c_float * 9,
-            ctypes.c_float * 3,
-            ctypes.c_bool,
-            ctypes.c_int,
+            ctypes.c_uint32,
+            ctypes.c_char_p,
        ]
-        self.core.
+        self.core.volume_from_tiles_device.restype = ctypes.c_uint64
        self.core.volume_index_from_tiles_device.argtypes = [
            ctypes.c_void_p,
            ctypes.c_void_p,
@@ -3433,6 +3507,7 @@ class Runtime:
            ctypes.POINTER(ctypes.c_int),  # tpl_cols
            ctypes.c_void_p,  # tpl_values
            ctypes.c_bool,  # prune_numerical_zeros
+            ctypes.c_bool,  # masked
            ctypes.POINTER(ctypes.c_int),  # bsr_offsets
            ctypes.POINTER(ctypes.c_int),  # bsr_columns
            ctypes.c_void_p,  # bsr_values
@@ -3467,8 +3542,6 @@ class Runtime:
        self.core.is_cuda_enabled.restype = ctypes.c_int
        self.core.is_cuda_compatibility_enabled.argtypes = None
        self.core.is_cuda_compatibility_enabled.restype = ctypes.c_int
-        self.core.is_cutlass_enabled.argtypes = None
-        self.core.is_cutlass_enabled.restype = ctypes.c_int
        self.core.is_mathdx_enabled.argtypes = None
        self.core.is_mathdx_enabled.restype = ctypes.c_int
 
@@ -3502,6 +3575,10 @@ class Runtime:
        self.core.cuda_device_set_mempool_release_threshold.restype = ctypes.c_int
        self.core.cuda_device_get_mempool_release_threshold.argtypes = [ctypes.c_int]
        self.core.cuda_device_get_mempool_release_threshold.restype = ctypes.c_uint64
+        self.core.cuda_device_get_mempool_used_mem_current.argtypes = [ctypes.c_int]
+        self.core.cuda_device_get_mempool_used_mem_current.restype = ctypes.c_uint64
+        self.core.cuda_device_get_mempool_used_mem_high.argtypes = [ctypes.c_int]
+        self.core.cuda_device_get_mempool_used_mem_high.restype = ctypes.c_uint64
        self.core.cuda_device_get_memory_info.argtypes = [ctypes.c_int, ctypes.c_void_p, ctypes.c_void_p]
        self.core.cuda_device_get_memory_info.restype = None
        self.core.cuda_device_get_uuid.argtypes = [ctypes.c_int, ctypes.c_char * 16]
@@ -3571,6 +3648,8 @@ class Runtime:
        self.core.cuda_stream_create.restype = ctypes.c_void_p
        self.core.cuda_stream_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
        self.core.cuda_stream_destroy.restype = None
+        self.core.cuda_stream_query.argtypes = [ctypes.c_void_p]
+        self.core.cuda_stream_query.restype = ctypes.c_int
        self.core.cuda_stream_register.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
        self.core.cuda_stream_register.restype = None
        self.core.cuda_stream_unregister.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
@@ -3592,7 +3671,9 @@ class Runtime:
        self.core.cuda_event_create.restype = ctypes.c_void_p
        self.core.cuda_event_destroy.argtypes = [ctypes.c_void_p]
        self.core.cuda_event_destroy.restype = None
-        self.core.
+        self.core.cuda_event_query.argtypes = [ctypes.c_void_p]
+        self.core.cuda_event_query.restype = ctypes.c_int
+        self.core.cuda_event_record.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_bool]
        self.core.cuda_event_record.restype = None
        self.core.cuda_event_synchronize.argtypes = [ctypes.c_void_p]
        self.core.cuda_event_synchronize.restype = None
@@ -3841,9 +3922,20 @@ class Runtime:
                cuda_device_count = len(self.cuda_devices)
            else:
                self.set_default_device("cuda:0")
+
+                # the minimum PTX architecture that supports all of Warp's features
+                self.default_ptx_arch = 75
+
+                # Update the default PTX architecture based on devices present in the system.
+                # Use the lowest architecture among devices that meet the minimum architecture requirement.
+                # Devices below the required minimum will use the highest architecture they support.
+                eligible_archs = [d.arch for d in self.cuda_devices if d.arch >= self.default_ptx_arch]
+                if eligible_archs:
+                    self.default_ptx_arch = min(eligible_archs)
        else:
            # CUDA not available
            self.set_default_device("cpu")
+            self.default_ptx_arch = None
 
        # initialize kernel cache
        warp.build.init_kernel_cache(warp.config.kernel_cache_dir)
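The PTX architecture selection above can be steered from user code: when `warp.config.ptx_target_arch` is set it is clamped to the device architecture, otherwise the runtime's `default_ptx_arch` computed here is used. A sketch, assuming the flag is set before any module is compiled (the value 80 is illustrative)::

    import warp as wp

    # generate PTX for compute capability 8.0 instead of the runtime default
    wp.config.ptx_target_arch = 80

    wp.init()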
@@ -3856,6 +3948,11 @@ class Runtime:
        greeting = []
 
        greeting.append(f"Warp {warp.config.version} initialized:")
+
+        # Add git commit hash to greeting if available
+        if warp.config._git_commit_hash is not None:
+            greeting.append(f"   Git commit: {warp.config._git_commit_hash}")
+
        if cuda_device_count > 0:
            # print CUDA version info
            greeting.append(
@@ -4208,7 +4305,7 @@ def set_device(ident: Devicelike) -> None:
    device.make_current()
 
 
-def map_cuda_device(alias: str, context: ctypes.c_void_p = None) -> Device:
+def map_cuda_device(alias: str, context: Optional[ctypes.c_void_p] = None) -> Device:
    """Assign a device alias to a CUDA context.
 
    This function can be used to create a wp.Device for an external CUDA context.
@@ -4236,7 +4333,13 @@ def unmap_cuda_device(alias: str) -> None:
 
 
 def is_mempool_supported(device: Devicelike) -> bool:
-    """Check if CUDA memory pool allocators are available on the device.
+    """Check if CUDA memory pool allocators are available on the device.
+
+    Parameters:
+        device: The :class:`Device <warp.context.Device>` or device identifier
+            for which the query is to be performed.
+            If ``None``, the default device will be used.
+    """
 
    init()
 
@@ -4246,7 +4349,13 @@ def is_mempool_supported(device: Devicelike) -> bool:
 
 
 def is_mempool_enabled(device: Devicelike) -> bool:
-    """Check if CUDA memory pool allocators are enabled on the device.
+    """Check if CUDA memory pool allocators are enabled on the device.
+
+    Parameters:
+        device: The :class:`Device <warp.context.Device>` or device identifier
+            for which the query is to be performed.
+            If ``None``, the default device will be used.
+    """
 
    init()
 
@@ -4266,6 +4375,11 @@ def set_mempool_enabled(device: Devicelike, enable: bool) -> None:
    to Warp. The preferred solution is to enable memory pool access using :func:`set_mempool_access_enabled`.
    If peer access is not supported, then the default CUDA allocators must be used to pre-allocate the memory
    prior to graph capture.
+
+    Parameters:
+        device: The :class:`Device <warp.context.Device>` or device identifier
+            for which the operation is to be performed.
+            If ``None``, the default device will be used.
    """
 
    init()
@@ -4296,6 +4410,18 @@ def set_mempool_release_threshold(device: Devicelike, threshold: Union[int, floa
    Values between 0 and 1 are interpreted as fractions of available memory. For example, 0.5 means
    half of the device's physical memory. Greater values are interpreted as an absolute number of bytes.
    For example, 1024**3 means one GiB of memory.
+
+    Parameters:
+        device: The :class:`Device <warp.context.Device>` or device identifier
+            for which the operation is to be performed.
+            If ``None``, the default device will be used.
+        threshold: An integer representing a number of bytes, or a ``float`` between 0 and 1,
+            specifying the desired release threshold.
+
+    Raises:
+        ValueError: If ``device`` is not a CUDA device.
+        RuntimeError: If ``device`` is a CUDA device, but does not support memory pools.
+        RuntimeError: Failed to set the memory pool release threshold.
    """
 
    init()
@@ -4317,8 +4443,21 @@ def set_mempool_release_threshold(device: Devicelike, threshold: Union[int, floa
|
|
|
4317
4443
|
raise RuntimeError(f"Failed to set memory pool release threshold for device {device}")
|
|
4318
4444
|
|
|
4319
4445
|
|
|
4320
|
-
def get_mempool_release_threshold(device: Devicelike) -> int:
|
|
4321
|
-
"""Get the CUDA memory pool release threshold on the device
|
|
4446
|
+
def get_mempool_release_threshold(device: Devicelike = None) -> int:
|
|
4447
|
+
"""Get the CUDA memory pool release threshold on the device.
|
|
4448
|
+
|
|
4449
|
+
Parameters:
|
|
4450
|
+
device: The :class:`Device <warp.context.Device>` or device identifier
|
|
4451
|
+
for which the query is to be performed.
|
|
4452
|
+
If ``None``, the default device will be used.
|
|
4453
|
+
|
|
4454
|
+
Returns:
|
|
4455
|
+
The memory pool release threshold in bytes.
|
|
4456
|
+
|
|
4457
|
+
Raises:
|
|
4458
|
+
ValueError: If ``device`` is not a CUDA device.
|
|
4459
|
+
RuntimeError: If ``device`` is a CUDA device, but does not support memory pools.
|
|
4460
|
+
"""
|
|
4322
4461
|
|
|
4323
4462
|
init()
|
|
4324
4463
|
|
|
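The expanded docstring spells out the fraction-versus-bytes interpretation of `threshold`. A short sketch, again with `"cuda:0"` as a placeholder device:

```python
import warp as wp

wp.init()
device = "cuda:0"  # placeholder device identifier

if wp.is_mempool_supported(device):
    # a float between 0 and 1 is a fraction of the device's physical memory ...
    wp.set_mempool_release_threshold(device, 0.5)
    # ... while larger values are an absolute byte count (here 1 GiB)
    wp.set_mempool_release_threshold(device, 1024**3)
    print(wp.get_mempool_release_threshold(device))  # threshold in bytes
```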
@@ -4333,6 +4472,64 @@ def get_mempool_release_threshold(device: Devicelike) -> int:
     return runtime.core.cuda_device_get_mempool_release_threshold(device.ordinal)


+def get_mempool_used_mem_current(device: Devicelike = None) -> int:
+    """Get the amount of memory from the device's memory pool that is currently in use by the application.
+
+    Parameters:
+        device: The :class:`Device <warp.context.Device>` or device identifier
+            for which the query is to be performed.
+            If ``None``, the default device will be used.
+
+    Returns:
+        The amount of memory used in bytes.
+
+    Raises:
+        ValueError: If ``device`` is not a CUDA device.
+        RuntimeError: If ``device`` is a CUDA device, but does not support memory pools.
+    """
+
+    init()
+
+    device = runtime.get_device(device)
+
+    if not device.is_cuda:
+        raise ValueError("Memory pools are only supported on CUDA devices")
+
+    if not device.is_mempool_supported:
+        raise RuntimeError(f"Device {device} does not support memory pools")
+
+    return runtime.core.cuda_device_get_mempool_used_mem_current(device.ordinal)
+
+
+def get_mempool_used_mem_high(device: Devicelike = None) -> int:
+    """Get the application's memory usage high-water mark from the device's CUDA memory pool.
+
+    Parameters:
+        device: The :class:`Device <warp.context.Device>` or device identifier
+            for which the query is to be performed.
+            If ``None``, the default device will be used.
+
+    Returns:
+        The high-water mark of memory used from the memory pool in bytes.
+
+    Raises:
+        ValueError: If ``device`` is not a CUDA device.
+        RuntimeError: If ``device`` is a CUDA device, but does not support memory pools.
+    """
+
+    init()
+
+    device = runtime.get_device(device)
+
+    if not device.is_cuda:
+        raise ValueError("Memory pools are only supported on CUDA devices")
+
+    if not device.is_mempool_supported:
+        raise RuntimeError(f"Device {device} does not support memory pools")
+
+    return runtime.core.cuda_device_get_mempool_used_mem_high(device.ordinal)
+
+
 def is_peer_access_supported(target_device: Devicelike, peer_device: Devicelike) -> bool:
     """Check if `peer_device` can directly access the memory of `target_device` on this system.

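A sketch of how the two new queries might be used, assuming they are re-exported at the top level as `wp.get_mempool_used_mem_current` / `wp.get_mempool_used_mem_high` and that a CUDA device with memory-pool support is available:

```python
import warp as wp

wp.init()
device = "cuda:0"  # placeholder device identifier

if wp.is_mempool_supported(device):
    a = wp.zeros(1 << 20, dtype=float, device=device)  # allocate from the pool
    print("currently in use:", wp.get_mempool_used_mem_current(device))
    print("high-water mark: ", wp.get_mempool_used_mem_high(device))
```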
@@ -4535,7 +4732,7 @@ def wait_event(event: Event):
     get_stream().wait_event(event)


-def get_event_elapsed_time(start_event: Event, end_event: Event, synchronize:
+def get_event_elapsed_time(start_event: Event, end_event: Event, synchronize: bool = True):
     """Get the elapsed time between two recorded events.

     Both events must have been previously recorded with
@@ -4560,7 +4757,7 @@ def get_event_elapsed_time(start_event: Event, end_event: Event, synchronize: Op
     return runtime.core.cuda_event_elapsed_time(start_event.cuda_event, end_event.cuda_event)


-def wait_stream(other_stream: Stream, event: Event = None):
+def wait_stream(other_stream: Stream, event: Optional[Event] = None):
     """Convenience function for calling :meth:`Stream.wait_stream` on the current stream.

     Args:
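A minimal timing sketch for `get_event_elapsed_time` with its `synchronize` default; the kernel, array size, and `"cuda:0"` device are illustrative only:

```python
import warp as wp

wp.init()

@wp.kernel
def scale(a: wp.array(dtype=float), s: float):
    i = wp.tid()
    a[i] = a[i] * s

with wp.ScopedDevice("cuda:0"):  # event timing requires a CUDA device
    a = wp.zeros(1 << 20, dtype=float)
    start, end = wp.Event(enable_timing=True), wp.Event(enable_timing=True)

    wp.record_event(start)
    wp.launch(scale, dim=a.size, inputs=[a, 2.0])
    wp.record_event(end)

    # synchronize=True (the default) waits for `end` to complete before measuring
    print("elapsed ms:", wp.get_event_elapsed_time(start, end))
```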
@@ -4727,7 +4924,7 @@ class RegisteredGLBuffer:


 def zeros(
-    shape: Tuple = None,
+    shape: Union[int, Tuple[int, ...], List[int], None] = None,
     dtype=float,
     device: Devicelike = None,
     requires_grad: bool = False,
@@ -4755,7 +4952,7 @@ def zeros(


 def zeros_like(
-    src:
+    src: Array, device: Devicelike = None, requires_grad: Optional[bool] = None, pinned: Optional[bool] = None
 ) -> warp.array:
     """Return a zero-initialized array with the same type and dimension of another array

@@ -4777,7 +4974,7 @@ def zeros_like(


 def ones(
-    shape: Tuple = None,
+    shape: Union[int, Tuple[int, ...], List[int], None] = None,
     dtype=float,
     device: Devicelike = None,
     requires_grad: bool = False,
@@ -4801,7 +4998,7 @@ def ones(


 def ones_like(
-    src:
+    src: Array, device: Devicelike = None, requires_grad: Optional[bool] = None, pinned: Optional[bool] = None
 ) -> warp.array:
     """Return a one-initialized array with the same type and dimension of another array

@@ -4819,7 +5016,7 @@ def ones_like(


 def full(
-    shape: Tuple = None,
+    shape: Union[int, Tuple[int, ...], List[int], None] = None,
     value=0,
     dtype=Any,
     device: Devicelike = None,
@@ -4885,7 +5082,11 @@ def full(


 def full_like(
-    src:
+    src: Array,
+    value: Any,
+    device: Devicelike = None,
+    requires_grad: Optional[bool] = None,
+    pinned: Optional[bool] = None,
 ) -> warp.array:
     """Return an array with all elements initialized to the given value with the same type and dimension of another array

@@ -4907,7 +5108,9 @@ def full_like(
     return arr


-def clone(
+def clone(
+    src: warp.array, device: Devicelike = None, requires_grad: Optional[bool] = None, pinned: Optional[bool] = None
+) -> warp.array:
     """Clone an existing array, allocates a copy of the src memory

     Args:
@@ -4928,7 +5131,7 @@ def clone(src: warp.array, device: Devicelike = None, requires_grad: bool = None


 def empty(
-    shape: Tuple = None,
+    shape: Union[int, Tuple[int, ...], List[int], None] = None,
     dtype=float,
     device: Devicelike = None,
     requires_grad: bool = False,
@@ -4961,7 +5164,7 @@ def empty(


 def empty_like(
-    src:
+    src: Array, device: Devicelike = None, requires_grad: Optional[bool] = None, pinned: Optional[bool] = None
 ) -> warp.array:
     """Return an uninitialized array with the same type and dimension of another array

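The array constructors now annotate `shape` as an int, tuple, or list, and the `*_like` helpers gain explicit `Optional` flags. A short sketch of the accepted forms (shapes and dtypes chosen arbitrarily):

```python
import warp as wp

wp.init()

a = wp.zeros(16, dtype=float)            # int -> 1-D array
b = wp.ones((4, 4), dtype=wp.float32)    # tuple -> 2-D array
c = wp.full([2, 3], 7, dtype=wp.int32)   # list shapes are accepted per the new annotation
d = wp.empty_like(b)                     # same shape/dtype/device as b
e = wp.clone(a, requires_grad=True)      # copy of a with gradients enabled
```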
@@ -5193,8 +5396,6 @@ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
         ) from e


-# represents all data required for a kernel launch
-# so that launches can be replayed quickly, use `wp.launch(..., record_cmd=True)`
 class Launch:
     """Represents all data required for a kernel launch so that launches can be replayed quickly.

@@ -5465,7 +5666,7 @@ def launch(
         max_blocks: The maximum number of CUDA thread blocks to use.
             Only has an effect for CUDA kernel launches.
            If negative or zero, the maximum hardware value will be used.
-        block_dim: The number of threads per block.
+        block_dim: The number of threads per block (always 1 for "cpu" devices).
     """

     init()
@@ -5476,6 +5677,9 @@ def launch(
     else:
         device = runtime.get_device(device)

+    if device == "cpu":
+        block_dim = 1
+
     # check function is a Kernel
     if not isinstance(kernel, Kernel):
         raise RuntimeError("Error launching kernel, can only launch functions decorated with @wp.kernel.")
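A sketch of how the `block_dim` launch argument behaves after this change: it is honored on CUDA devices and silently forced to 1 on `"cpu"`. The kernel and sizes are illustrative only:

```python
import warp as wp

wp.init()

@wp.kernel
def mul2(a: wp.array(dtype=float)):
    i = wp.tid()
    a[i] = a[i] * 2.0

a_gpu = wp.zeros(1024, dtype=float, device="cuda:0")
wp.launch(mul2, dim=a_gpu.size, inputs=[a_gpu], device="cuda:0", block_dim=128)

a_cpu = wp.zeros(1024, dtype=float, device="cpu")
# on "cpu" the requested block_dim is ignored and forced to 1
wp.launch(mul2, dim=a_cpu.size, inputs=[a_cpu], device="cpu", block_dim=128)
```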
@@ -5708,6 +5912,18 @@ def launch_tiled(*args, **kwargs):
            "Launch block dimension 'block_dim' argument should be passed via. keyword args for wp.launch_tiled()"
         )

+    if "device" in kwargs:
+        device = kwargs["device"]
+    else:
+        # todo: this doesn't consider the case where device
+        # is passed through positional args
+        device = None
+
+    # force the block_dim to 1 if running on "cpu"
+    device = runtime.get_device(device)
+    if device.is_cpu:
+        kwargs["block_dim"] = 1
+
     dim = kwargs["dim"]
     if not isinstance(dim, list):
         dim = list(dim) if isinstance(dim, tuple) else [dim]
@@ -5876,6 +6092,7 @@ def set_module_options(options: Dict[str, Any], module: Optional[Any] = None):

     * **mode**: The compilation mode to use, can be "debug", or "release", defaults to the value of ``warp.config.mode``.
     * **max_unroll**: The maximum fixed-size loop to unroll, defaults to the value of ``warp.config.max_unroll``.
+    * **block_dim**: The default number of threads to assign to each block

     Args:

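The new **block_dim** module option sets a default threads-per-block for the calling module's kernels. A minimal sketch of setting and reading it back:

```python
import warp as wp

# default number of threads to assign to each block for kernels in this module
wp.set_module_options({"block_dim": 256})
print(wp.get_module_options()["block_dim"])
```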
@@ -5901,7 +6118,12 @@ def get_module_options(module: Optional[Any] = None) -> Dict[str, Any]:
     return get_module(m.__name__).options


-def capture_begin(
+def capture_begin(
+    device: Devicelike = None,
+    stream: Optional[Stream] = None,
+    force_module_load: Optional[bool] = None,
+    external: bool = False,
+):
     """Begin capture of a CUDA graph

     Captures all subsequent kernel launches and memory operations on CUDA devices.
@@ -5968,16 +6190,15 @@ def capture_begin(device: Devicelike = None, stream=None, force_module_load=None
     runtime.captures[capture_id] = graph


-def capture_end(device: Devicelike = None, stream: Stream = None) -> Graph:
-    """
+def capture_end(device: Devicelike = None, stream: Optional[Stream] = None) -> Graph:
+    """End the capture of a CUDA graph.

     Args:
-
         device: The CUDA device where capture began
         stream: The CUDA stream where capture began

     Returns:
-        A Graph object that can be launched with :func:`~warp.capture_launch()`
+        A :class:`Graph` object that can be launched with :func:`~warp.capture_launch()`
     """

     if stream is not None:
@@ -6011,12 +6232,12 @@ def capture_end(device: Devicelike = None, stream: Stream = None) -> Graph:
     return graph


-def capture_launch(graph: Graph, stream: Stream = None):
+def capture_launch(graph: Graph, stream: Optional[Stream] = None):
     """Launch a previously captured CUDA graph

     Args:
-        graph: A Graph as returned by :func:`~warp.capture_end()`
-        stream: A Stream to launch the graph on
+        graph: A :class:`Graph` as returned by :func:`~warp.capture_end()`
+        stream: A :class:`Stream` to launch the graph on
     """

     if stream is not None:
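A minimal capture/replay sketch using the three functions touched above; the kernel, loop count, and `"cuda:0"` device are illustrative, and graph capture is assumed to be CUDA-only:

```python
import warp as wp

wp.init()

@wp.kernel
def inc(a: wp.array(dtype=float)):
    i = wp.tid()
    a[i] = a[i] + 1.0

with wp.ScopedDevice("cuda:0"):
    a = wp.zeros(1024, dtype=float)

    wp.capture_begin()
    try:
        wp.launch(inc, dim=a.size, inputs=[a])
    finally:
        graph = wp.capture_end()

    # replay the captured graph without re-issuing the launch
    for _ in range(10):
        wp.capture_launch(graph)
    wp.synchronize_device()
```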
@@ -6032,24 +6253,28 @@ def capture_launch(graph: Graph, stream: Stream = None):


 def copy(
-    dest: warp.array,
+    dest: warp.array,
+    src: warp.array,
+    dest_offset: int = 0,
+    src_offset: int = 0,
+    count: int = 0,
+    stream: Optional[Stream] = None,
 ):
     """Copy array contents from `src` to `dest`.

     Args:
-        dest: Destination array, must be at least as
+        dest: Destination array, must be at least as large as source buffer
         src: Source array
         dest_offset: Element offset in the destination array
         src_offset: Element offset in the source array
         count: Number of array elements to copy (will copy all elements if set to 0)
-        stream: The stream on which to perform the copy
+        stream: The stream on which to perform the copy

            The stream, if specified, can be from any device. If the stream is omitted, then Warp selects a stream based on the following rules:
            (1) If the destination array is on a CUDA device, use the current stream on the destination device.
            (2) Otherwise, if the source array is on a CUDA device, use the current stream on the source device.

            If neither source nor destination are on a CUDA device, no stream is used for the copy.
-
     """

     from warp.context import runtime
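A short sketch of the offset/count semantics documented above, using small CPU arrays for simplicity:

```python
import warp as wp

wp.init()

src = wp.full(16, 7.0, dtype=float, device="cpu")
dest = wp.zeros(32, dtype=float, device="cpu")

# copy 8 elements, src[4:12] -> dest[10:18]; count=0 would copy everything
wp.copy(dest, src, dest_offset=10, src_offset=4, count=8)
```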
@@ -6274,8 +6499,8 @@ def type_str(t):
            return f"Transformation[{type_str(t._wp_scalar_type_)}]"

         raise TypeError("Invalid vector or matrix dimensions")
-    elif
-        args_repr = ", ".join(type_str(x) for x in
+    elif get_origin(t) in (list, tuple):
+        args_repr = ", ".join(type_str(x) for x in get_args(t))
        return f"{t._name}[{args_repr}]"
     elif t is Ellipsis:
         return "..."