warp-lang 1.7.2rc1__py3-none-win_amd64.whl → 1.8.0__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of warp-lang might be problematic.
- warp/__init__.py +3 -1
- warp/__init__.pyi +3489 -1
- warp/autograd.py +45 -122
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +241 -252
- warp/build_dll.py +125 -26
- warp/builtins.py +1907 -384
- warp/codegen.py +257 -101
- warp/config.py +12 -1
- warp/constants.py +1 -1
- warp/context.py +657 -223
- warp/dlpack.py +1 -1
- warp/examples/benchmarks/benchmark_cloth.py +2 -2
- warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
- warp/examples/core/example_sample_mesh.py +1 -1
- warp/examples/core/example_spin_lock.py +93 -0
- warp/examples/core/example_work_queue.py +118 -0
- warp/examples/fem/example_adaptive_grid.py +5 -5
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +1 -1
- warp/examples/fem/example_convection_diffusion.py +9 -6
- warp/examples/fem/example_darcy_ls_optimization.py +489 -0
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion.py +2 -2
- warp/examples/fem/example_diffusion_3d.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_elastic_shape_optimization.py +387 -0
- warp/examples/fem/example_magnetostatics.py +5 -3
- warp/examples/fem/example_mixed_elasticity.py +5 -3
- warp/examples/fem/example_navier_stokes.py +11 -9
- warp/examples/fem/example_nonconforming_contact.py +5 -3
- warp/examples/fem/example_streamlines.py +8 -3
- warp/examples/fem/utils.py +9 -8
- warp/examples/interop/example_jax_ffi_callback.py +2 -2
- warp/examples/optim/example_drone.py +1 -1
- warp/examples/sim/example_cloth.py +1 -1
- warp/examples/sim/example_cloth_self_contact.py +48 -54
- warp/examples/tile/example_tile_block_cholesky.py +502 -0
- warp/examples/tile/example_tile_cholesky.py +2 -1
- warp/examples/tile/example_tile_convolution.py +1 -1
- warp/examples/tile/example_tile_filtering.py +1 -1
- warp/examples/tile/example_tile_matmul.py +1 -1
- warp/examples/tile/example_tile_mlp.py +2 -0
- warp/fabric.py +7 -7
- warp/fem/__init__.py +5 -0
- warp/fem/adaptivity.py +1 -1
- warp/fem/cache.py +152 -63
- warp/fem/dirichlet.py +2 -2
- warp/fem/domain.py +136 -6
- warp/fem/field/field.py +141 -99
- warp/fem/field/nodal_field.py +85 -39
- warp/fem/field/virtual.py +97 -52
- warp/fem/geometry/adaptive_nanogrid.py +91 -86
- warp/fem/geometry/closest_point.py +13 -0
- warp/fem/geometry/deformed_geometry.py +102 -40
- warp/fem/geometry/element.py +56 -2
- warp/fem/geometry/geometry.py +323 -22
- warp/fem/geometry/grid_2d.py +157 -62
- warp/fem/geometry/grid_3d.py +116 -20
- warp/fem/geometry/hexmesh.py +86 -20
- warp/fem/geometry/nanogrid.py +166 -86
- warp/fem/geometry/partition.py +59 -25
- warp/fem/geometry/quadmesh.py +86 -135
- warp/fem/geometry/tetmesh.py +47 -119
- warp/fem/geometry/trimesh.py +77 -270
- warp/fem/integrate.py +107 -52
- warp/fem/linalg.py +25 -58
- warp/fem/operator.py +124 -27
- warp/fem/quadrature/pic_quadrature.py +36 -14
- warp/fem/quadrature/quadrature.py +40 -16
- warp/fem/space/__init__.py +1 -1
- warp/fem/space/basis_function_space.py +66 -46
- warp/fem/space/basis_space.py +17 -4
- warp/fem/space/dof_mapper.py +1 -1
- warp/fem/space/function_space.py +2 -2
- warp/fem/space/grid_2d_function_space.py +4 -1
- warp/fem/space/hexmesh_function_space.py +4 -2
- warp/fem/space/nanogrid_function_space.py +3 -1
- warp/fem/space/partition.py +11 -2
- warp/fem/space/quadmesh_function_space.py +4 -1
- warp/fem/space/restriction.py +5 -2
- warp/fem/space/shape/__init__.py +10 -8
- warp/fem/space/tetmesh_function_space.py +4 -1
- warp/fem/space/topology.py +52 -21
- warp/fem/space/trimesh_function_space.py +4 -1
- warp/fem/utils.py +53 -8
- warp/jax.py +1 -2
- warp/jax_experimental/ffi.py +12 -17
- warp/jax_experimental/xla_ffi.py +37 -24
- warp/math.py +171 -1
- warp/native/array.h +99 -0
- warp/native/builtin.h +174 -31
- warp/native/coloring.cpp +1 -1
- warp/native/exports.h +118 -63
- warp/native/intersect.h +3 -3
- warp/native/mat.h +5 -10
- warp/native/mathdx.cpp +11 -5
- warp/native/matnn.h +1 -123
- warp/native/quat.h +28 -4
- warp/native/sparse.cpp +121 -258
- warp/native/sparse.cu +181 -274
- warp/native/spatial.h +305 -17
- warp/native/tile.h +583 -72
- warp/native/tile_radix_sort.h +1108 -0
- warp/native/tile_reduce.h +237 -2
- warp/native/tile_scan.h +240 -0
- warp/native/tuple.h +189 -0
- warp/native/vec.h +6 -16
- warp/native/warp.cpp +36 -4
- warp/native/warp.cu +574 -51
- warp/native/warp.h +47 -74
- warp/optim/linear.py +5 -1
- warp/paddle.py +7 -8
- warp/py.typed +0 -0
- warp/render/render_opengl.py +58 -29
- warp/render/render_usd.py +124 -61
- warp/sim/__init__.py +9 -0
- warp/sim/collide.py +252 -78
- warp/sim/graph_coloring.py +8 -1
- warp/sim/import_mjcf.py +4 -3
- warp/sim/import_usd.py +11 -7
- warp/sim/integrator.py +5 -2
- warp/sim/integrator_euler.py +1 -1
- warp/sim/integrator_featherstone.py +1 -1
- warp/sim/integrator_vbd.py +751 -320
- warp/sim/integrator_xpbd.py +1 -1
- warp/sim/model.py +265 -260
- warp/sim/utils.py +10 -7
- warp/sparse.py +303 -166
- warp/tape.py +52 -51
- warp/tests/cuda/test_conditional_captures.py +1046 -0
- warp/tests/cuda/test_streams.py +1 -1
- warp/tests/geometry/test_volume.py +2 -2
- warp/tests/interop/test_dlpack.py +9 -9
- warp/tests/interop/test_jax.py +0 -1
- warp/tests/run_coverage_serial.py +1 -1
- warp/tests/sim/disabled_kinematics.py +2 -2
- warp/tests/sim/{test_vbd.py → test_cloth.py} +296 -113
- warp/tests/sim/test_collision.py +159 -51
- warp/tests/sim/test_coloring.py +15 -1
- warp/tests/test_array.py +254 -2
- warp/tests/test_array_reduce.py +2 -2
- warp/tests/test_atomic_cas.py +299 -0
- warp/tests/test_codegen.py +142 -19
- warp/tests/test_conditional.py +47 -1
- warp/tests/test_ctypes.py +0 -20
- warp/tests/test_devices.py +8 -0
- warp/tests/test_fabricarray.py +4 -2
- warp/tests/test_fem.py +58 -25
- warp/tests/test_func.py +42 -1
- warp/tests/test_grad.py +1 -1
- warp/tests/test_lerp.py +1 -3
- warp/tests/test_map.py +481 -0
- warp/tests/test_mat.py +1 -24
- warp/tests/test_quat.py +6 -15
- warp/tests/test_rounding.py +10 -38
- warp/tests/test_runlength_encode.py +7 -7
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +51 -2
- warp/tests/test_spatial.py +507 -1
- warp/tests/test_struct.py +2 -2
- warp/tests/test_tuple.py +265 -0
- warp/tests/test_types.py +2 -2
- warp/tests/test_utils.py +24 -18
- warp/tests/tile/test_tile.py +420 -1
- warp/tests/tile/test_tile_mathdx.py +518 -14
- warp/tests/tile/test_tile_reduce.py +213 -0
- warp/tests/tile/test_tile_shared_memory.py +130 -1
- warp/tests/tile/test_tile_sort.py +117 -0
- warp/tests/unittest_suites.py +4 -6
- warp/types.py +462 -308
- warp/utils.py +647 -86
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/METADATA +20 -6
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/RECORD +178 -166
- warp/stubs.py +0 -3381
- warp/tests/sim/test_xpbd.py +0 -399
- warp/tests/test_mlp.py +0 -282
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.0.dist-info}/top_level.txt +0 -0
warp/context.py
CHANGED
@@ -32,22 +32,7 @@ import typing
 import weakref
 from copy import copy as shallowcopy
 from pathlib import Path
-from typing import (
-    Any,
-    Callable,
-    Dict,
-    List,
-    Literal,
-    Mapping,
-    Optional,
-    Sequence,
-    Set,
-    Tuple,
-    TypeVar,
-    Union,
-    get_args,
-    get_origin,
-)
+from typing import Any, Callable, Dict, List, Literal, Mapping, Sequence, Tuple, TypeVar, Union, get_args, get_origin

 import numpy as np

@@ -84,7 +69,7 @@ def get_function_args(func):
 complex_type_hints = (Any, Callable, Tuple)
 sequence_types = (list, tuple)

-function_key_counts:
+function_key_counts: dict[str, int] = {}


 def generate_unique_function_identifier(key: str) -> str:
@@ -120,40 +105,41 @@ def generate_unique_function_identifier(key: str) -> str:
 class Function:
     def __init__(
         self,
-        func:
+        func: Callable | None,
         key: str,
         namespace: str,
-        input_types:
-        value_type:
-        value_func:
-        export_func:
-        dispatch_func:
-        lto_dispatch_func:
-        module:
+        input_types: dict[str, type | TypeVar] | None = None,
+        value_type: type | None = None,
+        value_func: Callable[[Mapping[str, type], Mapping[str, Any]], type] | None = None,
+        export_func: Callable[[dict[str, type]], dict[str, type]] | None = None,
+        dispatch_func: Callable | None = None,
+        lto_dispatch_func: Callable | None = None,
+        module: Module | None = None,
         variadic: bool = False,
-        initializer_list_func:
+        initializer_list_func: Callable[[dict[str, Any], type], bool] | None = None,
         export: bool = False,
+        source: str | None = None,
         doc: str = "",
         group: str = "",
         hidden: bool = False,
         skip_replay: bool = False,
         missing_grad: bool = False,
         generic: bool = False,
-        native_func:
-        defaults:
-        custom_replay_func:
-        native_snippet:
-        adj_native_snippet:
-        replay_snippet:
+        native_func: str | None = None,
+        defaults: dict[str, Any] | None = None,
+        custom_replay_func: Function | None = None,
+        native_snippet: str | None = None,
+        adj_native_snippet: str | None = None,
+        replay_snippet: str | None = None,
         skip_forward_codegen: bool = False,
         skip_reverse_codegen: bool = False,
         custom_reverse_num_input_args: int = -1,
         custom_reverse_mode: bool = False,
-        overloaded_annotations:
-        code_transformers:
+        overloaded_annotations: dict[str, type] | None = None,
+        code_transformers: list[ast.NodeTransformer] | None = None,
         skip_adding_overload: bool = False,
        require_original_output_arg: bool = False,
-        scope_locals:
+        scope_locals: dict[str, Any] | None = None,
     ):
         if code_transformers is None:
             code_transformers = []
@@ -178,7 +164,7 @@ class Function:
         self.native_snippet = native_snippet
         self.adj_native_snippet = adj_native_snippet
         self.replay_snippet = replay_snippet
-        self.custom_grad_func:
+        self.custom_grad_func: Function | None = None
         self.require_original_output_arg = require_original_output_arg
         self.generic_parent = None  # generic function that was used to instantiate this overload

@@ -194,7 +180,7 @@ class Function:
         )
         self.missing_grad = missing_grad  # whether builtin is missing a corresponding adjoint
         self.generic = generic
-        self.mangled_name:
+        self.mangled_name: str | None = None

         # allow registering functions with a different name in Python and native code
         if native_func is None:
@@ -211,12 +197,13 @@ class Function:
             # user-defined function

             # generic and concrete overload lookups by type signature
-            self.user_templates:
-            self.user_overloads:
+            self.user_templates: dict[str, Function] = {}
+            self.user_overloads: dict[str, Function] = {}

             # user defined (Python) function
             self.adj = warp.codegen.Adjoint(
                 func,
+                source=source,
                 is_user_function=True,
                 skip_forward_codegen=skip_forward_codegen,
                 skip_reverse_codegen=skip_reverse_codegen,
@@ -244,7 +231,7 @@ class Function:

             # embedded linked list of all overloads
             # the builtin_functions dictionary holds the list head for a given key (func name)
-            self.overloads:
+            self.overloads: list[Function] = []

             # builtin (native) function, canonicalize argument types
             if input_types is not None:
@@ -293,10 +280,11 @@ class Function:
             module.register_function(self, scope_locals, skip_adding_overload)

     def __call__(self, *args, **kwargs):
-
-
-
-
+        """Call this function from the CPython interpreter.
+
+        This is used to call built-in or user functions from the CPython
+        interpreter, rather than from within a kernel.
+        """

         if self.is_builtin() and self.mangled_name:
             # For each of this function's existing overloads, we attempt to pack
@@ -306,7 +294,23 @@ class Function:
                 if overload.generic:
                     continue

-
+                try:
+                    # Try to bind the given arguments to the function's signature.
+                    # This is not checking whether the argument types are matching,
+                    # rather it's just assigning each argument to the corresponding
+                    # function parameter.
+                    bound_args = self.signature.bind(*args, **kwargs)
+                except TypeError:
+                    continue
+
+                if self.defaults:
+                    # Populate the bound arguments with any default values.
+                    default_args = {k: v for k, v in self.defaults.items() if k not in bound_args.arguments}
+                    warp.codegen.apply_defaults(bound_args, default_args)
+
+                bound_args = tuple(bound_args.arguments.values())
+
+                success, return_value = call_builtin(overload, bound_args)
                 if success:
                     return return_value

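Note: the updated `Function.__call__` above binds the given arguments against the built-in's Python signature and applies any registered defaults before packing them into ctypes and dispatching through `call_builtin`. A minimal sketch of calling exported built-ins from the CPython interpreter (which built-ins are exported depends on the Warp build):

    import warp as wp

    wp.init()

    v = wp.vec3(1.0, 2.0, 3.0)

    # Dispatched through Function.__call__: arguments are bound to the
    # built-in's signature, defaults are applied, then packed into ctypes.
    print(wp.length(v))
    print(wp.dot(v, v))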
@@ -324,6 +328,9 @@ class Function:

         arguments = tuple(bound_args.arguments.values())

+        # Store the last runtime error we encountered from a function execution
+        last_execution_error = None
+
         # try and find a matching overload
         for overload in self.user_overloads.values():
             if len(overload.input_types) != len(arguments):
@@ -334,10 +341,25 @@ class Function:
                 # attempt to unify argument types with function template types
                 warp.types.infer_argument_types(arguments, template_types, arg_names)
                 return overload.func(*arguments)
-            except Exception:
+            except Exception as e:
+                # The function was callable but threw an error during its execution.
+                # This might be the intended overload, but it failed, or it might be the wrong overload.
+                # We save this specific error and continue, just in case another overload later in the
+                # list is a better match and doesn't fail.
+                last_execution_error = e
                 continue

-
+        if last_execution_error:
+            # Raise a new, more contextual RuntimeError, but link it to the
+            # original error that was caught. This preserves the original
+            # traceback and error type for easier debugging.
+            raise RuntimeError(
+                f"Error calling function '{self.key}'. No version succeeded. "
+                f"See above for the error from the last version that was tried."
+            ) from last_execution_error
+        else:
+            # We got here without ever calling an overload.func
+            raise RuntimeError(f"Error calling function '{self.key}', no overload found for arguments {args}")

         # user-defined function with no overloads
         if self.func is None:
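Note: with the change above, a user function overload that raises while being tried from Python no longer fails silently; the last exception is re-raised chained to a summary RuntimeError. A small illustrative sketch (the function names are hypothetical, not part of this diff):

    import warp as wp

    @wp.func
    def scale(x: float, s: float):
        return x * s

    @wp.func
    def scale(x: wp.vec3, s: float):
        return x * s

    # Calling a user function from the CPython interpreter tries each overload
    # in turn; if every candidate raises, the last error is re-raised chained to
    # "Error calling function ... No version succeeded."
    print(scale(2.0, 3.0))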
@@ -358,9 +380,6 @@ class Function:
             if warp.types.is_array(v) or v in complex_type_hints:
                 return False

-        if type(self.value_type) in sequence_types:
-            return False
-
         return True

     def mangle(self) -> str:
@@ -404,8 +423,12 @@ class Function:
         else:
             self.user_overloads[sig] = f

-    def get_overload(self, arg_types:
-
+    def get_overload(self, arg_types: list[type], kwarg_types: Mapping[str, type]) -> Function | None:
+        if self.is_builtin():
+            for f in self.overloads:
+                if warp.codegen.func_match_args(f, arg_types, kwarg_types):
+                    return f
+            return None

         for f in self.user_overloads.values():
             if warp.codegen.func_match_args(f, arg_types, kwarg_types):
@@ -439,7 +462,7 @@ class Function:
             overload_annotations[k] = warp.codegen.strip_reference(warp.codegen.get_arg_type(d))

         ovl = shallowcopy(f)
-        ovl.adj = warp.codegen.Adjoint(f.func, overload_annotations)
+        ovl.adj = warp.codegen.Adjoint(f.func, overload_annotations, source=f.adj.source)
         ovl.input_types = overload_annotations
         ovl.value_func = None
         ovl.generic_parent = f
@@ -475,11 +498,25 @@ def get_builtin_type(return_type: type) -> type:
     return return_type


-def
+def extract_return_value(value_type: type, value_ctype: type, ret: Any) -> Any:
+    if issubclass(value_ctype, ctypes.Array) or issubclass(value_ctype, ctypes.Structure):
+        # return vector types as ctypes
+        return ret
+
+    if value_type is warp.types.float16:
+        return warp.types.half_bits_to_float(ret.value)
+
+    return ret.value
+
+
+def call_builtin(func: Function, params: tuple) -> tuple[bool, Any]:
     uses_non_warp_array_type = False

     init()

+    if func.mangled_name is None:
+        return (False, None)
+
     # Retrieve the built-in function from Warp's dll.
     c_func = getattr(warp.context.runtime.core, func.mangled_name)

@@ -489,6 +526,8 @@ def call_builtin(func: Function, *params: Any) -> Tuple[bool, Any]:
     else:
         func_args = func.input_types

+    value_type = func.value_func(None, None)
+
     # Try gathering the parameters that the function expects and pack them
     # into their corresponding C types.
     c_params = []
@@ -604,9 +643,9 @@ def call_builtin(func: Function, *params: Any) -> Tuple[bool, Any]:

             if not (
                 isinstance(param, arg_type)
-                or (type(param) is float and arg_type is warp.types.float32)
-                or (type(param) is int and arg_type is warp.types.int32)
-                or (type(param) is bool and arg_type is warp.types.bool)
+                or (type(param) is float and arg_type is warp.types.float32)
+                or (type(param) is int and arg_type is warp.types.int32)
+                or (type(param) is bool and arg_type is warp.types.bool)
                 or warp.types.np_dtype_to_warp_type.get(getattr(param, "dtype", None)) is arg_type
             ):
                 return (False, None)
@@ -620,25 +659,18 @@ def call_builtin(func: Function, *params: Any) -> Tuple[bool, Any]:
             else:
                 c_params.append(arg_type._type_(param))

-    #
-    value_type = func.value_func(
+    # Retrieve the return type.
+    value_type = func.value_func(func_args, None)

-    if value_type
-
-
-
-
-
-        value_ctype = value_type
-    else:
-        # scalar type
-        value_ctype = value_type._type_
+    if value_type is not None:
+        if not isinstance(value_type, Sequence):
+            value_type = (value_type,)
+
+        value_ctype = tuple(warp.types.type_ctype(x) for x in value_type)
+        ret = tuple(x() for x in value_ctype)
+        ret_addr = tuple(ctypes.c_void_p(ctypes.addressof(x)) for x in ret)

-
-        ret = value_ctype()
-        ret_addr = ctypes.c_void_p(ctypes.addressof(ret))
-        c_params.append(ret_addr)
+        c_params.extend(ret_addr)

     # Call the built-in function from Warp's dll.
     c_func(*c_params)
@@ -653,17 +685,14 @@ def call_builtin(func: Function, *params: Any) -> Tuple[bool, Any]:
                 stacklevel=3,
             )

-    if
-
-    return (True, ret)
+    if value_type is None:
+        return (True, None)

-
-
-
-    value = ret.value
+    return_value = tuple(extract_return_value(x, y, z) for x, y, z in zip(value_type, value_ctype, ret))
+    if len(return_value) == 1:
+        return_value = return_value[0]

-
-    return (True, value)
+    return (True, return_value)


 class KernelHooks:
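Note: `call_builtin` now supports built-ins with multiple return values by allocating one ctypes slot per return type, appending one output address per slot, and unpacking each slot after the call. A self-contained ctypes sketch of that same pattern, using a made-up stand-in for the native function rather than Warp's DLL:

    import ctypes

    def fake_native_call(out_a_addr, out_b_addr):
        # Stand-in for a native function that writes results through void* slots.
        ctypes.cast(out_a_addr, ctypes.POINTER(ctypes.c_float))[0] = 1.5
        ctypes.cast(out_b_addr, ctypes.POINTER(ctypes.c_int))[0] = 7

    # One ctypes slot per return value, then one address per slot is appended
    # to the call parameters, mirroring c_params.extend(ret_addr) above.
    value_ctype = (ctypes.c_float, ctypes.c_int)
    ret = tuple(t() for t in value_ctype)
    ret_addr = tuple(ctypes.c_void_p(ctypes.addressof(r)) for r in ret)

    fake_native_call(*ret_addr)

    return_value = tuple(r.value for r in ret)
    print(return_value)  # (1.5, 7)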
@@ -677,7 +706,7 @@ class KernelHooks:

 # caches source and compiled entry points for a kernel (will be populated after module loads)
 class Kernel:
-    def __init__(self, func, key=None, module=None, options=None, code_transformers=None):
+    def __init__(self, func, key=None, module=None, options=None, code_transformers=None, source=None):
         self.func = func

         if module is None:
@@ -695,7 +724,7 @@ class Kernel:
         if code_transformers is None:
             code_transformers = []

-        self.adj = warp.codegen.Adjoint(func, transformers=code_transformers)
+        self.adj = warp.codegen.Adjoint(func, transformers=code_transformers, source=source)

         # check if generic
         self.is_generic = False
@@ -762,7 +791,7 @@ class Kernel:

         # instantiate this kernel with the given argument types
         ovl = shallowcopy(self)
-        ovl.adj = warp.codegen.Adjoint(self.func, overload_annotations)
+        ovl.adj = warp.codegen.Adjoint(self.func, overload_annotations, source=self.adj.source)
         ovl.is_generic = False
         ovl.overloads = {}
         ovl.sig = sig
@@ -798,7 +827,7 @@ class Kernel:


 # decorator to register function, @func
-def func(f:
+def func(f: Callable | None = None, *, name: str | None = None):
     def wrapper(f, *args, **kwargs):
         if name is None:
             key = warp.codegen.make_full_qualified_name(f)
@@ -831,7 +860,7 @@ def func(f: Optional[Callable] = None, *, name: Optional[str] = None):
         return wrapper(f)


-def func_native(snippet: str, adj_snippet:
+def func_native(snippet: str, adj_snippet: str | None = None, replay_snippet: str | None = None):
     """
     Decorator to register native code snippet, @func_native
     """
@@ -1015,10 +1044,10 @@ def func_replay(forward_fn):


 def kernel(
-    f:
+    f: Callable | None = None,
     *,
-    enable_backward:
-    module:
+    enable_backward: bool | None = None,
+    module: Module | Literal["unique"] | None = None,
 ):
     """
     Decorator to register a Warp kernel from a Python function.
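Note: the `kernel` decorator now spells `module` as `Module | Literal["unique"] | None`. A minimal sketch of the keyword form of the decorator (the kernel itself is illustrative):

    import warp as wp

    @wp.kernel(enable_backward=False, module="unique")
    def scale_array(values: wp.array(dtype=float), s: float):
        tid = wp.tid()
        values[tid] = values[tid] * s

    values = wp.zeros(8, dtype=float)
    wp.launch(scale_array, dim=values.shape, inputs=[values, 2.0])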
@@ -1181,7 +1210,7 @@ def overload(kernel, arg_types=Union[None, Dict[str, Any], List[Any]]):


 # native functions that are part of the Warp API
-builtin_functions:
+builtin_functions: dict[str, Function] = {}


 def get_generic_vtypes():
@@ -1204,13 +1233,13 @@ scalar_types.update({x: x._wp_scalar_type_ for x in warp.types.vector_types})

 def add_builtin(
     key: str,
-    input_types:
-    constraint:
-    value_type:
-    value_func:
-    export_func:
-    dispatch_func:
-    lto_dispatch_func:
+    input_types: dict[str, type | TypeVar] | None = None,
+    constraint: Callable[[Mapping[str, type]], bool] | None = None,
+    value_type: type | None = None,
+    value_func: Callable | None = None,
+    export_func: Callable | None = None,
+    dispatch_func: Callable | None = None,
+    lto_dispatch_func: Callable | None = None,
     doc: str = "",
     namespace: str = "wp::",
     variadic: bool = False,
@@ -1220,8 +1249,8 @@ def add_builtin(
     hidden: bool = False,
     skip_replay: bool = False,
     missing_grad: bool = False,
-    native_func:
-    defaults:
+    native_func: str | None = None,
+    defaults: dict[str, Any] | None = None,
     require_original_output_arg: bool = False,
 ):
     """Main entry point to register a new built-in function.
@@ -1371,18 +1400,13 @@ def add_builtin(

         return_type = value_func(concrete_arg_types, None)

-
-
-
-
-
-
-
-            and x._wp_type_params_ == return_type._wp_type_params_
-        )
-        if not return_type_match:
-            continue
-        return_type = return_type_match[0]
+        try:
+            if isinstance(return_type, Sequence):
+                return_type = tuple(get_builtin_type(x) for x in return_type)
+            else:
+                return_type = get_builtin_type(return_type)
+        except RuntimeError:
+            continue

         # finally we can generate a function call for these concrete types:
         add_builtin(
@@ -1485,7 +1509,7 @@ def register_api_function(


 # global dictionary of modules
-user_modules:
+user_modules: dict[str, Module] = {}


 def get_module(name: str) -> Module:
@@ -1608,7 +1632,7 @@ class ModuleHasher:
             ch.update(bytes(func.key, "utf-8"))

             # include all concrete and generic overloads
-            overloads:
+            overloads: dict[str, Function] = {**func.user_overloads, **func.user_templates}
             for sig in sorted(overloads.keys()):
                 ovl = overloads[sig]

@@ -1857,7 +1881,7 @@ class ModuleBuilder:
 # the original Modules get reloaded.
 class ModuleExec:
     def __new__(cls, *args, **kwargs):
-        instance = super(
+        instance = super().__new__(cls)
         instance.handle = None
         return instance

@@ -1952,7 +1976,7 @@ class ModuleExec:
 # creates a hash of the function to use for checking
 # build cache
 class Module:
-    def __init__(self, name:
+    def __init__(self, name: str | None, loader=None):
         self.name = name if name is not None else "None"

         self.loader = loader
@@ -1996,6 +2020,7 @@ class Module:
             "cuda_output": None,  # supported values: "ptx", "cubin", or None (automatic)
             "mode": warp.config.mode,
             "block_dim": 256,
+            "compile_time_trace": warp.config.compile_time_trace,
         }

         # Module dependencies are determined by scanning each function
@@ -2222,7 +2247,7 @@ class Module:
         ):
             builder_options = {
                 **self.options,
-                # Some of the
+                # Some of the tile codegen, such as cuFFTDx and cuBLASDx, requires knowledge of the target arch
                 "output_arch": output_arch,
             }
             builder = ModuleBuilder(self, builder_options, hasher=self.hashers[active_block_dim])
@@ -2291,6 +2316,7 @@ class Module:
                 fast_math=self.options["fast_math"],
                 fuse_fp=self.options["fuse_fp"],
                 lineinfo=self.options["lineinfo"],
+                compile_time_trace=self.options["compile_time_trace"],
                 ltoirs=builder.ltoirs.values(),
                 fatbins=builder.fatbins.values(),
             )
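Note: `compile_time_trace` is threaded from `warp.config` into the per-module option dictionary and down to the native `cuda_compile_program` call. A minimal sketch of enabling it, assuming the option is surfaced exactly as shown in this diff and a CUDA device is present:

    import warp as wp

    # Global default picked up when a module's option dict is created.
    wp.config.compile_time_trace = True

    # Or per module, through the regular module-option mechanism.
    wp.set_module_options({"compile_time_trace": True})

    @wp.kernel
    def noop():
        pass

    wp.load_module(device="cuda:0")  # triggers compilation with the trace flag set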
@@ -2343,7 +2369,7 @@ class Module:
         # Load CPU or CUDA binary

         meta_path = os.path.join(module_dir, f"{module_name_short}.meta")
-        with open(meta_path
+        with open(meta_path) as meta_file:
             meta = json.load(meta_file)

         if device.is_cpu:
@@ -2406,7 +2432,7 @@ class CpuDefaultAllocator:
     def alloc(self, size_in_bytes):
         ptr = runtime.core.alloc_host(size_in_bytes)
         if not ptr:
-            raise RuntimeError(f"Failed to allocate {size_in_bytes} bytes on device '
+            raise RuntimeError(f"Failed to allocate {size_in_bytes} bytes on device 'cpu'")
         return ptr

     def free(self, ptr, size_in_bytes):
@@ -2510,12 +2536,12 @@ class Event:

     def __new__(cls, *args, **kwargs):
         """Creates a new event instance."""
-        instance = super(
+        instance = super().__new__(cls)
         instance.owner = False
         return instance

     def __init__(
-        self, device:
+        self, device: Devicelike = None, cuda_event=None, enable_timing: bool = False, interprocess: bool = False
     ):
         """Initializes the event on a CUDA device.

@@ -2611,12 +2637,12 @@ class Event:

 class Stream:
     def __new__(cls, *args, **kwargs):
-        instance = super(
+        instance = super().__new__(cls)
         instance.cuda_stream = None
         instance.owner = False
         return instance

-    def __init__(self, device:
+    def __init__(self, device: Device | str | None = None, priority: int = 0, **kwargs):
         """Initialize the stream on a device with an optional specified priority.

         Args:
@@ -2682,7 +2708,7 @@ class Stream:
             self._cached_event = Event(self.device)
         return self._cached_event

-    def record_event(self, event:
+    def record_event(self, event: Event | None = None) -> Event:
         """Record an event onto the stream.

         Args:
@@ -2711,7 +2737,7 @@ class Stream:
         """
         runtime.core.cuda_stream_wait_event(self.cuda_stream, event.cuda_event)

-    def wait_stream(self, other_stream:
+    def wait_stream(self, other_stream: Stream, event: Event | None = None):
         """Records an event on `other_stream` and makes this stream wait on it.

         All work added to this stream after this function has been called will
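Note: the `Stream` and `Event` signatures above only change their annotations; usage is unchanged. A minimal sketch, assuming a CUDA device is available:

    import warp as wp

    with wp.ScopedDevice("cuda:0"):
        stream = wp.Stream(priority=-1)    # optional stream priority on the scoped device
        ev = stream.record_event()         # record_event() returns the Event it recorded
        wp.get_stream().wait_event(ev)     # make the current stream wait on that event
        wp.get_stream().wait_stream(stream)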
@@ -2765,6 +2791,8 @@ class Device:
             or ``"CPU"`` if the processor name cannot be determined.
         arch (int): The compute capability version number calculated as ``10 * major + minor``.
             ``0`` for CPU devices.
+        sm_count (int): The number of streaming multiprocessors on the CUDA device.
+            ``0`` for CPU devices.
         is_uva (bool): Indicates whether the device supports unified addressing.
             ``False`` for CPU devices.
         is_cubin_supported (bool): Indicates whether Warp's version of NVRTC can directly
@@ -2810,6 +2838,7 @@ class Device:
             # CPU device
             self.name = platform.processor() or "CPU"
             self.arch = 0
+            self.sm_count = 0
             self.is_uva = False
             self.is_mempool_supported = False
             self.is_mempool_enabled = False
@@ -2829,6 +2858,7 @@ class Device:
             # CUDA device
             self.name = runtime.core.cuda_device_get_name(ordinal).decode()
             self.arch = runtime.core.cuda_device_get_arch(ordinal)
+            self.sm_count = runtime.core.cuda_device_get_sm_count(ordinal)
             self.is_uva = runtime.core.cuda_device_is_uva(ordinal) > 0
             self.is_mempool_supported = runtime.core.cuda_device_is_mempool_supported(ordinal) > 0
             if platform.system() == "Linux":
@@ -3070,16 +3100,23 @@ class Graph:
     def __init__(self, device: Device, capture_id: int):
         self.device = device
         self.capture_id = capture_id
-        self.module_execs:
-        self.graph_exec:
+        self.module_execs: set[ModuleExec] = set()
+        self.graph_exec: ctypes.c_void_p | None = None
+
+        self.graph: ctypes.c_void_p | None = None
+        self.has_conditional = (
+            False  # Track if there are conditional nodes in the graph since they are not allowed in child graphs
+        )

     def __del__(self):
-        if not hasattr(self, "
+        if not hasattr(self, "graph") or not hasattr(self, "device") or not self.graph:
            return

         # use CUDA context guard to avoid side effects during garbage collection
         with self.device.context_guard:
-            runtime.core.cuda_graph_destroy(self.device.context, self.
+            runtime.core.cuda_graph_destroy(self.device.context, self.graph)
+            if hasattr(self, "graph_exec") and self.graph_exec is not None:
+                runtime.core.cuda_graph_exec_destroy(self.device.context, self.graph_exec)

     # retain executable CUDA modules used by this graph, which prevents them from being unloaded
     def retain_module_exec(self, module_exec: ModuleExec):
@@ -3088,8 +3125,6 @@ class Graph:

 class Runtime:
     def __init__(self):
-        if sys.version_info < (3, 8):
-            raise RuntimeError("Warp requires Python 3.8 as a minimum")
         if sys.version_info < (3, 9):
             warp.utils.warn(f"Python 3.9 or newer is recommended for running Warp, detected {sys.version_info}")

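Note: the new `Device.sm_count` attribute mirrors `cuda_device_get_sm_count()` and is `0` on CPU devices. Querying it:

    import warp as wp

    wp.init()

    for d in wp.get_cuda_devices():
        # sm_count is populated from cuda_device_get_sm_count() at startup.
        print(d.alias, d.name, d.arch, d.sm_count)

    print(wp.get_device("cpu").sm_count)  # CPU devices report 0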
@@ -3535,44 +3570,40 @@ class Runtime:
         self.core.volume_get_blind_data_info.restype = ctypes.c_char_p

         bsr_matrix_from_triplets_argtypes = [
-            ctypes.c_int,  #
-            ctypes.c_int,  #
+            ctypes.c_int,  # block_size
+            ctypes.c_int,  # scalar size in bytes
             ctypes.c_int,  # row_count
-            ctypes.c_int,  #
+            ctypes.c_int,  # col_count
+            ctypes.c_int,  # nnz_upper_bound
+            ctypes.POINTER(ctypes.c_int),  # tpl_nnz
             ctypes.POINTER(ctypes.c_int),  # tpl_rows
             ctypes.POINTER(ctypes.c_int),  # tpl_cols
             ctypes.c_void_p,  # tpl_values
-            ctypes.
+            ctypes.c_uint64,  # zero_value_mask
             ctypes.c_bool,  # masked
             ctypes.POINTER(ctypes.c_int),  # bsr_offsets
             ctypes.POINTER(ctypes.c_int),  # bsr_columns
-            ctypes.
+            ctypes.POINTER(ctypes.c_int),  # prefix sum of block count to sum for each bsr block
+            ctypes.POINTER(ctypes.c_int),  # indices to ptriplet blocks to sum for each bsr block
             ctypes.POINTER(ctypes.c_int),  # bsr_nnz
             ctypes.c_void_p,  # bsr_nnz_event
         ]

-        self.core.
-        self.core.
-        self.core.bsr_matrix_from_triplets_float_device.argtypes = bsr_matrix_from_triplets_argtypes
-        self.core.bsr_matrix_from_triplets_double_device.argtypes = bsr_matrix_from_triplets_argtypes
+        self.core.bsr_matrix_from_triplets_host.argtypes = bsr_matrix_from_triplets_argtypes
+        self.core.bsr_matrix_from_triplets_device.argtypes = bsr_matrix_from_triplets_argtypes

         bsr_transpose_argtypes = [
-            ctypes.c_int,  # rows_per_bock
-            ctypes.c_int,  # cols_per_blocks
             ctypes.c_int,  # row_count
             ctypes.c_int,  # col count
             ctypes.c_int,  # nnz
             ctypes.POINTER(ctypes.c_int),  # transposed_bsr_offsets
             ctypes.POINTER(ctypes.c_int),  # transposed_bsr_columns
-            ctypes.c_void_p,  # bsr_values
             ctypes.POINTER(ctypes.c_int),  # transposed_bsr_offsets
             ctypes.POINTER(ctypes.c_int),  # transposed_bsr_columns
-            ctypes.
+            ctypes.POINTER(ctypes.c_int),  # src to dest block map
         ]
-        self.core.
-        self.core.
-        self.core.bsr_transpose_float_device.argtypes = bsr_transpose_argtypes
-        self.core.bsr_transpose_double_device.argtypes = bsr_transpose_argtypes
+        self.core.bsr_transpose_host.argtypes = bsr_transpose_argtypes
+        self.core.bsr_transpose_device.argtypes = bsr_transpose_argtypes

         self.core.is_cuda_enabled.argtypes = None
         self.core.is_cuda_enabled.restype = ctypes.c_int
@@ -3601,6 +3632,8 @@ class Runtime:
         self.core.cuda_device_get_name.restype = ctypes.c_char_p
         self.core.cuda_device_get_arch.argtypes = [ctypes.c_int]
         self.core.cuda_device_get_arch.restype = ctypes.c_int
+        self.core.cuda_device_get_sm_count.argtypes = [ctypes.c_int]
+        self.core.cuda_device_get_sm_count.restype = ctypes.c_int
         self.core.cuda_device_is_uva.argtypes = [ctypes.c_int]
         self.core.cuda_device_is_uva.restype = ctypes.c_int
         self.core.cuda_device_is_mempool_supported.argtypes = [ctypes.c_int]
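Note: the per-scalar `bsr_matrix_from_triplets_{float,double}_{host,device}` and `bsr_transpose_*` entry points are replaced by single `_host`/`_device` bindings that take the block and scalar sizes as arguments. From Python this machinery is reached through `warp.sparse`; a minimal sketch, assuming the usual `bsr_from_triplets`/`bsr_transposed` signatures:

    import warp as wp
    import warp.sparse as sparse

    rows = wp.array([0, 1, 2], dtype=int)
    cols = wp.array([0, 1, 2], dtype=int)
    vals = wp.array([1.0, 2.0, 3.0], dtype=wp.float32)

    # 3x3 matrix of 1x1 blocks built from COO-style triplets; duplicate
    # coordinates are merged by the consolidated host/device kernels above.
    A = sparse.bsr_from_triplets(3, 3, rows, cols, vals)
    At = sparse.bsr_transposed(A)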
@@ -3724,11 +3757,72 @@ class Runtime:
             ctypes.POINTER(ctypes.c_void_p),
         ]
         self.core.cuda_graph_end_capture.restype = ctypes.c_bool
+
+        self.core.cuda_graph_create_exec.argtypes = [
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.POINTER(ctypes.c_void_p),
+        ]
+        self.core.cuda_graph_create_exec.restype = ctypes.c_bool
+
+        self.core.capture_debug_dot_print.argtypes = [ctypes.c_void_p, ctypes.c_char_p, ctypes.c_uint32]
+        self.core.capture_debug_dot_print.restype = ctypes.c_bool
+
         self.core.cuda_graph_launch.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
         self.core.cuda_graph_launch.restype = ctypes.c_bool
+        self.core.cuda_graph_exec_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
+        self.core.cuda_graph_exec_destroy.restype = ctypes.c_bool
+
         self.core.cuda_graph_destroy.argtypes = [ctypes.c_void_p, ctypes.c_void_p]
         self.core.cuda_graph_destroy.restype = ctypes.c_bool

+        self.core.cuda_graph_insert_if_else.argtypes = [
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.POINTER(ctypes.c_int),
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.POINTER(ctypes.c_void_p),
+        ]
+        self.core.cuda_graph_insert_if_else.restype = ctypes.c_bool
+
+        self.core.cuda_graph_insert_while.argtypes = [
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.POINTER(ctypes.c_int),
+            ctypes.POINTER(ctypes.c_void_p),
+            ctypes.POINTER(ctypes.c_uint64),
+        ]
+        self.core.cuda_graph_insert_while.restype = ctypes.c_bool
+
+        self.core.cuda_graph_set_condition.argtypes = [
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.POINTER(ctypes.c_int),
+            ctypes.c_uint64,
+        ]
+        self.core.cuda_graph_set_condition.restype = ctypes.c_bool
+
+        self.core.cuda_graph_pause_capture.argtypes = [
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.POINTER(ctypes.c_void_p),
+        ]
+        self.core.cuda_graph_pause_capture.restype = ctypes.c_bool
+
+        self.core.cuda_graph_resume_capture.argtypes = [
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+        ]
+        self.core.cuda_graph_resume_capture.restype = ctypes.c_bool
+
+        self.core.cuda_graph_insert_child_graph.argtypes = [
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+            ctypes.c_void_p,
+        ]
+        self.core.cuda_graph_insert_child_graph.restype = ctypes.c_bool
+
         self.core.cuda_compile_program.argtypes = [
             ctypes.c_char_p,  # cuda_src
             ctypes.c_char_p,  # program name
@@ -3742,6 +3836,7 @@ class Runtime:
             ctypes.c_bool,  # fast_math
             ctypes.c_bool,  # fuse_fp
             ctypes.c_bool,  # lineinfo
+            ctypes.c_bool,  # compile_time_trace
             ctypes.c_char_p,  # output_path
             ctypes.c_size_t,  # num_ltoirs
             ctypes.POINTER(ctypes.c_char_p),  # ltoirs
@@ -3796,11 +3891,17 @@ class Runtime:
             ctypes.c_int,  # arch
             ctypes.c_int,  # M
             ctypes.c_int,  # N
+            ctypes.c_int,  # NRHS
+            ctypes.c_int,  # function
+            ctypes.c_int,  # side
+            ctypes.c_int,  # diag
             ctypes.c_int,  # precision
+            ctypes.c_int,  # a_arrangement
+            ctypes.c_int,  # b_arrangement
             ctypes.c_int,  # fill_mode
             ctypes.c_int,  # num threads
         ]
-        self.core.
+        self.core.cuda_compile_solver.restype = ctypes.c_bool

         self.core.cuda_load_module.argtypes = [ctypes.c_void_p, ctypes.c_char_p]
         self.core.cuda_load_module.restype = ctypes.c_void_p
@@ -4270,7 +4371,7 @@ def is_cuda_driver_initialized() -> bool:
     return runtime.core.cuda_driver_is_initialized()


-def get_devices() ->
+def get_devices() -> list[Device]:
     """Returns a list of devices supported in this environment."""

     init()
@@ -4291,7 +4392,7 @@ def get_cuda_device_count() -> int:
     return len(runtime.cuda_devices)


-def get_cuda_device(ordinal:
+def get_cuda_device(ordinal: int | None = None) -> Device:
     """Returns the CUDA device with the given ordinal or the current CUDA device if ordinal is None."""

     init()
@@ -4302,7 +4403,7 @@ def get_cuda_device(ordinal: Union[int, None] = None) -> Device:
     return runtime.cuda_devices[ordinal]


-def get_cuda_devices() ->
+def get_cuda_devices() -> list[Device]:
     """Returns a list of CUDA devices supported in this environment."""

     init()
@@ -4341,7 +4442,7 @@ def set_device(ident: Devicelike) -> None:
     device.make_current()


-def map_cuda_device(alias: str, context:
+def map_cuda_device(alias: str, context: ctypes.c_void_p | None = None) -> Device:
     """Assign a device alias to a CUDA context.

     This function can be used to create a wp.Device for an external CUDA context.
@@ -4436,7 +4537,7 @@ def set_mempool_enabled(device: Devicelike, enable: bool) -> None:
         raise ValueError("Memory pools are only supported on CUDA devices")


-def set_mempool_release_threshold(device: Devicelike, threshold:
+def set_mempool_release_threshold(device: Devicelike, threshold: int | float) -> None:
     """Set the CUDA memory pool release threshold on the device.

     This is the amount of reserved memory to hold onto before trying to release memory back to the OS.
@@ -4744,7 +4845,7 @@ def set_stream(stream: Stream, device: Devicelike = None, sync: bool = False) ->
     get_device(device).set_stream(stream, sync=sync)


-def record_event(event:
+def record_event(event: Event | None = None):
     """Convenience function for calling :meth:`Stream.record_event` on the current stream.

     Args:
@@ -4793,7 +4894,7 @@ def get_event_elapsed_time(start_event: Event, end_event: Event, synchronize: bo
     return runtime.core.cuda_event_elapsed_time(start_event.cuda_event, end_event.cuda_event)


-def wait_stream(other_stream: Stream, event:
+def wait_stream(other_stream: Stream, event: Event | None = None):
     """Convenience function for calling :meth:`Stream.wait_stream` on the current stream.

     Args:
@@ -4863,7 +4964,7 @@ class RegisteredGLBuffer:
     __fallback_warning_shown = False

     def __new__(cls, *args, **kwargs):
-        instance = super(
+        instance = super().__new__(cls)
         instance.resource = None
         return instance
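Note: `set_mempool_release_threshold` now annotates the threshold as `int | float`: an integer is taken as a byte count, a float in [0, 1] as a fraction of available memory (per the existing docstring). A sketch:

    import warp as wp

    wp.init()

    for d in wp.get_cuda_devices():
        if d.is_mempool_supported:
            # 0.5 -> half of available memory; an int would mean a byte count.
            wp.set_mempool_release_threshold(d, 0.5)
            print(d.alias, wp.get_mempool_release_threshold(d))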
@@ -4960,8 +5061,8 @@ class RegisteredGLBuffer:


 def zeros(
-    shape:
-    dtype=float,
+    shape: int | tuple[int, ...] | list[int] | None = None,
+    dtype: type = float,
     device: Devicelike = None,
     requires_grad: bool = False,
     pinned: bool = False,
@@ -4988,7 +5089,7 @@ def zeros(


 def zeros_like(
-    src: Array, device: Devicelike = None, requires_grad:
+    src: Array, device: Devicelike = None, requires_grad: bool | None = None, pinned: bool | None = None
 ) -> warp.array:
     """Return a zero-initialized array with the same type and dimension of another array

@@ -5010,8 +5111,8 @@ def zeros_like(


 def ones(
-    shape:
-    dtype=float,
+    shape: int | tuple[int, ...] | list[int] | None = None,
+    dtype: type = float,
     device: Devicelike = None,
     requires_grad: bool = False,
     pinned: bool = False,
@@ -5034,7 +5135,7 @@ def ones(


 def ones_like(
-    src: Array, device: Devicelike = None, requires_grad:
+    src: Array, device: Devicelike = None, requires_grad: bool | None = None, pinned: bool | None = None
 ) -> warp.array:
     """Return a one-initialized array with the same type and dimension of another array

@@ -5052,7 +5153,7 @@ def ones_like(


 def full(
-    shape:
+    shape: int | tuple[int, ...] | list[int] | None = None,
     value=0,
     dtype=Any,
     device: Devicelike = None,
@@ -5121,8 +5222,8 @@ def full_like(
     src: Array,
     value: Any,
     device: Devicelike = None,
-    requires_grad:
-    pinned:
+    requires_grad: bool | None = None,
+    pinned: bool | None = None,
 ) -> warp.array:
     """Return an array with all elements initialized to the given value with the same type and dimension of another array

@@ -5145,7 +5246,7 @@ def full_like(


 def clone(
-    src: warp.array, device: Devicelike = None, requires_grad:
+    src: warp.array, device: Devicelike = None, requires_grad: bool | None = None, pinned: bool | None = None
 ) -> warp.array:
     """Clone an existing array, allocates a copy of the src memory

@@ -5167,7 +5268,7 @@ def clone(


 def empty(
-    shape:
+    shape: int | tuple[int, ...] | list[int] | None = None,
     dtype=float,
     device: Devicelike = None,
     requires_grad: bool = False,
@@ -5200,7 +5301,7 @@ def empty(


 def empty_like(
-    src: Array, device: Devicelike = None, requires_grad:
+    src: Array, device: Devicelike = None, requires_grad: bool | None = None, pinned: bool | None = None
 ) -> warp.array:
     """Return an uninitialized array with the same type and dimension of another array

@@ -5235,9 +5336,9 @@ def empty_like(

 def from_numpy(
     arr: np.ndarray,
-    dtype:
-    shape:
-    device:
+    dtype: type | None = None,
+    shape: Sequence[int] | None = None,
+    device: Devicelike | None = None,
     requires_grad: bool = False,
 ) -> warp.array:
     """Returns a Warp array created from a NumPy array.
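Note: the array constructors above only tighten their annotations (`shape: int | tuple[int, ...] | list[int] | None`, `requires_grad`/`pinned` as `bool | None`); behaviour is unchanged. A minimal sketch:

    import warp as wp

    a = wp.zeros(16, dtype=wp.float32, device="cpu")         # int shape
    b = wp.ones((4, 4), dtype=wp.vec3, requires_grad=True)   # tuple shape
    c = wp.full(shape=[2, 3], value=7, dtype=int)             # list shape
    d = wp.empty_like(b, requires_grad=False)
    e = wp.clone(a, pinned=False)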
@@ -5255,7 +5356,7 @@ def from_numpy(
|
|
|
5255
5356
|
if dtype is None:
|
|
5256
5357
|
base_type = warp.types.np_dtype_to_warp_type.get(arr.dtype)
|
|
5257
5358
|
if base_type is None:
|
|
5258
|
-
raise RuntimeError("Unsupported NumPy data type '{}'."
|
|
5359
|
+
raise RuntimeError(f"Unsupported NumPy data type '{arr.dtype}'.")
|
|
5259
5360
|
|
|
5260
5361
|
dim_count = len(arr.shape)
|
|
5261
5362
|
if dim_count == 2:
|
|
@@ -5274,7 +5375,7 @@ def from_numpy(
|
|
|
5274
5375
|
)
|
|
5275
5376
|
|
|
5276
5377
|
|
|
5277
|
-
def event_from_ipc_handle(handle, device:
|
|
5378
|
+
def event_from_ipc_handle(handle, device: Devicelike = None) -> Event:
|
|
5278
5379
|
"""Create an event from an IPC handle.
|
|
5279
5380
|
|
|
5280
5381
|
Args:
|
|
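Note: `from_numpy` keeps its behaviour; the unsupported-dtype error now reports the offending NumPy dtype via an f-string. A sketch of the inference path shown in the context lines above:

    import numpy as np
    import warp as wp

    arr = np.arange(12, dtype=np.float32).reshape(4, 3)

    # With dtype=None, a 2D NumPy array is inferred as an array of vectors
    # (wp.vec3 here); pass dtype/shape explicitly to keep a flat scalar layout.
    a = wp.from_numpy(arr, device="cpu")
    b = wp.from_numpy(arr, dtype=wp.float32, shape=(4, 3), device="cpu")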
@@ -5443,10 +5544,10 @@ class Launch:
         self,
         kernel,
         device: Device,
-        hooks:
-        params:
-        params_addr:
-        bounds:
+        hooks: KernelHooks | None = None,
+        params: Sequence[Any] | None = None,
+        params_addr: Sequence[ctypes.c_void_p] | None = None,
+        bounds: launch_bounds_t | None = None,
         max_blocks: int = 0,
         block_dim: int = 256,
         adjoint: bool = False,
@@ -5516,7 +5617,7 @@ class Launch:
         self.adjoint: bool = adjoint
         """Whether to run the adjoint kernel instead of the forward kernel."""

-    def set_dim(self, dim:
+    def set_dim(self, dim: int | list[int] | tuple[int, ...]):
         """Set the launch dimensions.

         Args:
@@ -5554,7 +5655,7 @@ class Launch:
         if self.params_addr:
             self.params_addr[params_index] = ctypes.c_void_p(ctypes.addressof(carg))

-    def set_param_at_index_from_ctype(self, index: int, value:
+    def set_param_at_index_from_ctype(self, index: int, value: ctypes.Structure | int | float):
         """Set a kernel parameter at an index without any type conversion.

         Args:
@@ -5617,7 +5718,7 @@ class Launch:
         for i, v in enumerate(values):
             self.set_param_at_index_from_ctype(i, v)

-    def launch(self, stream:
+    def launch(self, stream: Stream | None = None) -> None:
         """Launch the kernel.

         Args:
@@ -5634,7 +5735,7 @@ class Launch:

         # If the stream is capturing, we retain the CUDA module so that it doesn't get unloaded
         # before the captured graph is released.
-        if runtime.core.cuda_stream_is_capturing(stream.cuda_stream):
+        if len(runtime.captures) > 0 and runtime.core.cuda_stream_is_capturing(stream.cuda_stream):
             capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
             graph = runtime.captures.get(capture_id)
             if graph is not None:
@@ -5666,13 +5767,13 @@ class Launch:

 def launch(
     kernel,
-    dim:
+    dim: int | Sequence[int],
     inputs: Sequence = [],
     outputs: Sequence = [],
     adj_inputs: Sequence = [],
     adj_outputs: Sequence = [],
     device: Devicelike = None,
-    stream:
+    stream: Stream | None = None,
     adjoint: bool = False,
     record_tape: bool = True,
     record_cmd: bool = False,
@@ -5824,7 +5925,7 @@ def launch(

     # If the stream is capturing, we retain the CUDA module so that it doesn't get unloaded
     # before the captured graph is released.
-    if runtime.core.cuda_stream_is_capturing(stream.cuda_stream):
+    if len(runtime.captures) > 0 and runtime.core.cuda_stream_is_capturing(stream.cuda_stream):
         capture_id = runtime.core.cuda_stream_get_capture_id(stream.cuda_stream)
         graph = runtime.captures.get(capture_id)
         if graph is not None:
@@ -5968,7 +6069,7 @@ def launch_tiled(*args, **kwargs):
        raise RuntimeError("wp.launch_tiled() requires a grid with fewer than 4 dimensions")

    # add trailing dimension
-    kwargs["dim"] = dim
+    kwargs["dim"] = [*dim, kwargs["block_dim"]]

    # forward to original launch method
    return launch(*args, **kwargs)
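Note: `launch_tiled` now builds the padded dim list by unpacking, appending `block_dim` as the trailing launch dimension so each tile gets one block. A minimal sketch, assuming a CUDA device and that `wp.tid()` excludes the implicit block dimension as in the existing tile examples:

    import warp as wp

    @wp.kernel
    def per_tile(out: wp.array(dtype=int)):
        # With launch_tiled(), wp.tid() indexes the tile; the trailing block
        # dimension added below is not included.
        i = wp.tid()
        out[i] = i

    out = wp.zeros(8, dtype=int, device="cuda:0")

    # Internally forwarded as wp.launch(per_tile, dim=[8, 64], ...): the 64 is
    # the block_dim appended as the trailing launch dimension.
    wp.launch_tiled(per_tile, dim=[8], inputs=[out], block_dim=64)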
@@ -6016,7 +6117,7 @@ def synchronize_device(device: Devicelike = None):
         runtime.core.cuda_context_synchronize(device.context)


-def synchronize_stream(stream_or_device:
+def synchronize_stream(stream_or_device: Stream | Devicelike | None = None):
     """Synchronize the calling CPU thread with any outstanding CUDA work on the specified stream.

     This function allows the host application code to ensure that all kernel launches
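A sketch of the widened `synchronize_stream()` signature above, which now accepts a `Stream`, a device alias, or `None`; assumes a CUDA-capable device:

```python
import warp as wp

s = wp.Stream("cuda:0")
a = wp.zeros(1024, dtype=float, device="cuda:0")
b = wp.empty_like(a)

# queue asynchronous work on the stream, then block the host until it completes
wp.copy(b, a, stream=s)
wp.synchronize_stream(s)         # pass a Stream object...
wp.synchronize_stream("cuda:0")  # ...or a device alias
```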
@@ -6046,7 +6147,7 @@ def synchronize_event(event: Event):
     runtime.core.cuda_event_synchronize(event.cuda_event)


-def force_load(device:
+def force_load(device: Device | str | list[Device] | list[str] | None = None, modules: list[Module] | None = None):
     """Force user-defined kernels to be compiled and loaded

     Args:
@@ -6078,7 +6179,7 @@ def force_load(device: Union[Device, str, List[Device], List[str]] = None, modul


 def load_module(
-    module:
+    module: Module | types.ModuleType | str | None = None, device: Device | str | None = None, recursive: bool = False
 ):
     """Force user-defined module to be compiled and loaded

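A sketch of the widened `force_load()`/`load_module()` signatures above; the device strings and the list form are illustrative:

```python
import warp as wp

# compile and load all registered kernels ahead of time, for one or several devices
wp.force_load(device="cuda:0")
wp.force_load(device=["cpu", "cuda:0"])

# or load only the calling module (a module object, a module name, or None for the current one)
wp.load_module(device="cuda:0")
```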
@@ -6120,7 +6221,7 @@ def load_module(
     force_load(device=device, modules=modules)


-def set_module_options(options:
+def set_module_options(options: dict[str, Any], module: Any = None):
     """Set options for the current module.

     Options can be used to control runtime compilation and code-generation
@@ -6144,7 +6245,7 @@ def set_module_options(options: Dict[str, Any], module: Optional[Any] = None):
         get_module(m.__name__).mark_modified()


-def get_module_options(module:
+def get_module_options(module: Any = None) -> dict[str, Any]:
     """Returns a list of options for the current module."""
     if module is None:
         m = inspect.getmodule(inspect.stack()[1][0])
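A sketch of the `set_module_options()`/`get_module_options()` signatures above; `enable_backward` is an existing module option used here purely as an example:

```python
import warp as wp

# disable adjoint code generation for kernels defined in the calling module
wp.set_module_options({"enable_backward": False})

opts = wp.get_module_options()
print(opts["enable_backward"])
```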
@@ -6156,8 +6257,8 @@ def get_module_options(module: Optional[Any] = None) -> Dict[str, Any]:

 def capture_begin(
     device: Devicelike = None,
-    stream:
-    force_module_load:
+    stream: Stream | None = None,
+    force_module_load: bool | None = None,
     external: bool = False,
 ):
     """Begin capture of a CUDA graph
@@ -6226,7 +6327,7 @@ def capture_begin(
     runtime.captures[capture_id] = graph


-def capture_end(device: Devicelike = None, stream:
+def capture_end(device: Devicelike = None, stream: Stream | None = None) -> Graph:
     """End the capture of a CUDA graph.

     Args:
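A minimal sketch of the graph-capture entry points whose signatures change above (`capture_begin`/`capture_end`), paired with `capture_launch`; the `saxpy` kernel and sizes are illustrative:

```python
import warp as wp


@wp.kernel
def saxpy(x: wp.array(dtype=float), y: wp.array(dtype=float), a: float):
    i = wp.tid()
    y[i] = a * x[i] + y[i]


device = "cuda:0"
x = wp.ones(1 << 20, dtype=float, device=device)
y = wp.zeros_like(x)

wp.capture_begin(device)
try:
    wp.launch(saxpy, dim=x.shape, inputs=[x, y, 2.0], device=device)
finally:
    graph = wp.capture_end(device)

# replay the captured work without re-recording it
for _ in range(10):
    wp.capture_launch(graph)
```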
@@ -6255,20 +6356,324 @@ def capture_end(device: Devicelike = None, stream: Optional[Stream] = None) -> G
     del runtime.captures[graph.capture_id]

     # get the graph executable
-
-    result = runtime.core.cuda_graph_end_capture(device.context, stream.cuda_stream, ctypes.byref(
+    g = ctypes.c_void_p()
+    result = runtime.core.cuda_graph_end_capture(device.context, stream.cuda_stream, ctypes.byref(g))

     if not result:
         # A concrete error should've already been reported, so we don't need to go into details here
         raise RuntimeError(f"CUDA graph capture failed. {runtime.get_error_string()}")

     # set the graph executable
-    graph.
+    graph.graph = g
+    graph.graph_exec = None # Lazy initialization

     return graph


-def
+def capture_debug_dot_print(graph: Graph, path: str, verbose: bool = False):
+    """Export a CUDA graph to a DOT file for visualization
+
+    Args:
+        graph: A :class:`Graph` as returned by :func:`~warp.capture_end()`
+        path: Path to save the DOT file
+        verbose: Whether to include additional debug information in the output
+    """
+    if not runtime.core.capture_debug_dot_print(graph.graph, path.encode(), 0 if verbose else 1):
+        raise RuntimeError(f"Graph debug dot print error: {runtime.get_error_string()}")
+
+
+def assert_conditional_graph_support():
+    if runtime is None:
+        init()
+
+    if runtime.toolkit_version < (12, 4):
+        raise RuntimeError("Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes")
+
+    if runtime.driver_version < (12, 4):
+        raise RuntimeError("Conditional graph nodes require CUDA driver 12.4+")
+
+
+def capture_pause(device: Devicelike = None, stream: Stream | None = None) -> ctypes.c_void_p:
+    if stream is not None:
+        device = stream.device
+    else:
+        device = runtime.get_device(device)
+        if not device.is_cuda:
+            raise RuntimeError("Must be a CUDA device")
+        stream = device.stream
+
+    graph = ctypes.c_void_p()
+    if not runtime.core.cuda_graph_pause_capture(device.context, stream.cuda_stream, ctypes.byref(graph)):
+        raise RuntimeError(runtime.get_error_string())
+
+    return graph
+
+
+def capture_resume(graph: ctypes.c_void_p, device: Devicelike = None, stream: Stream | None = None):
+    if stream is not None:
+        device = stream.device
+    else:
+        device = runtime.get_device(device)
+        if not device.is_cuda:
+            raise RuntimeError("Must be a CUDA device")
+        stream = device.stream
+
+    if not runtime.core.cuda_graph_resume_capture(device.context, stream.cuda_stream, graph):
+        raise RuntimeError(runtime.get_error_string())
+
+
+# reusable pinned readback buffer for conditions
+condition_host = None
+
+
+def capture_if(
+    condition: warp.array(dtype=int),
+    on_true: Callable | Graph | None = None,
+    on_false: Callable | Graph | None = None,
+    stream: Stream = None,
+    **kwargs,
+):
+    """Create a dynamic branch based on a condition.
+
+    The condition value is retrieved from the first element of the ``condition`` array.
+
+    This function is particularly useful with CUDA graphs, but can be used without graph capture as well.
+    CUDA 12.4+ is required to take advantage of conditional graph nodes for dynamic control flow.
+
+    Args:
+        condition: Warp array holding the condition value.
+        on_true: A callback function or :class:`Graph` to execute if the condition is True.
+        on_false: A callback function or :class:`Graph` to execute if the condition is False.
+        stream: The CUDA stream where the condition was written. If None, use the current stream on the device where ``condition`` resides.
+
+    Any additional keyword arguments are forwarded to the callback functions.
+    """
+
+    # if neither the IF branch nor the ELSE branch is specified, it's a no-op
+    if on_true is None and on_false is None:
+        return
+
+    # check condition data type
+    if not isinstance(condition, warp.array) or condition.dtype is not warp.int32:
+        raise TypeError("Condition must be a Warp array of int32 with a single element")
+
+    device = condition.device
+
+    # determine the stream and whether a graph capture is active
+    if device.is_cuda:
+        if stream is None:
+            stream = device.stream
+        graph = device.captures.get(stream)
+    else:
+        graph = None
+
+    if graph is None:
+        # if no graph is active, just execute the correct branch directly
+        if device.is_cuda:
+            # use a pinned buffer for condition readback to host
+            global condition_host
+            if condition_host is None:
+                condition_host = warp.empty(1, dtype=int, device="cpu", pinned=True)
+            warp.copy(condition_host, condition, stream=stream)
+            warp.synchronize_stream(stream)
+            condition_value = bool(ctypes.cast(condition_host.ptr, ctypes.POINTER(ctypes.c_int32)).contents)
+        else:
+            condition_value = bool(ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)).contents)
+
+        if condition_value:
+            if on_true is not None:
+                if isinstance(on_true, Callable):
+                    on_true(**kwargs)
+                elif isinstance(on_true, Graph):
+                    capture_launch(on_true, stream=stream)
+                else:
+                    raise TypeError("on_true must be a Callable or a Graph")
+        else:
+            if on_false is not None:
+                if isinstance(on_false, Callable):
+                    on_false(**kwargs)
+                elif isinstance(on_false, Graph):
+                    capture_launch(on_false, stream=stream)
+                else:
+                    raise TypeError("on_false must be a Callable or a Graph")
+
+        return
+
+    graph.has_conditional = True
+
+    # ensure conditional graph nodes are supported
+    assert_conditional_graph_support()
+
+    # insert conditional node
+    graph_on_true = ctypes.c_void_p()
+    graph_on_false = ctypes.c_void_p()
+    if not runtime.core.cuda_graph_insert_if_else(
+        device.context,
+        stream.cuda_stream,
+        ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
+        None if on_true is None else ctypes.byref(graph_on_true),
+        None if on_false is None else ctypes.byref(graph_on_false),
+    ):
+        raise RuntimeError(runtime.get_error_string())
+
+    # pause capturing parent graph
+    main_graph = capture_pause(stream=stream)
+
+    # capture if-graph
+    if on_true is not None:
+        capture_resume(graph_on_true, stream=stream)
+        if isinstance(on_true, Callable):
+            on_true(**kwargs)
+        elif isinstance(on_true, Graph):
+            if on_true.has_conditional:
+                raise RuntimeError(
+                    "The on_true graph contains conditional nodes, which are not allowed in child graphs"
+                )
+            if not runtime.core.cuda_graph_insert_child_graph(
+                device.context,
+                stream.cuda_stream,
+                on_true.graph,
+            ):
+                raise RuntimeError(runtime.get_error_string())
+        else:
+            raise TypeError("on_true must be a Callable or a Graph")
+        capture_pause(stream=stream)
+
+    # capture else-graph
+    if on_false is not None:
+        capture_resume(graph_on_false, stream=stream)
+        if isinstance(on_false, Callable):
+            on_false(**kwargs)
+        elif isinstance(on_false, Graph):
+            if on_false.has_conditional:
+                raise RuntimeError(
+                    "The on_false graph contains conditional nodes, which are not allowed in child graphs"
+                )
+            if not runtime.core.cuda_graph_insert_child_graph(
+                device.context,
+                stream.cuda_stream,
+                on_false.graph,
+            ):
+                raise RuntimeError(runtime.get_error_string())
+        else:
+            raise TypeError("on_false must be a Callable or a Graph")
+        capture_pause(stream=stream)
+
+    # resume capturing parent graph
+    capture_resume(main_graph, stream=stream)
+
+
+def capture_while(condition: warp.array(dtype=int), while_body: Callable | Graph, stream: Stream = None, **kwargs):
+    """Create a dynamic loop based on a condition.
+
+    The condition value is retrieved from the first element of the ``condition`` array.
+
+    The ``while_body`` callback is responsible for updating the condition value so the loop can terminate.
+
+    This function is particularly useful with CUDA graphs, but can be used without graph capture as well.
+    CUDA 12.4+ is required to take advantage of conditional graph nodes for dynamic control flow.
+
+    Args:
+        condition: Warp array holding the condition value.
+        while_body: A callback function or :class:`Graph` to execute while the loop condition is True.
+        stream: The CUDA stream where the condition was written. If None, use the current stream on the device where ``condition`` resides.
+
+    Any additional keyword arguments are forwarded to the callback function.
+    """
+
+    # check condition data type
+    if not isinstance(condition, warp.array) or condition.dtype is not warp.int32:
+        raise TypeError("Condition must be a Warp array of int32 with a single element")
+
+    device = condition.device
+
+    # determine the stream and whether a graph capture is active
+    if device.is_cuda:
+        if stream is None:
+            stream = device.stream
+        graph = device.captures.get(stream)
+    else:
+        graph = None
+
+    if graph is None:
+        # since no graph is active, just execute the kernels directly
+        while True:
+            if device.is_cuda:
+                # use a pinned buffer for condition readback to host
+                global condition_host
+                if condition_host is None:
+                    condition_host = warp.empty(1, dtype=int, device="cpu", pinned=True)
+                warp.copy(condition_host, condition, stream=stream)
+                warp.synchronize_stream(stream)
+                condition_value = bool(ctypes.cast(condition_host.ptr, ctypes.POINTER(ctypes.c_int32)).contents)
+            else:
+                condition_value = bool(ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)).contents)
+
+            if condition_value:
+                if isinstance(while_body, Callable):
+                    while_body(**kwargs)
+                elif isinstance(while_body, Graph):
+                    capture_launch(while_body, stream=stream)
+                else:
+                    raise TypeError("while_body must be a callable or a graph")
+
+            else:
+                break
+
+        return
+
+    graph.has_conditional = True
+
+    # ensure conditional graph nodes are supported
+    assert_conditional_graph_support()
+
+    # insert conditional while-node
+    body_graph = ctypes.c_void_p()
+    cond_handle = ctypes.c_uint64()
+    if not runtime.core.cuda_graph_insert_while(
+        device.context,
+        stream.cuda_stream,
+        ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
+        ctypes.byref(body_graph),
+        ctypes.byref(cond_handle),
+    ):
+        raise RuntimeError(runtime.get_error_string())
+
+    # pause capturing parent graph and start capturing child graph
+    main_graph = capture_pause(stream=stream)
+    capture_resume(body_graph, stream=stream)
+
+    # capture while-body
+    if isinstance(while_body, Callable):
+        while_body(**kwargs)
+    elif isinstance(while_body, Graph):
+        if while_body.has_conditional:
+            raise RuntimeError("The body graph contains conditional nodes, which are not allowed in child graphs")
+
+        if not runtime.core.cuda_graph_insert_child_graph(
+            device.context,
+            stream.cuda_stream,
+            while_body.graph,
+        ):
+            raise RuntimeError(runtime.get_error_string())
+    else:
+        raise RuntimeError(runtime.get_error_string())
+
+    # update condition
+    if not runtime.core.cuda_graph_set_condition(
+        device.context,
+        stream.cuda_stream,
+        ctypes.cast(condition.ptr, ctypes.POINTER(ctypes.c_int32)),
+        cond_handle,
+    ):
+        raise RuntimeError(runtime.get_error_string())
+
+    # stop capturing child graph and resume capturing parent graph
+    capture_pause(stream=stream)
+    capture_resume(main_graph, stream=stream)
+
+
+def capture_launch(graph: Graph, stream: Stream | None = None):
     """Launch a previously captured CUDA graph

     Args:
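A sketch of the new conditional-capture helpers added above (`capture_if`/`capture_while`); the `halve` kernel, the threshold, and the loop structure are illustrative. Per the added checks, conditional nodes inside a capture require CUDA 12.4+:

```python
import warp as wp


@wp.kernel
def halve(x: wp.array(dtype=float), cond: wp.array(dtype=int)):
    i = wp.tid()
    x[i] = x[i] * 0.5
    if i == 0:
        # the loop body is responsible for updating the condition
        if x[0] > 1e-3:
            cond[0] = 1
        else:
            cond[0] = 0


device = "cuda:0"
x = wp.ones(1024, dtype=float, device=device)
cond = wp.ones(1, dtype=int, device=device)  # int32 array with a single element


def body():
    wp.launch(halve, dim=x.shape, inputs=[x, cond], device=device)


wp.capture_begin(device)
try:
    # records a conditional while-node whose body re-runs until cond[0] == 0
    wp.capture_while(cond, while_body=body)
finally:
    graph = wp.capture_end(device)

wp.capture_launch(graph)
```

`wp.capture_if(cond, on_true=..., on_false=...)` follows the same pattern for a one-shot branch instead of a loop.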
@@ -6284,6 +6689,13 @@ def capture_launch(graph: Graph, stream: Optional[Stream] = None):
         device = graph.device
         stream = device.stream

+    if graph.graph_exec is None:
+        g = ctypes.c_void_p()
+        result = runtime.core.cuda_graph_create_exec(graph.device.context, graph.graph, ctypes.byref(g))
+        if not result:
+            raise RuntimeError(f"Graph creation error: {runtime.get_error_string()}")
+        graph.graph_exec = g
+
     if not runtime.core.cuda_graph_launch(graph.graph_exec, stream.cuda_stream):
         raise RuntimeError(f"Graph launch error: {runtime.get_error_string()}")

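A sketch tying together two additions above: `capture_launch()` now instantiates the graph executable lazily on first use, and `capture_debug_dot_print()` exports the captured graph for inspection; the captured copy and the file name are illustrative:

```python
import warp as wp

device = "cuda:0"
a = wp.zeros(16, dtype=float, device=device)
b = wp.ones_like(a)

wp.capture_begin(device)
try:
    wp.copy(a, b)
finally:
    graph = wp.capture_end(device)

# graph.graph_exec starts out as None and is created on the first launch, then reused
wp.capture_launch(graph)
wp.capture_launch(graph)

# write a Graphviz DOT description of the captured graph
wp.capture_debug_dot_print(graph, "capture.dot", verbose=True)
```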
@@ -6294,7 +6706,7 @@ def copy(
     dest_offset: int = 0,
     src_offset: int = 0,
     count: int = 0,
-    stream:
+    stream: Stream | None = None,
 ):
     """Copy array contents from `src` to `dest`.

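A sketch of the `wp.copy()` signature shown above, with offsets, an element count, and an optional stream; the values are illustrative:

```python
import warp as wp

src = wp.array([1.0, 2.0, 3.0, 4.0], dtype=float, device="cuda:0")
dst = wp.zeros(8, dtype=float, device="cuda:0")
s = wp.Stream("cuda:0")

# copy two elements, src[1:3] -> dst[4:6], asynchronously on stream s
wp.copy(dst, src, dest_offset=4, src_offset=1, count=2, stream=s)
wp.synchronize_stream(s)
```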
@@ -6431,11 +6843,8 @@ def copy(

     # can't copy to/from fabric arrays of arrays, because they are jagged arrays of arbitrary lengths
     # TODO?
-    if (
-        isinstance(
-        and src.ndim > 1
-        or isinstance(dest, (warp.fabricarray, warp.indexedfabricarray))
-        and dest.ndim > 1
+    if (isinstance(src, (warp.fabricarray, warp.indexedfabricarray)) and src.ndim > 1) or (
+        isinstance(dest, (warp.fabricarray, warp.indexedfabricarray)) and dest.ndim > 1
     ):
         raise RuntimeError("Copying to/from Fabric arrays of arrays is not supported")

@@ -6503,7 +6912,7 @@ def type_str(t):
         return "Callable"
     elif isinstance(t, int):
         return str(t)
-    elif isinstance(t, List):
+    elif isinstance(t, (List, tuple)):
         return "Tuple[" + ", ".join(map(type_str, t)) + "]"
     elif isinstance(t, warp.array):
         return f"Array[{type_str(t.dtype)}]"
@@ -6536,12 +6945,16 @@ def type_str(t):

         raise TypeError("Invalid vector or matrix dimensions")
     elif get_origin(t) in (list, tuple):
-
-
+        args = get_args(t)
+        if args:
+            args_repr = ", ".join(type_str(x) for x in get_args(t))
+            return f"{t._name}[{args_repr}]"
+        else:
+            return f"{t._name}"
     elif t is Ellipsis:
         return "..."
     elif warp.types.is_tile(t):
-        return "Tile"
+        return f"Tile[{type_str(t.dtype)},{type_str(t.shape)}]"

     return t.__name__

@@ -6568,14 +6981,14 @@ def resolve_exported_function_sig(f):
     # so we can generate the return type for overloaded functions
     return_type = f.value_func(func_args, None)

+    if return_type is None or (isinstance(return_type, tuple) and len(return_type) > 1):
+        return (func_args, return_type)
+
     try:
-
+        ctype_ret_str(return_type)
     except Exception:
         return None

-    if return_type_str.startswith("Tuple"):
-        return None
-
     return (func_args, return_type)

@@ -6716,13 +7129,18 @@ def export_functions_rst(file): # pragma: no cover
         print("---------------", file=file)

         for f, is_exported in g:
+            if not isinstance(f, Function) and callable(f):
+                # f is a plain Python function
+                print(f".. autofunction:: {f.__module__}.{f.__name__}", file=file)
+                continue
             if f.func:
                 # f is a Warp function written in Python, we can use autofunction
                 print(f".. autofunction:: {f.func.__module__}.{f.key}", file=file)
                 continue
             for f_prefix, query_type in query_types:
                 if f.key.startswith(f_prefix) and query_type not in written_query_types:
-                    print(f".. autoclass:: {query_type}", file=file)
+                    print(f".. autoclass:: warp.{query_type}", file=file)
+                    print(" :exclude-members: Var, vars", file=file)
                     written_query_types.add(query_type)
                     break

@@ -6775,6 +7193,7 @@ def export_stubs(file): # pragma: no cover
     print('Rows = TypeVar("Rows", bound=int)', file=file)
     print('Cols = TypeVar("Cols", bound=int)', file=file)
    print('DType = TypeVar("DType")', file=file)
+    print('Shape = TypeVar("Shape")', file=file)

     print("Vector = Generic[Length, Scalar]", file=file)
     print("Matrix = Generic[Rows, Cols, Scalar]", file=file)
@@ -6783,6 +7202,7 @@ def export_stubs(file): # pragma: no cover
     print("Array = Generic[DType]", file=file)
     print("FabricArray = Generic[DType]", file=file)
     print("IndexedFabricArray = Generic[DType]", file=file)
+    print("Tile = Generic[DType, Shape]", file=file)

     # prepend __init__.py
     with open(os.path.join(os.path.dirname(file.name), "__init__.py")) as header_file:
@@ -6817,7 +7237,7 @@ def export_stubs(file): # pragma: no cover
         if hasattr(g, "overloads"):
             for f in g.overloads:
                 add_stub(f)
-
+        elif isinstance(g, Function):
             add_stub(g)


@@ -6848,16 +7268,30 @@ def export_builtins(file: io.TextIOBase): # pragma: no cover
         args = ", ".join(f"{ctype_arg_str(v)} {k}" for k, v in func_args.items())
         params = ", ".join(func_args.keys())

-
-
-        if args == "":
-            file.write(f"WP_API void {f.mangled_name}({return_str}* ret) {{ *ret = wp::{f.key}({params}); }}\n")
-        elif return_type is None:
+        if return_type is None:
+            # void function
             file.write(f"WP_API void {f.mangled_name}({args}) {{ wp::{f.key}({params}); }}\n")
+        elif isinstance(return_type, tuple) and len(return_type) > 1:
+            # multiple return value function using output parameters
+            outputs = tuple(f"{ctype_ret_str(x)}& ret_{i}" for i, x in enumerate(return_type))
+            output_params = ", ".join(f"ret_{i}" for i in range(len(outputs)))
+            if args:
+                file.write(
+                    f"WP_API void {f.mangled_name}({args}, {', '.join(outputs)}) {{ wp::{f.key}({params}, {output_params}); }}\n"
+                )
+            else:
+                file.write(
+                    f"WP_API void {f.mangled_name}({', '.join(outputs)}) {{ wp::{f.key}({params}, {output_params}); }}\n"
+                )
         else:
-
-
-
+            # single return value function
+            return_str = ctype_ret_str(return_type)
+            if args:
+                file.write(
+                    f"WP_API void {f.mangled_name}({args}, {return_str}* ret) {{ *ret = wp::{f.key}({params}); }}\n"
+                )
+            else:
+                file.write(f"WP_API void {f.mangled_name}({return_str}* ret) {{ *ret = wp::{f.key}({params}); }}\n")

     file.write('\n} // extern "C"\n\n')
     file.write("} // namespace wp\n")