warp-lang 0.9.0-py3-none-win_amd64.whl → 0.11.0-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

Files changed (315)
  1. warp/__init__.py +15 -7
  2. warp/__init__.pyi +1 -0
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +22 -443
  6. warp/build_dll.py +384 -0
  7. warp/builtins.py +998 -488
  8. warp/codegen.py +1307 -739
  9. warp/config.py +5 -3
  10. warp/constants.py +6 -0
  11. warp/context.py +1291 -548
  12. warp/dlpack.py +31 -31
  13. warp/fabric.py +326 -0
  14. warp/fem/__init__.py +27 -0
  15. warp/fem/cache.py +389 -0
  16. warp/fem/dirichlet.py +181 -0
  17. warp/fem/domain.py +263 -0
  18. warp/fem/field/__init__.py +101 -0
  19. warp/fem/field/field.py +149 -0
  20. warp/fem/field/nodal_field.py +299 -0
  21. warp/fem/field/restriction.py +21 -0
  22. warp/fem/field/test.py +181 -0
  23. warp/fem/field/trial.py +183 -0
  24. warp/fem/geometry/__init__.py +19 -0
  25. warp/fem/geometry/closest_point.py +70 -0
  26. warp/fem/geometry/deformed_geometry.py +271 -0
  27. warp/fem/geometry/element.py +744 -0
  28. warp/fem/geometry/geometry.py +186 -0
  29. warp/fem/geometry/grid_2d.py +373 -0
  30. warp/fem/geometry/grid_3d.py +435 -0
  31. warp/fem/geometry/hexmesh.py +953 -0
  32. warp/fem/geometry/partition.py +376 -0
  33. warp/fem/geometry/quadmesh_2d.py +532 -0
  34. warp/fem/geometry/tetmesh.py +840 -0
  35. warp/fem/geometry/trimesh_2d.py +577 -0
  36. warp/fem/integrate.py +1616 -0
  37. warp/fem/operator.py +191 -0
  38. warp/fem/polynomial.py +213 -0
  39. warp/fem/quadrature/__init__.py +2 -0
  40. warp/fem/quadrature/pic_quadrature.py +245 -0
  41. warp/fem/quadrature/quadrature.py +294 -0
  42. warp/fem/space/__init__.py +292 -0
  43. warp/fem/space/basis_space.py +489 -0
  44. warp/fem/space/collocated_function_space.py +105 -0
  45. warp/fem/space/dof_mapper.py +236 -0
  46. warp/fem/space/function_space.py +145 -0
  47. warp/fem/space/grid_2d_function_space.py +267 -0
  48. warp/fem/space/grid_3d_function_space.py +306 -0
  49. warp/fem/space/hexmesh_function_space.py +352 -0
  50. warp/fem/space/partition.py +350 -0
  51. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  52. warp/fem/space/restriction.py +160 -0
  53. warp/fem/space/shape/__init__.py +15 -0
  54. warp/fem/space/shape/cube_shape_function.py +738 -0
  55. warp/fem/space/shape/shape_function.py +103 -0
  56. warp/fem/space/shape/square_shape_function.py +611 -0
  57. warp/fem/space/shape/tet_shape_function.py +567 -0
  58. warp/fem/space/shape/triangle_shape_function.py +429 -0
  59. warp/fem/space/tetmesh_function_space.py +292 -0
  60. warp/fem/space/topology.py +295 -0
  61. warp/fem/space/trimesh_2d_function_space.py +221 -0
  62. warp/fem/types.py +77 -0
  63. warp/fem/utils.py +495 -0
  64. warp/native/array.h +164 -55
  65. warp/native/builtin.h +150 -174
  66. warp/native/bvh.cpp +75 -328
  67. warp/native/bvh.cu +406 -23
  68. warp/native/bvh.h +37 -45
  69. warp/native/clang/clang.cpp +136 -24
  70. warp/native/crt.cpp +1 -76
  71. warp/native/crt.h +111 -104
  72. warp/native/cuda_crt.h +1049 -0
  73. warp/native/cuda_util.cpp +15 -3
  74. warp/native/cuda_util.h +3 -1
  75. warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
  76. warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
  77. warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
  78. warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
  79. warp/native/cutlass/tools/library/scripts/library.py +799 -0
  80. warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
  81. warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
  82. warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
  83. warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
  84. warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
  85. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
  86. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
  87. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
  88. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
  89. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
  90. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
  91. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
  92. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
  93. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
  94. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
  95. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
  96. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
  97. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
  98. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
  99. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
  100. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
  101. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
  102. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
  103. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
  104. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
  105. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
  106. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
  107. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
  108. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
  109. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
  110. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
  111. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
  112. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
  113. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
  114. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
  115. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
  116. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
  117. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  118. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  119. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
  120. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
  121. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
  122. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  123. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  124. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
  125. warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
  126. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
  127. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
  128. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
  129. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
  130. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
  131. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
  132. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
  133. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
  134. warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
  135. warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
  136. warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
  137. warp/native/cutlass/tools/library/scripts/rt.py +796 -0
  138. warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
  139. warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
  140. warp/native/cutlass_gemm.cu +5 -3
  141. warp/native/exports.h +1240 -949
  142. warp/native/fabric.h +228 -0
  143. warp/native/hashgrid.cpp +4 -4
  144. warp/native/hashgrid.h +22 -2
  145. warp/native/initializer_array.h +2 -2
  146. warp/native/intersect.h +22 -7
  147. warp/native/intersect_adj.h +8 -8
  148. warp/native/intersect_tri.h +13 -16
  149. warp/native/marching.cu +157 -161
  150. warp/native/mat.h +119 -19
  151. warp/native/matnn.h +2 -2
  152. warp/native/mesh.cpp +108 -83
  153. warp/native/mesh.cu +243 -6
  154. warp/native/mesh.h +1547 -458
  155. warp/native/nanovdb/NanoVDB.h +1 -1
  156. warp/native/noise.h +272 -329
  157. warp/native/quat.h +51 -8
  158. warp/native/rand.h +45 -35
  159. warp/native/range.h +6 -2
  160. warp/native/reduce.cpp +157 -0
  161. warp/native/reduce.cu +348 -0
  162. warp/native/runlength_encode.cpp +62 -0
  163. warp/native/runlength_encode.cu +46 -0
  164. warp/native/scan.cu +11 -13
  165. warp/native/scan.h +1 -0
  166. warp/native/solid_angle.h +442 -0
  167. warp/native/sort.cpp +13 -0
  168. warp/native/sort.cu +9 -1
  169. warp/native/sparse.cpp +338 -0
  170. warp/native/sparse.cu +545 -0
  171. warp/native/spatial.h +2 -2
  172. warp/native/temp_buffer.h +30 -0
  173. warp/native/vec.h +126 -24
  174. warp/native/volume.h +120 -0
  175. warp/native/warp.cpp +658 -53
  176. warp/native/warp.cu +660 -68
  177. warp/native/warp.h +112 -12
  178. warp/optim/__init__.py +1 -0
  179. warp/optim/linear.py +922 -0
  180. warp/optim/sgd.py +92 -0
  181. warp/render/render_opengl.py +392 -152
  182. warp/render/render_usd.py +11 -11
  183. warp/sim/__init__.py +2 -2
  184. warp/sim/articulation.py +385 -185
  185. warp/sim/collide.py +21 -8
  186. warp/sim/import_mjcf.py +297 -106
  187. warp/sim/import_urdf.py +389 -210
  188. warp/sim/import_usd.py +198 -97
  189. warp/sim/inertia.py +17 -18
  190. warp/sim/integrator_euler.py +14 -8
  191. warp/sim/integrator_xpbd.py +161 -19
  192. warp/sim/model.py +795 -291
  193. warp/sim/optimizer.py +2 -6
  194. warp/sim/render.py +65 -3
  195. warp/sim/utils.py +3 -0
  196. warp/sparse.py +1227 -0
  197. warp/stubs.py +665 -223
  198. warp/tape.py +66 -15
  199. warp/tests/__main__.py +3 -6
  200. warp/tests/assets/curlnoise_golden.npy +0 -0
  201. warp/tests/assets/pnoise_golden.npy +0 -0
  202. warp/tests/assets/torus.usda +105 -105
  203. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  204. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  205. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  206. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  207. warp/tests/aux_test_unresolved_func.py +14 -0
  208. warp/tests/aux_test_unresolved_symbol.py +14 -0
  209. warp/tests/disabled_kinematics.py +239 -0
  210. warp/tests/run_coverage_serial.py +31 -0
  211. warp/tests/test_adam.py +103 -106
  212. warp/tests/test_arithmetic.py +128 -74
  213. warp/tests/test_array.py +1497 -211
  214. warp/tests/test_array_reduce.py +150 -0
  215. warp/tests/test_atomic.py +64 -28
  216. warp/tests/test_bool.py +99 -0
  217. warp/tests/test_builtins_resolution.py +1292 -0
  218. warp/tests/test_bvh.py +75 -43
  219. warp/tests/test_closest_point_edge_edge.py +54 -57
  220. warp/tests/test_codegen.py +233 -128
  221. warp/tests/test_compile_consts.py +28 -20
  222. warp/tests/test_conditional.py +108 -24
  223. warp/tests/test_copy.py +10 -12
  224. warp/tests/test_ctypes.py +112 -88
  225. warp/tests/test_dense.py +21 -14
  226. warp/tests/test_devices.py +98 -0
  227. warp/tests/test_dlpack.py +136 -108
  228. warp/tests/test_examples.py +277 -0
  229. warp/tests/test_fabricarray.py +955 -0
  230. warp/tests/test_fast_math.py +15 -11
  231. warp/tests/test_fem.py +1271 -0
  232. warp/tests/test_fp16.py +53 -19
  233. warp/tests/test_func.py +187 -74
  234. warp/tests/test_generics.py +194 -49
  235. warp/tests/test_grad.py +180 -116
  236. warp/tests/test_grad_customs.py +176 -0
  237. warp/tests/test_hash_grid.py +52 -37
  238. warp/tests/test_import.py +10 -23
  239. warp/tests/test_indexedarray.py +577 -24
  240. warp/tests/test_intersect.py +18 -9
  241. warp/tests/test_large.py +141 -0
  242. warp/tests/test_launch.py +251 -15
  243. warp/tests/test_lerp.py +64 -65
  244. warp/tests/test_linear_solvers.py +154 -0
  245. warp/tests/test_lvalue.py +493 -0
  246. warp/tests/test_marching_cubes.py +12 -13
  247. warp/tests/test_mat.py +508 -2778
  248. warp/tests/test_mat_lite.py +115 -0
  249. warp/tests/test_mat_scalar_ops.py +2889 -0
  250. warp/tests/test_math.py +103 -9
  251. warp/tests/test_matmul.py +305 -69
  252. warp/tests/test_matmul_lite.py +410 -0
  253. warp/tests/test_mesh.py +71 -14
  254. warp/tests/test_mesh_query_aabb.py +41 -25
  255. warp/tests/test_mesh_query_point.py +325 -34
  256. warp/tests/test_mesh_query_ray.py +39 -22
  257. warp/tests/test_mlp.py +30 -22
  258. warp/tests/test_model.py +92 -89
  259. warp/tests/test_modules_lite.py +39 -0
  260. warp/tests/test_multigpu.py +88 -114
  261. warp/tests/test_noise.py +12 -11
  262. warp/tests/test_operators.py +16 -20
  263. warp/tests/test_options.py +11 -11
  264. warp/tests/test_pinned.py +17 -18
  265. warp/tests/test_print.py +32 -11
  266. warp/tests/test_quat.py +275 -129
  267. warp/tests/test_rand.py +18 -16
  268. warp/tests/test_reload.py +38 -34
  269. warp/tests/test_rounding.py +50 -43
  270. warp/tests/test_runlength_encode.py +190 -0
  271. warp/tests/test_smoothstep.py +9 -11
  272. warp/tests/test_snippet.py +143 -0
  273. warp/tests/test_sparse.py +460 -0
  274. warp/tests/test_spatial.py +276 -243
  275. warp/tests/test_streams.py +110 -85
  276. warp/tests/test_struct.py +331 -85
  277. warp/tests/test_tape.py +39 -21
  278. warp/tests/test_torch.py +118 -89
  279. warp/tests/test_transient_module.py +12 -13
  280. warp/tests/test_types.py +614 -0
  281. warp/tests/test_utils.py +494 -0
  282. warp/tests/test_vec.py +354 -1987
  283. warp/tests/test_vec_lite.py +73 -0
  284. warp/tests/test_vec_scalar_ops.py +2099 -0
  285. warp/tests/test_volume.py +457 -293
  286. warp/tests/test_volume_write.py +124 -134
  287. warp/tests/unittest_serial.py +35 -0
  288. warp/tests/unittest_suites.py +341 -0
  289. warp/tests/unittest_utils.py +568 -0
  290. warp/tests/unused_test_misc.py +71 -0
  291. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  292. warp/thirdparty/appdirs.py +36 -45
  293. warp/thirdparty/unittest_parallel.py +549 -0
  294. warp/torch.py +72 -30
  295. warp/types.py +1744 -713
  296. warp/utils.py +360 -350
  297. warp_lang-0.11.0.dist-info/LICENSE.md +36 -0
  298. warp_lang-0.11.0.dist-info/METADATA +238 -0
  299. warp_lang-0.11.0.dist-info/RECORD +332 -0
  300. {warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
  301. warp/bin/warp-clang.exp +0 -0
  302. warp/bin/warp-clang.lib +0 -0
  303. warp/bin/warp.exp +0 -0
  304. warp/bin/warp.lib +0 -0
  305. warp/tests/test_all.py +0 -215
  306. warp/tests/test_array_scan.py +0 -60
  307. warp/tests/test_base.py +0 -208
  308. warp/tests/test_unresolved_func.py +0 -7
  309. warp/tests/test_unresolved_symbol.py +0 -7
  310. warp_lang-0.9.0.dist-info/METADATA +0 -20
  311. warp_lang-0.9.0.dist-info/RECORD +0 -177
  312. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  313. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  314. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  315. {warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0
warp/context.py CHANGED
@@ -5,37 +5,27 @@
  # distribution of this software and related documentation without an express
  # license agreement from NVIDIA CORPORATION is strictly prohibited.

- import math
- import os
- import sys
- import hashlib
+ import ast
  import ctypes
+ import gc
+ import hashlib
+ import inspect
+ import io
+ import os
  import platform
- import ast
+ import sys
  import types
- import inspect
-
- from typing import Tuple
- from typing import List
- from typing import Dict
- from typing import Any
- from typing import Callable
- from typing import Union
- from typing import Mapping
- from typing import Optional
-
+ from copy import copy as shallowcopy
  from types import ModuleType
+ from typing import Any, Callable, Dict, List, Mapping, Optional, Sequence, Tuple, Union

- from copy import copy as shallowcopy
+ import numpy as np

  import warp
- import warp.utils
- import warp.codegen
  import warp.build
+ import warp.codegen
  import warp.config

- import numpy as np
-
  # represents either a built-in or user-defined function


@@ -46,6 +36,18 @@ def create_value_func(type):
      return value_func


+ def get_function_args(func):
+     """Ensures that all function arguments are annotated and returns a dictionary mapping from argument name to its type."""
+     import inspect
+
+     argspec = inspect.getfullargspec(func)
+
+     # use source-level argument annotations
+     if len(argspec.annotations) < len(argspec.args):
+         raise RuntimeError(f"Incomplete argument annotations on function {func.__qualname__}")
+     return argspec.annotations
+
+
  class Function:
      def __init__(
          self,
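
A minimal usage sketch for the get_function_args() helper introduced in the hunk above (the annotated function scale below is hypothetical and not part of the diff; the helper is assumed to be importable from warp.context):

    import warp as wp
    from warp.context import get_function_args

    def scale(x: wp.vec3, s: float) -> wp.vec3:
        ...

    # Maps every annotated name (including "return", if annotated) to its type;
    # raises RuntimeError if any positional argument lacks an annotation.
    print(get_function_args(scale))
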
@@ -67,6 +69,17 @@ class Function:
          generic=False,
          native_func=None,
          defaults=None,
+         custom_replay_func=None,
+         native_snippet=None,
+         adj_native_snippet=None,
+         skip_forward_codegen=False,
+         skip_reverse_codegen=False,
+         custom_reverse_num_input_args=-1,
+         custom_reverse_mode=False,
+         overloaded_annotations=None,
+         code_transformers=[],
+         skip_adding_overload=False,
+         require_original_output_arg=False,
      ):
          self.func = func # points to Python function decorated with @wp.func, may be None for builtins
          self.key = key
@@ -80,6 +93,12 @@ class Function:
          self.module = module
          self.variadic = variadic # function can take arbitrary number of inputs, e.g.: printf()
          self.defaults = defaults
+         # Function instance for a custom implementation of the replay pass
+         self.custom_replay_func = custom_replay_func
+         self.native_snippet = native_snippet
+         self.adj_native_snippet = adj_native_snippet
+         self.custom_grad_func = None
+         self.require_original_output_arg = require_original_output_arg

          if initializer_list_func is None:
              self.initializer_list_func = lambda x, y: False
@@ -108,7 +127,16 @@ class Function:
              self.user_overloads = {}

              # user defined (Python) function
-             self.adj = warp.codegen.Adjoint(func)
+             self.adj = warp.codegen.Adjoint(
+                 func,
+                 is_user_function=True,
+                 skip_forward_codegen=skip_forward_codegen,
+                 skip_reverse_codegen=skip_reverse_codegen,
+                 custom_reverse_num_input_args=custom_reverse_num_input_args,
+                 custom_reverse_mode=custom_reverse_mode,
+                 overload_annotations=overloaded_annotations,
+                 transformers=code_transformers,
+             )

              # record input types
              for name, type in self.adj.arg_types.items():
@@ -136,11 +164,12 @@ class Function:
              else:
                  self.mangled_name = None

-             self.add_overload(self)
+             if not skip_adding_overload:
+                 self.add_overload(self)

          # add to current module
          if module:
-             module.register_function(self)
+             module.register_function(self, skip_adding_overload)

      def __call__(self, *args, **kwargs):
          # handles calling a builtin (native) function
@@ -149,124 +178,52 @@ class Function:
          # from within a kernel (experimental).

          if self.is_builtin() and self.mangled_name:
-             # store last error during overload resolution
-             error = None
-
-             for f in self.overloads:
-                 if f.generic:
+             # For each of this function's existing overloads, we attempt to pack
+             # the given arguments into the C types expected by the corresponding
+             # parameters, and we rinse and repeat until we get a match.
+             for overload in self.overloads:
+                 if overload.generic:
                      continue

-                 # try and find builtin in the warp.dll
-                 if hasattr(warp.context.runtime.core, f.mangled_name) == False:
-                     raise RuntimeError(
-                         f"Couldn't find function {self.key} with mangled name {f.mangled_name} in the Warp native library"
-                     )
-
-                 try:
-                     # try and pack args into what the function expects
-                     params = []
-                     for i, (arg_name, arg_type) in enumerate(f.input_types.items()):
-                         a = args[i]
-
-                         # try to convert to a value type (vec3, mat33, etc)
-                         if issubclass(arg_type, ctypes.Array):
-                             # wrap the arg_type (which is an ctypes.Array) in a structure
-                             # to ensure parameter is passed to the .dll by value rather than reference
-                             class ValueArg(ctypes.Structure):
-                                 _fields_ = [("value", arg_type)]
-
-                             x = ValueArg()
-
-                             # force conversion to ndarray first (handles tuple / list, Gf.Vec3 case)
-                             if isinstance(a, ctypes.Array) == False:
-                                 # assume you want the float32 version of the function so it doesn't just
-                                 # grab an override for a random data type:
-                                 if arg_type._type_ != ctypes.c_float:
-                                     raise RuntimeError(
-                                         f"Error calling function '{f.key}', parameter for argument '{arg_name}' does not have c_float type."
-                                     )
-
-                                 a = np.array(a)
-
-                                 # flatten to 1D array
-                                 v = a.flatten()
-                                 if len(v) != arg_type._length_:
-                                     raise RuntimeError(
-                                         f"Error calling function '{f.key}', parameter for argument '{arg_name}' has length {len(v)}, but expected {arg_type._length_}. Could not convert parameter to {arg_type}."
-                                     )
-
-                                 for i in range(arg_type._length_):
-                                     x.value[i] = v[i]
-
-                             else:
-                                 # already a built-in type, check it matches
-                                 if not warp.types.types_equal(type(a), arg_type):
-                                     raise RuntimeError(
-                                         f"Error calling function '{f.key}', parameter for argument '{arg_name}' has type '{type(a)}' but expected '{arg_type}'"
-                                     )
-
-                                 x.value = a
-
-                             params.append(x)
-
-                         else:
-                             try:
-                                 # try to pack as a scalar type
-                                 params.append(arg_type._type_(a))
-                             except:
-                                 raise RuntimeError(
-                                     f"Error calling function {f.key}, unable to pack function parameter type {type(a)} for param {arg_name}, expected {arg_type}"
-                                 )
-
-                     # returns the corresponding ctype for a scalar or vector warp type
-                     def type_ctype(dtype):
-                         if dtype == float:
-                             return ctypes.c_float
-                         elif dtype == int:
-                             return ctypes.c_int32
-                         elif issubclass(dtype, ctypes.Array):
-                             return dtype
-                         elif issubclass(dtype, ctypes.Structure):
-                             return dtype
-                         else:
-                             # scalar type
-                             return dtype._type_
-
-                     value_type = type_ctype(f.value_func(None, None, None))
-
-                     # construct return value (passed by address)
-                     ret = value_type()
-                     ret_addr = ctypes.c_void_p(ctypes.addressof(ret))
+                 success, return_value = call_builtin(overload, *args)
+                 if success:
+                     return return_value

-                     params.append(ret_addr)
+             # overload resolution or call failed
+             raise RuntimeError(
+                 f"Couldn't find a function '{self.key}' compatible with "
+                 f"the arguments '{', '.join(type(x).__name__ for x in args)}'"
+             )

-                     c_func = getattr(warp.context.runtime.core, f.mangled_name)
-                     c_func(*params)
+         if hasattr(self, "user_overloads") and len(self.user_overloads):
+             # user-defined function with overloads

-                     if issubclass(value_type, ctypes.Array) or issubclass(value_type, ctypes.Structure):
-                         # return vector types as ctypes
-                         return ret
-                     else:
-                         # return scalar types as int/float
-                         return ret.value
+             if len(kwargs):
+                 raise RuntimeError(
+                     f"Error calling function '{self.key}', keyword arguments are not supported for user-defined overloads."
+                 )

-                 except Exception as e:
-                     # couldn't pack values to match this overload
-                     # store error and move onto the next one
-                     error = e
+             # try and find a matching overload
+             for overload in self.user_overloads.values():
+                 if len(overload.input_types) != len(args):
+                     continue
+                 template_types = list(overload.input_types.values())
+                 arg_names = list(overload.input_types.keys())
+                 try:
+                     # attempt to unify argument types with function template types
+                     warp.types.infer_argument_types(args, template_types, arg_names)
+                     return overload.func(*args)
+                 except Exception:
                      continue

-             # overload resolution or call failed
-             # raise the last exception encountered
-             if error:
-                 raise error
-             else:
-                 raise RuntimeError(f"Error calling function '{f.key}'.")
+             raise RuntimeError(f"Error calling function '{self.key}', no overload found for arguments {args}")

-         else:
-             raise RuntimeError(
-                 f"Error, functions decorated with @wp.func can only be called from within Warp kernels (trying to call {self.key}())"
-             )
+         # user-defined function with no overloads
+         if self.func is None:
+             raise RuntimeError(f"Error calling function '{self.key}', function is undefined")
+
+         # this function has no overloads, call it like a plain Python function
+         return self.func(*args, **kwargs)

      def is_builtin(self):
          return self.func is None
@@ -286,7 +243,7 @@ class Function:
              # todo: construct a default value for each of the functions args
              # so we can generate the return type for overloaded functions
              return_type = type_str(self.value_func(None, None, None))
-         except:
+         except Exception:
              return False

          if return_type.startswith("Tuple"):
@@ -379,10 +336,187 @@ class Function:
          return None

      def __repr__(self):
-         inputs_str = ", ".join([f"{k}: {v.__name__}" for k, v in self.input_types.items()])
+         inputs_str = ", ".join([f"{k}: {warp.types.type_repr(v)}" for k, v in self.input_types.items()])
          return f"<Function {self.key}({inputs_str})>"


+ def call_builtin(func: Function, *params) -> Tuple[bool, Any]:
+     uses_non_warp_array_type = False
+
+     # Retrieve the built-in function from Warp's dll.
+     c_func = getattr(warp.context.runtime.core, func.mangled_name)
+
+     # Try gathering the parameters that the function expects and pack them
+     # into their corresponding C types.
+     c_params = []
+     for i, (_, arg_type) in enumerate(func.input_types.items()):
+         param = params[i]
+
+         try:
+             iter(param)
+         except TypeError:
+             is_array = False
+         else:
+             is_array = True
+
+         if is_array:
+             if not issubclass(arg_type, ctypes.Array):
+                 return (False, None)
+
+             # The argument expects a built-in Warp type like a vector or a matrix.
+
+             c_param = None
+
+             if isinstance(param, ctypes.Array):
+                 # The given parameter is also a built-in Warp type, so we only need
+                 # to make sure that it matches with the argument.
+                 if not warp.types.types_equal(type(param), arg_type):
+                     return (False, None)
+
+                 if isinstance(param, arg_type):
+                     c_param = param
+                 else:
+                     # Cast the value to its argument type to make sure that it
+                     # can be assigned to the field of the `Param` struct.
+                     # This could error otherwise when, for example, the field type
+                     # is set to `vec3i` while the value is of type `vector(length=3, dtype=int)`,
+                     # even though both types are semantically identical.
+                     c_param = arg_type(param)
+             else:
+                 # Flatten the parameter values into a flat 1-D array.
+                 arr = []
+                 ndim = 1
+                 stack = [(0, param)]
+                 while stack:
+                     depth, elem = stack.pop(0)
+                     try:
+                         # If `elem` is a sequence, then it should be possible
+                         # to add its elements to the stack for later processing.
+                         stack.extend((depth + 1, x) for x in elem)
+                     except TypeError:
+                         # Since `elem` doesn't seem to be a sequence,
+                         # we must have a leaf value that we need to add to our
+                         # resulting array.
+                         arr.append(elem)
+                         ndim = max(depth, ndim)
+
+                 assert ndim > 0
+
+                 # Ensure that if the given parameter value is, say, a 2-D array,
+                 # then we try to resolve it against a matrix argument rather than
+                 # a vector.
+                 if ndim > len(arg_type._shape_):
+                     return (False, None)
+
+                 elem_count = len(arr)
+                 if elem_count != arg_type._length_:
+                     return (False, None)
+
+                 # Retrieve the element type of the sequence while ensuring
+                 # that it's homogeneous.
+                 elem_type = type(arr[0])
+                 for i in range(1, elem_count):
+                     if type(arr[i]) is not elem_type:
+                         raise ValueError("All array elements must share the same type.")
+
+                 expected_elem_type = arg_type._wp_scalar_type_
+                 if not (
+                     elem_type is expected_elem_type
+                     or (elem_type is float and expected_elem_type is warp.types.float32)
+                     or (elem_type is int and expected_elem_type is warp.types.int32)
+                     or (
+                         issubclass(elem_type, np.number)
+                         and warp.types.np_dtype_to_warp_type[np.dtype(elem_type)] is expected_elem_type
+                     )
+                 ):
+                     # The parameter value has a type not matching the type defined
+                     # for the corresponding argument.
+                     return (False, None)
+
+                 if elem_type in warp.types.int_types:
+                     # Pass the value through the expected integer type
+                     # in order to evaluate any integer wrapping.
+                     # For example `uint8(-1)` should result in the value `-255`.
+                     arr = tuple(elem_type._type_(x.value).value for x in arr)
+                 elif elem_type in warp.types.float_types:
+                     # Extract the floating-point values.
+                     arr = tuple(x.value for x in arr)
+
+                 c_param = arg_type()
+                 if warp.types.type_is_matrix(arg_type):
+                     rows, cols = arg_type._shape_
+                     for i in range(rows):
+                         idx_start = i * cols
+                         idx_end = idx_start + cols
+                         c_param[i] = arr[idx_start:idx_end]
+                 else:
+                     c_param[:] = arr
+
+                 uses_non_warp_array_type = True
+
+             c_params.append(ctypes.byref(c_param))
+         else:
+             if issubclass(arg_type, ctypes.Array):
+                 return (False, None)
+
+             if not (
+                 isinstance(param, arg_type)
+                 or (type(param) is float and arg_type is warp.types.float32)
+                 or (type(param) is int and arg_type is warp.types.int32)
+                 or warp.types.np_dtype_to_warp_type.get(getattr(param, "dtype", None)) is arg_type
+             ):
+                 return (False, None)
+
+             if type(param) in warp.types.scalar_types:
+                 param = param.value
+
+             # try to pack as a scalar type
+             if arg_type == warp.types.float16:
+                 c_params.append(arg_type._type_(warp.types.float_to_half_bits(param)))
+             else:
+                 c_params.append(arg_type._type_(param))
+
+     # returns the corresponding ctype for a scalar or vector warp type
+     value_type = func.value_func(None, None, None)
+     if value_type == float:
+         value_ctype = ctypes.c_float
+     elif value_type == int:
+         value_ctype = ctypes.c_int32
+     elif issubclass(value_type, (ctypes.Array, ctypes.Structure)):
+         value_ctype = value_type
+     else:
+         # scalar type
+         value_ctype = value_type._type_
+
+     # construct return value (passed by address)
+     ret = value_ctype()
+     ret_addr = ctypes.c_void_p(ctypes.addressof(ret))
+     c_params.append(ret_addr)
+
+     # Call the built-in function from Warp's dll.
+     c_func(*c_params)
+
+     if uses_non_warp_array_type:
+         warp.utils.warn(
+             "Support for built-in functions called with non-Warp array types, "
+             "such as lists, tuples, NumPy arrays, and others, will be dropped "
+             "in the future. Use a Warp type such as `wp.vec`, `wp.mat`, "
+             "`wp.quat`, or `wp.transform`.",
+             DeprecationWarning,
+             stacklevel=3,
+         )
+
+     if issubclass(value_ctype, ctypes.Array) or issubclass(value_ctype, ctypes.Structure):
+         # return vector types as ctypes
+         return (True, ret)
+
+     if value_type == warp.types.float16:
+         return (True, warp.types.half_bits_to_float(ret.value))
+
+     # return scalar types as int/float
+     return (True, ret.value)
+
+
  class KernelHooks:
      def __init__(self, forward, backward):
          self.forward = forward
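
A short sketch of what the new call_builtin() path above means when exported builtins are called directly from Python (not part of the diff; wp.length and wp.dot are existing exported builtins used here purely for illustration):

    import numpy as np
    import warp as wp

    wp.init()

    # Warp value types resolve an overload directly.
    v = wp.vec3(3.0, 4.0, 0.0)
    print(wp.length(v))  # 5.0

    # Tuples, lists, and NumPy arrays are still flattened and packed, but
    # call_builtin() now emits a DeprecationWarning for these non-Warp types.
    print(wp.length((3.0, 4.0, 0.0)))
    print(wp.dot(np.array([1.0, 0.0, 0.0], dtype=np.float32), wp.vec3(0.0, 1.0, 0.0)))
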
@@ -391,13 +525,23 @@ class KernelHooks:

  # caches source and compiled entry points for a kernel (will be populated after module loads)
  class Kernel:
-     def __init__(self, func, key, module, options=None):
+     def __init__(self, func, key=None, module=None, options=None, code_transformers=[]):
          self.func = func
-         self.module = module
-         self.key = key
+
+         if module is None:
+             self.module = get_module(func.__module__)
+         else:
+             self.module = module
+
+         if key is None:
+             unique_key = self.module.generate_unique_kernel_key(func.__name__)
+             self.key = unique_key
+         else:
+             self.key = key
+
          self.options = {} if options is None else options

-         self.adj = warp.codegen.Adjoint(func)
+         self.adj = warp.codegen.Adjoint(func, transformers=code_transformers)

          # check if generic
          self.is_generic = False
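
Because module and key are now optional, kernels can also be constructed programmatically; a hedged sketch (the @wp.kernel decorator remains the usual spelling, and Kernel is reached here via warp.context):

    import warp as wp

    wp.init()

    def scale_kernel(a: wp.array(dtype=float), s: float):
        i = wp.tid()
        a[i] = a[i] * s

    # The module is looked up from scale_kernel.__module__ and the key is
    # auto-generated (e.g. "scale_kernel_0") via generate_unique_kernel_key().
    k = wp.context.Kernel(func=scale_kernel)

    a = wp.zeros(8, dtype=float)
    wp.launch(k, dim=a.shape[0], inputs=[a, 2.0])
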
@@ -415,8 +559,8 @@ class Kernel:
          # argument indices by name
          self.arg_indices = dict((a.label, i) for i, a in enumerate(self.adj.args))

-         if module:
-             module.register_kernel(self)
+         if self.module:
+             self.module.register_kernel(self)

      def infer_argument_types(self, args):
          template_types = list(self.adj.arg_types.values())
@@ -425,44 +569,8 @@ class Kernel:
              raise RuntimeError(f"Invalid number of arguments for kernel {self.key}")

          arg_names = list(self.adj.arg_types.keys())
-         arg_types = []
-
-         for i in range(len(args)):
-             arg = args[i]
-             arg_type = type(arg)
-             if arg_type in warp.types.array_types:
-                 arg_types.append(arg_type(dtype=arg.dtype, ndim=arg.ndim))
-             elif arg_type in warp.types.scalar_types:
-                 arg_types.append(arg_type)
-             elif arg_type in [int, float]:
-                 # canonicalize type
-                 arg_types.append(warp.types.type_to_warp(arg_type))
-             elif hasattr(arg_type, "_wp_scalar_type_"):
-                 # vector/matrix type
-                 arg_types.append(arg_type)
-             elif issubclass(arg_type, warp.codegen.StructInstance):
-                 # a struct
-                 arg_types.append(arg._struct_)
-             # elif arg_type in [warp.types.launch_bounds_t, warp.types.shape_t, warp.types.range_t]:
-             # arg_types.append(arg_type)
-             # elif arg_type in [warp.hash_grid_query_t, warp.mesh_query_aabb_t, warp.bvh_query_t]:
-             # arg_types.append(arg_type)
-             elif arg is None:
-                 # allow passing None for arrays
-                 t = template_types[i]
-                 if warp.types.is_array(t):
-                     arg_types.append(type(t)(dtype=t.dtype, ndim=t.ndim))
-                 else:
-                     raise TypeError(
-                         f"Unable to infer the type of argument '{arg_names[i]}' for kernel {self.key}, got None"
-                     )
-             else:
-                 # TODO: attempt to figure out if it's a vector/matrix type given as a numpy array, list, etc.
-                 raise TypeError(
-                     f"Unable to infer the type of argument '{arg_names[i]}' for kernel {self.key}, got {arg_type}"
-                 )

-         return arg_types
+         return warp.types.infer_argument_types(args, template_types, arg_names)

      def add_overload(self, arg_types):
          if len(arg_types) != len(self.adj.arg_types):
@@ -529,7 +637,7 @@ def func(f):
      name = warp.codegen.make_full_qualified_name(f)

      m = get_module(f.__module__)
-     func = Function(
+     Function(
          func=f, key=name, namespace="", module=m, value_func=None
      ) # value_type not known yet, will be inferred during Adjoint.build()

@@ -537,6 +645,167 @@ def func(f):
      return m.functions[name]


+ def func_native(snippet, adj_snippet=None):
+     """
+     Decorator to register native code snippet, @func_native
+     """
+
+     def snippet_func(f):
+         name = warp.codegen.make_full_qualified_name(f)
+
+         m = get_module(f.__module__)
+         func = Function(
+             func=f, key=name, namespace="", module=m, native_snippet=snippet, adj_native_snippet=adj_snippet
+         ) # cuda snippets do not have a return value_type
+
+         return m.functions[name]
+
+     return snippet_func
+
+
+ def func_grad(forward_fn):
+     """
+     Decorator to register a custom gradient function for a given forward function.
+     The function signature must correspond to one of the function overloads in the following way:
+     the first part of the input arguments are the original input variables with the same types as their
+     corresponding arguments in the original function, and the second part of the input arguments are the
+     adjoint variables of the output variables (if available) of the original function with the same types as the
+     output variables. The function must not return anything.
+     """
+
+     def wrapper(grad_fn):
+         generic = any(warp.types.type_is_generic(x) for x in forward_fn.input_types.values())
+         if generic:
+             raise RuntimeError(
+                 f"Cannot define custom grad definition for {forward_fn.key} since functions with generic input arguments are not yet supported."
+             )
+
+         reverse_args = {}
+         reverse_args.update(forward_fn.input_types)
+
+         # create temporary Adjoint instance to analyze the function signature
+         adj = warp.codegen.Adjoint(
+             grad_fn, skip_forward_codegen=True, skip_reverse_codegen=False, transformers=forward_fn.adj.transformers
+         )
+
+         from warp.types import types_equal
+
+         grad_args = adj.args
+         grad_sig = warp.types.get_signature([arg.type for arg in grad_args], func_name=forward_fn.key)
+
+         generic = any(warp.types.type_is_generic(x.type) for x in grad_args)
+         if generic:
+             raise RuntimeError(
+                 f"Cannot define custom grad definition for {forward_fn.key} since the provided grad function has generic input arguments."
+             )
+
+         def match_function(f):
+             # check whether the function overload f matches the signature of the provided gradient function
+             if not hasattr(f.adj, "return_var"):
+                 f.adj.build(None)
+             expected_args = list(f.input_types.items())
+             if f.adj.return_var is not None:
+                 expected_args += [(f"adj_ret_{var.label}", var.type) for var in f.adj.return_var]
+             if len(grad_args) != len(expected_args):
+                 return False
+             if any(not types_equal(a.type, exp_type) for a, (_, exp_type) in zip(grad_args, expected_args)):
+                 return False
+             return True
+
+         def add_custom_grad(f: Function):
+             # register custom gradient function
+             f.custom_grad_func = Function(
+                 grad_fn,
+                 key=f.key,
+                 namespace=f.namespace,
+                 input_types=reverse_args,
+                 value_func=None,
+                 module=f.module,
+                 template_func=f.template_func,
+                 skip_forward_codegen=True,
+                 custom_reverse_mode=True,
+                 custom_reverse_num_input_args=len(f.input_types),
+                 skip_adding_overload=False,
+                 code_transformers=f.adj.transformers,
+             )
+             f.adj.skip_reverse_codegen = True
+
+         if hasattr(forward_fn, "user_overloads") and len(forward_fn.user_overloads):
+             # find matching overload for which this grad function is defined
+             for sig, f in forward_fn.user_overloads.items():
+                 if not grad_sig.startswith(sig):
+                     continue
+                 if match_function(f):
+                     add_custom_grad(f)
+                     return
+             raise RuntimeError(
+                 f"No function overload found for gradient function {grad_fn.__qualname__} for function {forward_fn.key}"
+             )
+         else:
+             # resolve return variables
+             forward_fn.adj.build(None)
+
+             expected_args = list(forward_fn.input_types.items())
+             if forward_fn.adj.return_var is not None:
+                 expected_args += [(f"adj_ret_{var.label}", var.type) for var in forward_fn.adj.return_var]
+
+             # check if the signature matches this function
+             if match_function(forward_fn):
+                 add_custom_grad(forward_fn)
+             else:
+                 raise RuntimeError(
+                     f"Gradient function {grad_fn.__qualname__} for function {forward_fn.key} has an incorrect signature. The arguments must match the "
+                     "forward function arguments plus the adjoint variables corresponding to the return variables:"
+                     f"\n{', '.join(map(lambda nt: f'{nt[0]}: {nt[1].__name__}', expected_args))}"
+                 )
+
+     return wrapper
+
+
+ def func_replay(forward_fn):
+     """
+     Decorator to register a custom replay function for a given forward function.
+     The replay function is the function version that is called in the forward phase of the backward pass (replay mode) and corresponds to the forward function by default.
+     The provided function has to match the signature of one of the original forward function overloads.
+     """
+
+     def wrapper(replay_fn):
+         generic = any(warp.types.type_is_generic(x) for x in forward_fn.input_types.values())
+         if generic:
+             raise RuntimeError(
+                 f"Cannot define custom replay definition for {forward_fn.key} since functions with generic input arguments are not yet supported."
+             )
+
+         args = get_function_args(replay_fn)
+         arg_types = list(args.values())
+         generic = any(warp.types.type_is_generic(x) for x in arg_types)
+         if generic:
+             raise RuntimeError(
+                 f"Cannot define custom replay definition for {forward_fn.key} since the provided replay function has generic input arguments."
+             )
+
+         f = forward_fn.get_overload(arg_types)
+         if f is None:
+             inputs_str = ", ".join([f"{k}: {v.__name__}" for k, v in args.items()])
+             raise RuntimeError(
+                 f"Could not find forward definition of function {forward_fn.key} that matches custom replay definition with arguments:\n{inputs_str}"
+             )
+         f.custom_replay_func = Function(
+             replay_fn,
+             key=f"replay_{f.key}",
+             namespace=f.namespace,
+             input_types=f.input_types,
+             value_func=f.value_func,
+             module=f.module,
+             template_func=f.template_func,
+             skip_reverse_codegen=True,
+             skip_adding_overload=True,
+             code_transformers=f.adj.transformers,
+         )
+
+     return wrapper
+
+
  # decorator to register kernel, @kernel, custom_name may be a string
  # that creates a kernel with a different name from the actual function
  def kernel(f=None, *, enable_backward=None):
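
A usage sketch for the @func_grad decorator added above (not part of the diff; it assumes the decorators are exposed as wp.func_grad and friends, matching the warp/__init__.py and warp/stubs.py changes listed in this release). The gradient function receives the forward inputs followed by the adjoint of the return value and must not return anything:

    import warp as wp

    @wp.func
    def safe_sqrt(x: float):
        return wp.sqrt(x)

    # forward inputs first, then the adjoint of the return value
    @wp.func_grad(safe_sqrt)
    def adj_safe_sqrt(x: float, adj_ret: float):
        if x > 0.0:
            wp.adjoint[x] += adj_ret / (2.0 * wp.sqrt(x))

    @wp.kernel
    def sqrt_kernel(xs: wp.array(dtype=float), ys: wp.array(dtype=float)):
        i = wp.tid()
        ys[i] = safe_sqrt(xs[i])
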
@@ -664,6 +933,7 @@ def add_builtin(
      missing_grad=False,
      native_func=None,
      defaults=None,
+     require_original_output_arg=False,
  ):
      # wrap simple single-type functions with a value_func()
      if value_func is None:
@@ -676,7 +946,7 @@ def add_builtin(
          def initializer_list_func(args, templates):
              return False

-     if defaults == None:
+     if defaults is None:
          defaults = {}

      # Add specialized versions of this builtin if it's generic by matching arguments against
@@ -757,8 +1027,8 @@ def add_builtin(
          # on the generated argument list and skip generation if it fails.
          # This also gives us the return type, which we keep for later:
          try:
-             return_type = value_func([warp.codegen.Var("", t) for t in argtypes], {}, [])
-         except Exception as e:
+             return_type = value_func(argtypes, {}, [])
+         except Exception:
              continue

          # The return_type might just be vector_t(length=3,dtype=wp.float32), so we've got to match that
@@ -788,6 +1058,7 @@ def add_builtin(
              hidden=True,
              skip_replay=skip_replay,
              missing_grad=missing_grad,
+             require_original_output_arg=require_original_output_arg,
          )

          func = Function(
@@ -808,6 +1079,7 @@ def add_builtin(
          generic=generic,
          native_func=native_func,
          defaults=defaults,
+         require_original_output_arg=require_original_output_arg,
      )

      if key in builtin_functions:
@@ -817,7 +1089,7 @@ def add_builtin(

      # export means the function will be added to the `warp` module namespace
      # so that users can call it directly from the Python interpreter
-     if export == True:
+     if export:
          if hasattr(warp, key):
              # check that we haven't already created something at this location
              # if it's just an overload stub for auto-complete then overwrite it
@@ -884,6 +1156,8 @@ class ModuleBuilder:
          for func in module.functions.values():
              for f in func.user_overloads.values():
                  self.build_function(f)
+                 if f.custom_replay_func is not None:
+                     self.build_function(f.custom_replay_func)

          # build all kernel entry points
          for kernel in module.kernels.values():
@@ -900,12 +1174,13 @@ class ModuleBuilder:
          while stack:
              s = stack.pop()

-             if not s in structs:
-                 structs.append(s)
+             structs.append(s)

              for var in s.vars.values():
                  if isinstance(var.type, warp.codegen.Struct):
                      stack.append(var.type)
+                 elif isinstance(var.type, warp.types.array) and isinstance(var.type.dtype, warp.codegen.Struct):
+                     stack.append(var.type.dtype)

          # Build them in reverse to generate a correct dependency order.
          for s in reversed(structs):
@@ -931,7 +1206,7 @@ class ModuleBuilder:
          if not func.value_func:

              def wrap(adj):
-                 def value_type(args, kwds, templates):
+                 def value_type(arg_types, kwds, templates):
                      if adj.return_var is None or len(adj.return_var) == 0:
                          return None
                      if len(adj.return_var) == 1:
@@ -946,56 +1221,41 @@ class ModuleBuilder:
          # use dict to preserve import order
          self.functions[func] = None

-     def codegen_cpu(self):
-         cpp_source = ""
+     def codegen(self, device):
+         source = ""

          # code-gen structs
          for struct in self.structs.keys():
-             cpp_source += warp.codegen.codegen_struct(struct)
+             source += warp.codegen.codegen_struct(struct)

          # code-gen all imported functions
          for func in self.functions.keys():
-             cpp_source += warp.codegen.codegen_func(func.adj, device="cpu")
-
-         for kernel in self.module.kernels.values():
-             # each kernel gets an entry point in the module
-             if not kernel.is_generic:
-                 cpp_source += warp.codegen.codegen_kernel(kernel, device="cpu", options=self.options)
-                 cpp_source += warp.codegen.codegen_module(kernel, device="cpu")
+             if func.native_snippet is None:
+                 source += warp.codegen.codegen_func(
+                     func.adj, c_func_name=func.native_func, device=device, options=self.options
+                 )
              else:
-                 for k in kernel.overloads.values():
-                     cpp_source += warp.codegen.codegen_kernel(k, device="cpu", options=self.options)
-                     cpp_source += warp.codegen.codegen_module(k, device="cpu")
-
-         # add headers
-         cpp_source = warp.codegen.cpu_module_header + cpp_source
-
-         return cpp_source
-
-     def codegen_cuda(self):
-         cu_source = ""
-
-         # code-gen structs
-         for struct in self.structs.keys():
-             cu_source += warp.codegen.codegen_struct(struct)
-
-         # code-gen all imported functions
-         for func in self.functions.keys():
-             cu_source += warp.codegen.codegen_func(func.adj, device="cuda")
+                 source += warp.codegen.codegen_snippet(
+                     func.adj, name=func.key, snippet=func.native_snippet, adj_snippet=func.adj_native_snippet
+                 )

          for kernel in self.module.kernels.values():
+             # each kernel gets an entry point in the module
              if not kernel.is_generic:
-                 cu_source += warp.codegen.codegen_kernel(kernel, device="cuda", options=self.options)
-                 cu_source += warp.codegen.codegen_module(kernel, device="cuda")
+                 source += warp.codegen.codegen_kernel(kernel, device=device, options=self.options)
+                 source += warp.codegen.codegen_module(kernel, device=device)
              else:
                  for k in kernel.overloads.values():
-                     cu_source += warp.codegen.codegen_kernel(k, device="cuda", options=self.options)
-                     cu_source += warp.codegen.codegen_module(k, device="cuda")
+                     source += warp.codegen.codegen_kernel(k, device=device, options=self.options)
+                     source += warp.codegen.codegen_module(k, device=device)

          # add headers
-         cu_source = warp.codegen.cuda_module_header + cu_source
+         if device == "cpu":
+             source = warp.codegen.cpu_module_header + source
+         else:
+             source = warp.codegen.cuda_module_header + source

-         return cu_source
+         return source


  # -----------------------------------------------------
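
The codegen_cpu()/codegen_cuda() pair above is folded into a single codegen(device) method; a minimal sketch of how Module.load() drives it for both targets (simplified from the load() hunks further below):

    builder = ModuleBuilder(module, module.options)

    # one entry point for both backends; the device string selects the kernel
    # codegen and the cpu_module_header / cuda_module_header prepended at the end
    cpp_source = builder.codegen("cpu")
    cu_source = builder.codegen("cuda")
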
@@ -1014,7 +1274,6 @@ class Module:
          self.constants = []
          self.structs = {}

-         self.dll = None
          self.cpu_module = None
          self.cuda_modules = {} # module lookup by CUDA context

@@ -1058,6 +1317,10 @@ class Module:

          self.content_hash = None

+         # number of times module auto-generates kernel key for user
+         # used to ensure unique kernel keys
+         self.count = 0
+
      def register_struct(self, struct):
          self.structs[struct.key] = struct

@@ -1072,7 +1335,7 @@ class Module:
          # for a reload of module on next launch
          self.unload()

-     def register_function(self, func):
+     def register_function(self, func, skip_adding_overload=False):
          if func.key not in self.functions:
              self.functions[func.key] = func
          else:
@@ -1092,7 +1355,7 @@ class Module:
                  )
                  if sig == sig_existing:
                      self.functions[func.key] = func
-                 else:
+                 elif not skip_adding_overload:
                      func_existing.add_overload(func)

          self.find_references(func.adj)
@@ -1100,6 +1363,11 @@ class Module:
          # for a reload of module on next launch
          self.unload()

+     def generate_unique_kernel_key(self, key):
+         unique_key = f"{key}_{self.count}"
+         self.count += 1
+         return unique_key
+
      # collect all referenced functions / structs
      # given the AST of a function or kernel
      def find_references(self, adj):
@@ -1113,13 +1381,13 @@ class Module:
              if isinstance(node, ast.Call):
                  try:
                      # try to resolve the function
-                     func, _ = adj.resolve_path(node.func)
+                     func, _ = adj.resolve_static_expression(node.func, eval_types=False)

                      # if this is a user-defined function, add a module reference
                      if isinstance(func, warp.context.Function) and func.module is not None:
                          add_ref(func.module)

-                 except:
+                 except Exception:
                      # Lookups may fail for builtins, but that's ok.
                      # Lookups may also fail for functions in this module that haven't been imported yet,
                      # and that's ok too (not an external reference).
@@ -1139,6 +1407,11 @@ class Module:

          return getattr(obj, "__annotations__", {})

+     def get_type_name(type_hint):
+         if isinstance(type_hint, warp.codegen.Struct):
+             return get_type_name(type_hint.cls)
+         return type_hint
+
      def hash_recursive(module, visited):
          # Hash this module, including all referenced modules recursively.
          # The visited set tracks modules already visited to avoid circular references.
@@ -1151,7 +1424,8 @@ class Module:
              # struct source
              for struct in module.structs.values():
                  s = ",".join(
-                     "{}: {}".format(name, type_hint) for name, type_hint in get_annotations(struct.cls).items()
+                     "{}: {}".format(name, get_type_name(type_hint))
+                     for name, type_hint in get_annotations(struct.cls).items()
                  )
                  ch.update(bytes(s, "utf-8"))

@@ -1160,13 +1434,29 @@ class Module:
                  s = func.adj.source
                  ch.update(bytes(s, "utf-8"))

+                 if func.custom_grad_func:
+                     s = func.custom_grad_func.adj.source
+                     ch.update(bytes(s, "utf-8"))
+                 if func.custom_replay_func:
+                     s = func.custom_replay_func.adj.source
+
+                 # cache func arg types
+                 for arg, arg_type in func.adj.arg_types.items():
+                     s = f"{arg}: {get_type_name(arg_type)}"
+                     ch.update(bytes(s, "utf-8"))
+
              # kernel source
              for kernel in module.kernels.values():
-                 if not kernel.is_generic:
-                     ch.update(bytes(kernel.adj.source, "utf-8"))
-                 else:
-                     for k in kernel.overloads.values():
-                         ch.update(bytes(k.adj.source, "utf-8"))
+                 ch.update(bytes(kernel.adj.source, "utf-8"))
+                 # cache kernel arg types
+                 for arg, arg_type in kernel.adj.arg_types.items():
+                     s = f"{arg}: {get_type_name(arg_type)}"
+                     ch.update(bytes(s, "utf-8"))
+                 # for generic kernels the Python source is always the same,
+                 # but we hash the type signatures of all the overloads
+                 if kernel.is_generic:
+                     for sig in sorted(kernel.overloads.keys()):
+                         ch.update(bytes(sig, "utf-8"))


              module.content_hash = ch.digest()

@@ -1204,12 +1494,12 @@ class Module:
          return hash_recursive(self, visited=set())

      def load(self, device):
+         from warp.utils import ScopedTimer
+
          device = get_device(device)

          if device.is_cpu:
              # check if already loaded
-             if self.dll:
-                 return True
              if self.cpu_module:
                  return True
              # avoid repeated build attempts
@@ -1227,7 +1517,7 @@ class Module:
              if not warp.is_cuda_available():
                  raise RuntimeError("Failed to build CUDA module because CUDA is not available")

-         with warp.utils.ScopedTimer(f"Module {self.name} load on device '{device}'", active=not warp.config.quiet):
+         with ScopedTimer(f"Module {self.name} load on device '{device}'", active=not warp.config.quiet):
              build_path = warp.build.kernel_bin_dir
              gen_path = warp.build.kernel_gen_dir

@@ -1238,89 +1528,54 @@ class Module:

              module_name = "wp_" + self.name
              module_path = os.path.join(build_path, module_name)
-             obj_path = os.path.join(gen_path, module_name)
              module_hash = self.hash_module()

              builder = ModuleBuilder(self, self.options)

              if device.is_cpu:
-                 if runtime.llvm:
-                     if os.name == "nt":
-                         dll_path = obj_path + ".cpp.obj"
-                     else:
-                         dll_path = obj_path + ".cpp.o"
-                 else:
-                     if os.name == "nt":
-                         dll_path = module_path + ".dll"
-                     else:
-                         dll_path = module_path + ".so"
-
+                 obj_path = os.path.join(build_path, module_name)
+                 obj_path = obj_path + ".o"
                  cpu_hash_path = module_path + ".cpu.hash"

                  # check cache
-                 if warp.config.cache_kernels and os.path.isfile(cpu_hash_path) and os.path.isfile(dll_path):
+                 if warp.config.cache_kernels and os.path.isfile(cpu_hash_path) and os.path.isfile(obj_path):
                      with open(cpu_hash_path, "rb") as f:
                          cache_hash = f.read()

                      if cache_hash == module_hash:
-                         if runtime.llvm:
-                             runtime.llvm.load_obj(dll_path.encode("utf-8"), module_name.encode("utf-8"))
-                             self.cpu_module = module_name
-                             return True
-                         else:
-                             self.dll = warp.build.load_dll(dll_path)
-                             if self.dll is not None:
-                                 return True
+                         runtime.llvm.load_obj(obj_path.encode("utf-8"), module_name.encode("utf-8"))
+                         self.cpu_module = module_name
+                         return True

                  # build
                  try:
                      cpp_path = os.path.join(gen_path, module_name + ".cpp")

                      # write cpp sources
-                     cpp_source = builder.codegen_cpu()
+                     cpp_source = builder.codegen("cpu")

                      cpp_file = open(cpp_path, "w")
                      cpp_file.write(cpp_source)
                      cpp_file.close()

-                     bin_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "bin")
-                     if os.name == "nt":
-                         libs = ["warp.lib", f'/LIBPATH:"{bin_path}"']
-                         libs.append("/NOENTRY")
-                         libs.append("/NODEFAULTLIB")
-                     elif sys.platform == "darwin":
-                         libs = [f"-lwarp", f"-L{bin_path}", f"-Wl,-rpath,'{bin_path}'"]
-                     else:
-                         libs = ["-l:warp.so", f"-L{bin_path}", f"-Wl,-rpath,'{bin_path}'"]
-
-                     # build DLL or object code
-                     with warp.utils.ScopedTimer("Compile x86", active=warp.config.verbose):
-                         warp.build.build_dll(
-                             dll_path,
-                             [cpp_path],
-                             None,
-                             libs,
+                     # build object code
+                     with ScopedTimer("Compile x86", active=warp.config.verbose):
+                         warp.build.build_cpu(
+                             obj_path,
+                             cpp_path,
                              mode=self.options["mode"],
                              fast_math=self.options["fast_math"],
                              verify_fp=warp.config.verify_fp,
  )
1307
1570
 
1308
- if runtime.llvm:
1309
- # load the object code
1310
- obj_ext = ".obj" if os.name == "nt" else ".o"
1311
- obj_path = cpp_path + obj_ext
1312
- runtime.llvm.load_obj(obj_path.encode("utf-8"), module_name.encode("utf-8"))
1313
- self.cpu_module = module_name
1314
- else:
1315
- # load the DLL
1316
- self.dll = warp.build.load_dll(dll_path)
1317
- if self.dll is None:
1318
- raise Exception("Failed to load CPU module")
1319
-
1320
1571
  # update cpu hash
1321
1572
  with open(cpu_hash_path, "wb") as f:
1322
1573
  f.write(module_hash)
1323
1574
 
1575
+ # load the object code
1576
+ runtime.llvm.load_obj(obj_path.encode("utf-8"), module_name.encode("utf-8"))
1577
+ self.cpu_module = module_name
1578
+
1324
1579
  except Exception as e:
1325
1580
  self.cpu_build_failed = True
1326
1581
  raise (e)
@@ -1365,14 +1620,14 @@ class Module:
1365
1620
  cu_path = os.path.join(gen_path, module_name + ".cu")
1366
1621
 
1367
1622
  # write cuda sources
1368
- cu_source = builder.codegen_cuda()
1623
+ cu_source = builder.codegen("cuda")
1369
1624
 
1370
1625
  cu_file = open(cu_path, "w")
1371
1626
  cu_file.write(cu_source)
1372
1627
  cu_file.close()
1373
1628
 
1374
1629
  # generate PTX or CUBIN
1375
- with warp.utils.ScopedTimer("Compile CUDA", active=warp.config.verbose):
1630
+ with ScopedTimer("Compile CUDA", active=warp.config.verbose):
1376
1631
  warp.build.build_cuda(
1377
1632
  cu_path,
1378
1633
  output_arch,
@@ -1382,6 +1637,10 @@ class Module:
1382
1637
  verify_fp=warp.config.verify_fp,
1383
1638
  )
1384
1639
 
1640
+ # update cuda hash
1641
+ with open(cuda_hash_path, "wb") as f:
1642
+ f.write(module_hash)
1643
+
1385
1644
  # load the module
1386
1645
  cuda_module = warp.build.load_cuda(output_path, device)
1387
1646
  if cuda_module is not None:
@@ -1389,10 +1648,6 @@ class Module:
1389
1648
  else:
1390
1649
  raise Exception("Failed to load CUDA module")
1391
1650
 
1392
- # update cuda hash
1393
- with open(cuda_hash_path, "wb") as f:
1394
- f.write(module_hash)
1395
-
1396
1651
  except Exception as e:
1397
1652
  self.cuda_build_failed = True
1398
1653
  raise (e)
@@ -1400,10 +1655,6 @@ class Module:
1400
1655
  return True
1401
1656
 
1402
1657
  def unload(self):
1403
- if self.dll:
1404
- warp.build.unload_dll(self.dll)
1405
- self.dll = None
1406
-
1407
1658
  if self.cpu_module:
1408
1659
  runtime.llvm.unload_obj(self.cpu_module.encode("utf-8"))
1409
1660
  self.cpu_module = None
@@ -1438,17 +1689,13 @@ class Module:
1438
1689
  name = kernel.get_mangled_name()
1439
1690
 
1440
1691
  if device.is_cpu:
1441
- if self.cpu_module:
1442
- func = ctypes.CFUNCTYPE(None)
1443
- forward = func(
1444
- runtime.llvm.lookup(self.cpu_module.encode("utf-8"), (name + "_cpu_forward").encode("utf-8"))
1445
- )
1446
- backward = func(
1447
- runtime.llvm.lookup(self.cpu_module.encode("utf-8"), (name + "_cpu_backward").encode("utf-8"))
1448
- )
1449
- else:
1450
- forward = eval("self.dll." + name + "_cpu_forward")
1451
- backward = eval("self.dll." + name + "_cpu_backward")
1692
+ func = ctypes.CFUNCTYPE(None)
1693
+ forward = func(
1694
+ runtime.llvm.lookup(self.cpu_module.encode("utf-8"), (name + "_cpu_forward").encode("utf-8"))
1695
+ )
1696
+ backward = func(
1697
+ runtime.llvm.lookup(self.cpu_module.encode("utf-8"), (name + "_cpu_backward").encode("utf-8"))
1698
+ )
1452
1699
  else:
1453
1700
  cu_module = self.cuda_modules[device.context]
1454
1701
  forward = runtime.core.cuda_get_kernel(
@@ -1475,6 +1722,8 @@ class Allocator:
1475
1722
 
1476
1723
  def alloc(self, size_in_bytes, pinned=False):
1477
1724
  if self.device.is_cuda:
1725
+ if self.device.is_capturing:
1726
+ raise RuntimeError(f"Cannot allocate memory on device {self} while graph capture is active")
1478
1727
  return runtime.core.alloc_device(self.device.context, size_in_bytes)
1479
1728
  elif self.device.is_cpu:
1480
1729
  if pinned:
@@ -1484,6 +1733,8 @@ class Allocator:
1484
1733
 
1485
1734
  def free(self, ptr, size_in_bytes, pinned=False):
1486
1735
  if self.device.is_cuda:
1736
+ if self.device.is_capturing:
1737
+ raise RuntimeError(f"Cannot free memory on device {self} while graph capture is active")
1487
1738
  return runtime.core.free_device(self.device.context, ptr)
1488
1739
  elif self.device.is_cpu:
1489
1740
  if pinned:
@@ -1499,13 +1750,13 @@ class ContextGuard:
1499
1750
  def __enter__(self):
1500
1751
  if self.device.is_cuda:
1501
1752
  runtime.core.cuda_context_push_current(self.device.context)
1502
- elif is_cuda_available():
1753
+ elif is_cuda_driver_initialized():
1503
1754
  self.saved_context = runtime.core.cuda_context_get_current()
1504
1755
 
1505
1756
  def __exit__(self, exc_type, exc_value, traceback):
1506
1757
  if self.device.is_cuda:
1507
1758
  runtime.core.cuda_context_pop_current()
1508
- elif is_cuda_available():
1759
+ elif is_cuda_driver_initialized():
1509
1760
  runtime.core.cuda_context_set_current(self.saved_context)
1510
1761
 
1511
1762
 
@@ -1596,6 +1847,29 @@ class Event:
1596
1847
 
1597
1848
 
1598
1849
  class Device:
1850
+ """A device to allocate Warp arrays and to launch kernels on.
1851
+
1852
+ Attributes:
1853
+ ordinal: A Warp-specific integer label for the device. ``-1`` for CPU devices.
1854
+ name: A string label for the device. By default, CPU devices will be named according to the processor name,
1855
+ or ``"CPU"`` if the processor name cannot be determined.
1856
+ arch: An integer representing the compute capability version number calculated as
1857
+ ``10 * major + minor``. ``0`` for CPU devices.
1858
+ is_uva: A boolean indicating whether or not the device supports unified addressing.
1859
+ ``False`` for CPU devices.
1860
+ is_cubin_supported: A boolean indicating whether or not Warp's version of NVRTC can directly
1861
+ generate CUDA binary files (cubin) for this device's architecture. ``False`` for CPU devices.
1862
+ is_mempool_supported: A boolean indicating whether or not the device supports using the
1863
+ ``cuMemAllocAsync`` and ``cuMemPool`` family of APIs for stream-ordered memory allocations. ``False`` for
1864
+ CPU devices.
1865
+ is_primary: A boolean indicating whether or not this device's CUDA context is also the
1866
+ device's primary context.
1867
+ uuid: A string representing the UUID of the CUDA device. The UUID is in the same format used by
1868
+ ``nvidia-smi -L``. ``None`` for CPU devices.
1869
+ pci_bus_id: A string identifier for the CUDA device in the format ``[domain]:[bus]:[device]``, in which
1870
+ ``domain``, ``bus``, and ``device`` are all hexadecimal values. ``None`` for CPU devices.
1871
+ """
1872
+
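A short usage sketch of the attributes documented above (assumes Warp is installed and initialized; the printed values depend on the machine):

    import warp as wp

    wp.init()
    device = wp.get_device()    # current default device
    print(device.name, device.arch, device.is_cuda)
    if device.is_cuda:
        # same UUID format as reported by `nvidia-smi -L`
        print(device.uuid, device.pci_bus_id, device.is_mempool_supported)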
1599
1873
  def __init__(self, runtime, alias, ordinal=-1, is_primary=False, context=None):
1600
1874
  self.runtime = runtime
1601
1875
  self.alias = alias
@@ -1625,6 +1899,9 @@ class Device:
1625
1899
  self.arch = 0
1626
1900
  self.is_uva = False
1627
1901
  self.is_cubin_supported = False
1902
+ self.is_mempool_supported = False
1903
+ self.uuid = None
1904
+ self.pci_bus_id = None
1628
1905
 
1629
1906
  # TODO: add more device-specific dispatch functions
1630
1907
  self.memset = runtime.core.memset_host
@@ -1637,6 +1914,26 @@ class Device:
1637
1914
  self.is_uva = runtime.core.cuda_device_is_uva(ordinal)
1638
1915
  # check whether our NVRTC can generate CUBINs for this architecture
1639
1916
  self.is_cubin_supported = self.arch in runtime.nvrtc_supported_archs
1917
+ self.is_mempool_supported = runtime.core.cuda_device_is_memory_pool_supported(ordinal)
1918
+
1919
+ uuid_buffer = (ctypes.c_char * 16)()
1920
+ runtime.core.cuda_device_get_uuid(ordinal, uuid_buffer)
1921
+ uuid_byte_str = bytes(uuid_buffer).hex()
1922
+ self.uuid = f"GPU-{uuid_byte_str[0:8]}-{uuid_byte_str[8:12]}-{uuid_byte_str[12:16]}-{uuid_byte_str[16:20]}-{uuid_byte_str[20:]}"
1923
+
1924
+ pci_domain_id = runtime.core.cuda_device_get_pci_domain_id(ordinal)
1925
+ pci_bus_id = runtime.core.cuda_device_get_pci_bus_id(ordinal)
1926
+ pci_device_id = runtime.core.cuda_device_get_pci_device_id(ordinal)
1927
+ # This is (mis)named to correspond to the naming of cudaDeviceGetPCIBusId
1928
+ self.pci_bus_id = f"{pci_domain_id:08X}:{pci_bus_id:02X}:{pci_device_id:02X}"
1929
+
1930
+ # Warn the user of a possible misconfiguration of their system
1931
+ if not self.is_mempool_supported:
1932
+ warp.utils.warn(
1933
+ f"Support for stream ordered memory allocators was not detected on device {ordinal}. "
1934
+ "This can prevent the use of graphs and/or result in poor performance. "
1935
+ "Is the UVM driver enabled?"
1936
+ )
1640
1937
 
1641
1938
  # initialize streams unless context acquisition is postponed
1642
1939
  if self._context is not None:
@@ -1660,14 +1957,17 @@ class Device:
1660
1957
 
1661
1958
  @property
1662
1959
  def is_cpu(self):
1960
+ """A boolean indicating whether or not the device is a CPU device."""
1663
1961
  return self.ordinal < 0
1664
1962
 
1665
1963
  @property
1666
1964
  def is_cuda(self):
1965
+ """A boolean indicating whether or not the device is a CUDA device."""
1667
1966
  return self.ordinal >= 0
1668
1967
 
1669
1968
  @property
1670
1969
  def context(self):
1970
+ """The context associated with the device."""
1671
1971
  if self._context is not None:
1672
1972
  return self._context
1673
1973
  elif self.is_primary:
@@ -1682,10 +1982,16 @@ class Device:
1682
1982
 
1683
1983
  @property
1684
1984
  def has_context(self):
1985
+ """A boolean indicating whether or not the device has a CUDA context associated with it."""
1685
1986
  return self._context is not None
1686
1987
 
1687
1988
  @property
1688
1989
  def stream(self):
1990
+ """The stream associated with a CUDA device.
1991
+
1992
+ Raises:
1993
+ RuntimeError: The device is not a CUDA device.
1994
+ """
1689
1995
  if self.context:
1690
1996
  return self._stream
1691
1997
  else:
@@ -1703,6 +2009,7 @@ class Device:
1703
2009
 
1704
2010
  @property
1705
2011
  def has_stream(self):
2012
+ """A boolean indicating whether or not the device has a stream associated with it."""
1706
2013
  return self._stream is not None
1707
2014
 
1708
2015
  def __str__(self):
@@ -1778,10 +2085,10 @@ class Runtime:
1778
2085
  warp_lib = os.path.join(bin_path, "warp.so")
1779
2086
  llvm_lib = os.path.join(bin_path, "warp-clang.so")
1780
2087
 
1781
- self.core = warp.build.load_dll(warp_lib)
2088
+ self.core = self.load_dll(warp_lib)
1782
2089
 
1783
- if llvm_lib and os.path.exists(llvm_lib):
1784
- self.llvm = warp.build.load_dll(llvm_lib)
2090
+ if os.path.exists(llvm_lib):
2091
+ self.llvm = self.load_dll(llvm_lib)
1785
2092
  # setup c-types for warp-clang.dll
1786
2093
  self.llvm.lookup.restype = ctypes.c_uint64
1787
2094
  else:
@@ -1852,11 +2159,106 @@ class Runtime:
1852
2159
  ]
1853
2160
  self.core.array_copy_device.restype = ctypes.c_size_t
1854
2161
 
2162
+ self.core.array_fill_host.argtypes = [ctypes.c_void_p, ctypes.c_int, ctypes.c_void_p, ctypes.c_int]
2163
+ self.core.array_fill_host.restype = None
2164
+ self.core.array_fill_device.argtypes = [
2165
+ ctypes.c_void_p,
2166
+ ctypes.c_void_p,
2167
+ ctypes.c_int,
2168
+ ctypes.c_void_p,
2169
+ ctypes.c_int,
2170
+ ]
2171
+ self.core.array_fill_device.restype = None
2172
+
2173
+ self.core.array_sum_double_host.argtypes = [
2174
+ ctypes.c_uint64,
2175
+ ctypes.c_uint64,
2176
+ ctypes.c_int,
2177
+ ctypes.c_int,
2178
+ ctypes.c_int,
2179
+ ]
2180
+ self.core.array_sum_float_host.argtypes = [
2181
+ ctypes.c_uint64,
2182
+ ctypes.c_uint64,
2183
+ ctypes.c_int,
2184
+ ctypes.c_int,
2185
+ ctypes.c_int,
2186
+ ]
2187
+ self.core.array_sum_double_device.argtypes = [
2188
+ ctypes.c_uint64,
2189
+ ctypes.c_uint64,
2190
+ ctypes.c_int,
2191
+ ctypes.c_int,
2192
+ ctypes.c_int,
2193
+ ]
2194
+ self.core.array_sum_float_device.argtypes = [
2195
+ ctypes.c_uint64,
2196
+ ctypes.c_uint64,
2197
+ ctypes.c_int,
2198
+ ctypes.c_int,
2199
+ ctypes.c_int,
2200
+ ]
2201
+
2202
+ self.core.array_inner_double_host.argtypes = [
2203
+ ctypes.c_uint64,
2204
+ ctypes.c_uint64,
2205
+ ctypes.c_uint64,
2206
+ ctypes.c_int,
2207
+ ctypes.c_int,
2208
+ ctypes.c_int,
2209
+ ctypes.c_int,
2210
+ ]
2211
+ self.core.array_inner_float_host.argtypes = [
2212
+ ctypes.c_uint64,
2213
+ ctypes.c_uint64,
2214
+ ctypes.c_uint64,
2215
+ ctypes.c_int,
2216
+ ctypes.c_int,
2217
+ ctypes.c_int,
2218
+ ctypes.c_int,
2219
+ ]
2220
+ self.core.array_inner_double_device.argtypes = [
2221
+ ctypes.c_uint64,
2222
+ ctypes.c_uint64,
2223
+ ctypes.c_uint64,
2224
+ ctypes.c_int,
2225
+ ctypes.c_int,
2226
+ ctypes.c_int,
2227
+ ctypes.c_int,
2228
+ ]
2229
+ self.core.array_inner_float_device.argtypes = [
2230
+ ctypes.c_uint64,
2231
+ ctypes.c_uint64,
2232
+ ctypes.c_uint64,
2233
+ ctypes.c_int,
2234
+ ctypes.c_int,
2235
+ ctypes.c_int,
2236
+ ctypes.c_int,
2237
+ ]
2238
+
1855
2239
  self.core.array_scan_int_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int, ctypes.c_bool]
1856
2240
  self.core.array_scan_float_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int, ctypes.c_bool]
1857
2241
  self.core.array_scan_int_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int, ctypes.c_bool]
1858
2242
  self.core.array_scan_float_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int, ctypes.c_bool]
1859
2243
 
2244
+ self.core.radix_sort_pairs_int_host.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
2245
+ self.core.radix_sort_pairs_int_device.argtypes = [ctypes.c_uint64, ctypes.c_uint64, ctypes.c_int]
2246
+
2247
+ self.core.runlength_encode_int_host.argtypes = [
2248
+ ctypes.c_uint64,
2249
+ ctypes.c_uint64,
2250
+ ctypes.c_uint64,
2251
+ ctypes.c_uint64,
2252
+ ctypes.c_int,
2253
+ ]
2254
+ self.core.runlength_encode_int_device.argtypes = [
2255
+ ctypes.c_uint64,
2256
+ ctypes.c_uint64,
2257
+ ctypes.c_uint64,
2258
+ ctypes.c_uint64,
2259
+ ctypes.c_int,
2260
+ ]
2261
+
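These bindings back the array reduction, sort, and run-length-encode utilities. A usage sketch, assuming they are exposed through ``warp.utils`` as ``array_sum`` and ``array_inner`` (values chosen so the expected results are easy to check):

    import warp as wp
    import warp.utils

    wp.init()
    a = wp.full(1024, 2.0, dtype=float)
    b = wp.full(1024, 3.0, dtype=float)

    total = warp.utils.array_sum(a)     # expected 2048.0
    dot = warp.utils.array_inner(a, b)  # expected 6144.0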
1860
2262
  self.core.bvh_create_host.restype = ctypes.c_uint64
1861
2263
  self.core.bvh_create_host.argtypes = [ctypes.c_void_p, ctypes.c_void_p, ctypes.c_int]
1862
2264
 
@@ -1876,6 +2278,7 @@ class Runtime:
1876
2278
  warp.types.array_t,
1877
2279
  ctypes.c_int,
1878
2280
  ctypes.c_int,
2281
+ ctypes.c_int,
1879
2282
  ]
1880
2283
 
1881
2284
  self.core.mesh_create_device.restype = ctypes.c_uint64
@@ -1886,6 +2289,7 @@ class Runtime:
1886
2289
  warp.types.array_t,
1887
2290
  ctypes.c_int,
1888
2291
  ctypes.c_int,
2292
+ ctypes.c_int,
1889
2293
  ]
1890
2294
 
1891
2295
  self.core.mesh_destroy_host.argtypes = [ctypes.c_uint64]
@@ -1998,6 +2402,46 @@ class Runtime:
1998
2402
  ctypes.POINTER(ctypes.c_float),
1999
2403
  ]
2000
2404
 
2405
+ bsr_matrix_from_triplets_argtypes = [
2406
+ ctypes.c_int,
2407
+ ctypes.c_int,
2408
+ ctypes.c_int,
2409
+ ctypes.c_int,
2410
+ ctypes.c_uint64,
2411
+ ctypes.c_uint64,
2412
+ ctypes.c_uint64,
2413
+ ctypes.c_uint64,
2414
+ ctypes.c_uint64,
2415
+ ctypes.c_uint64,
2416
+ ]
2417
+ self.core.bsr_matrix_from_triplets_float_host.argtypes = bsr_matrix_from_triplets_argtypes
2418
+ self.core.bsr_matrix_from_triplets_double_host.argtypes = bsr_matrix_from_triplets_argtypes
2419
+ self.core.bsr_matrix_from_triplets_float_device.argtypes = bsr_matrix_from_triplets_argtypes
2420
+ self.core.bsr_matrix_from_triplets_double_device.argtypes = bsr_matrix_from_triplets_argtypes
2421
+
2422
+ self.core.bsr_matrix_from_triplets_float_host.restype = ctypes.c_int
2423
+ self.core.bsr_matrix_from_triplets_double_host.restype = ctypes.c_int
2424
+ self.core.bsr_matrix_from_triplets_float_device.restype = ctypes.c_int
2425
+ self.core.bsr_matrix_from_triplets_double_device.restype = ctypes.c_int
2426
+
2427
+ bsr_transpose_argtypes = [
2428
+ ctypes.c_int,
2429
+ ctypes.c_int,
2430
+ ctypes.c_int,
2431
+ ctypes.c_int,
2432
+ ctypes.c_int,
2433
+ ctypes.c_uint64,
2434
+ ctypes.c_uint64,
2435
+ ctypes.c_uint64,
2436
+ ctypes.c_uint64,
2437
+ ctypes.c_uint64,
2438
+ ctypes.c_uint64,
2439
+ ]
2440
+ self.core.bsr_transpose_float_host.argtypes = bsr_transpose_argtypes
2441
+ self.core.bsr_transpose_double_host.argtypes = bsr_transpose_argtypes
2442
+ self.core.bsr_transpose_float_device.argtypes = bsr_transpose_argtypes
2443
+ self.core.bsr_transpose_double_device.argtypes = bsr_transpose_argtypes
2444
+
2001
2445
  self.core.is_cuda_enabled.argtypes = None
2002
2446
  self.core.is_cuda_enabled.restype = ctypes.c_int
2003
2447
  self.core.is_cuda_compatibility_enabled.argtypes = None
@@ -2009,6 +2453,8 @@ class Runtime:
2009
2453
  self.core.cuda_driver_version.restype = ctypes.c_int
2010
2454
  self.core.cuda_toolkit_version.argtypes = None
2011
2455
  self.core.cuda_toolkit_version.restype = ctypes.c_int
2456
+ self.core.cuda_driver_is_initialized.argtypes = None
2457
+ self.core.cuda_driver_is_initialized.restype = ctypes.c_bool
2012
2458
 
2013
2459
  self.core.nvrtc_supported_arch_count.argtypes = None
2014
2460
  self.core.nvrtc_supported_arch_count.restype = ctypes.c_int
@@ -2025,6 +2471,14 @@ class Runtime:
2025
2471
  self.core.cuda_device_get_arch.restype = ctypes.c_int
2026
2472
  self.core.cuda_device_is_uva.argtypes = [ctypes.c_int]
2027
2473
  self.core.cuda_device_is_uva.restype = ctypes.c_int
2474
+ self.core.cuda_device_get_uuid.argtypes = [ctypes.c_int, ctypes.c_char * 16]
2475
+ self.core.cuda_device_get_uuid.restype = None
2476
+ self.core.cuda_device_get_pci_domain_id.argtypes = [ctypes.c_int]
2477
+ self.core.cuda_device_get_pci_domain_id.restype = ctypes.c_int
2478
+ self.core.cuda_device_get_pci_bus_id.argtypes = [ctypes.c_int]
2479
+ self.core.cuda_device_get_pci_bus_id.restype = ctypes.c_int
2480
+ self.core.cuda_device_get_pci_device_id.argtypes = [ctypes.c_int]
2481
+ self.core.cuda_device_get_pci_device_id.restype = ctypes.c_int
2028
2482
 
2029
2483
  self.core.cuda_context_get_current.argtypes = None
2030
2484
  self.core.cuda_context_get_current.restype = ctypes.c_void_p
@@ -2111,6 +2565,7 @@ class Runtime:
2111
2565
  ctypes.c_void_p,
2112
2566
  ctypes.c_void_p,
2113
2567
  ctypes.c_size_t,
2568
+ ctypes.c_int,
2114
2569
  ctypes.POINTER(ctypes.c_void_p),
2115
2570
  ]
2116
2571
  self.core.cuda_launch_kernel.restype = ctypes.c_size_t
@@ -2140,7 +2595,6 @@ class Runtime:
2140
2595
 
2141
2596
  self.device_map = {} # device lookup by alias
2142
2597
  self.context_map = {} # device lookup by context
2143
- self.graph_capture_map = {} # indicates whether graph capture is active for a given device
2144
2598
 
2145
2599
  # register CPU device
2146
2600
  cpu_name = platform.processor()
@@ -2149,7 +2603,6 @@ class Runtime:
2149
2603
  self.cpu_device = Device(self, "cpu")
2150
2604
  self.device_map["cpu"] = self.cpu_device
2151
2605
  self.context_map[None] = self.cpu_device
2152
- self.graph_capture_map[None] = False
2153
2606
 
2154
2607
  cuda_device_count = self.core.cuda_device_get_count()
2155
2608
 
@@ -2183,12 +2636,9 @@ class Runtime:
2183
2636
  self.set_default_device("cuda")
2184
2637
  else:
2185
2638
  self.set_default_device("cuda:0")
2186
- # save the initial CUDA device for backward compatibility with ScopedCudaGuard
2187
- self.initial_cuda_device = self.default_device
2188
2639
  else:
2189
2640
  # CUDA not available
2190
2641
  self.set_default_device("cpu")
2191
- self.initial_cuda_device = None
2192
2642
 
2193
2643
  # initialize kernel cache
2194
2644
  warp.build.init_kernel_cache(warp.config.kernel_cache_dir)
@@ -2230,6 +2680,23 @@ class Runtime:
2230
2680
  # global tape
2231
2681
  self.tape = None
2232
2682
 
2683
+ def load_dll(self, dll_path):
2684
+ try:
2685
+ if sys.version_info[0] > 3 or sys.version_info[0] == 3 and sys.version_info[1] >= 8:
2686
+ dll = ctypes.CDLL(dll_path, winmode=0)
2687
+ else:
2688
+ dll = ctypes.CDLL(dll_path)
2689
+ except OSError as e:
2690
+ if "GLIBCXX" in str(e):
2691
+ raise RuntimeError(
2692
+ f"Failed to load the shared library '{dll_path}'.\n"
2693
+ "The execution environment's libstdc++ runtime is older than the version the Warp library was built for.\n"
2694
+ "See https://nvidia.github.io/warp/_build/html/installation.html#conda-environments for details."
2695
+ ) from e
2696
+ else:
2697
+ raise RuntimeError(f"Failed to load the shared library '{dll_path}'") from e
2698
+ return dll
2699
+
2233
2700
  def get_device(self, ident: Devicelike = None) -> Device:
2234
2701
  if isinstance(ident, Device):
2235
2702
  return ident
@@ -2345,15 +2812,7 @@ def assert_initialized():
2345
2812
 
2346
2813
  # global entry points
2347
2814
  def is_cpu_available():
2348
- if runtime.llvm:
2349
- return True
2350
-
2351
- # initialize host build env (do this lazily) since
2352
- # it takes 5secs to run all the batch files to locate MSVC
2353
- if warp.config.host_compiler is None:
2354
- warp.config.host_compiler = warp.build.find_host_compiler()
2355
-
2356
- return warp.config.host_compiler != ""
2815
+ return runtime.llvm
2357
2816
 
2358
2817
 
2359
2818
  def is_cuda_available():
@@ -2364,6 +2823,21 @@ def is_device_available(device):
2364
2823
  return device in get_devices()
2365
2824
 
2366
2825
 
2826
+ def is_cuda_driver_initialized() -> bool:
2827
+ """Returns ``True`` if the CUDA driver is initialized.
2828
+
2829
+ This is a stricter test than ``is_cuda_available()`` since a CUDA driver
2830
+ call to ``cuCtxGetCurrent`` is made, and the result is compared to
2831
+ `CUDA_SUCCESS`. Note that `CUDA_SUCCESS` is returned by ``cuCtxGetCurrent``
2832
+ even if there is no context bound to the calling CPU thread.
2833
+
2834
+ This can be helpful in cases in which ``cuInit()`` was called before a fork.
2835
+ """
2836
+ assert_initialized()
2837
+
2838
+ return runtime.core.cuda_driver_is_initialized()
2839
+
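A short usage sketch of the new check; since it is defined in ``warp.context``, it is called through that module here (whether it is also re-exported at the package level is not shown in this diff):

    import warp as wp

    wp.init()
    # stricter than wp.is_cuda_available(): verifies that a cuCtxGetCurrent call
    # succeeds, which matters when cuInit() may have been called before a fork
    if wp.context.is_cuda_driver_initialized():
        wp.synchronize()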
2840
+
2367
2841
  def get_devices() -> List[Device]:
2368
2842
  """Returns a list of devices supported in this environment."""
2369
2843
 
@@ -2590,63 +3064,53 @@ def zeros(
2590
3064
  A warp.array object representing the allocation
2591
3065
  """
2592
3066
 
2593
- # backwards compatibility for case where users did wp.zeros(n, dtype=..), or wp.zeros(n=length, dtype=..)
2594
- if isinstance(shape, int):
2595
- shape = (shape,)
2596
- elif "n" in kwargs:
2597
- shape = (kwargs["n"],)
3067
+ arr = empty(shape=shape, dtype=dtype, device=device, requires_grad=requires_grad, pinned=pinned, **kwargs)
2598
3068
 
2599
- # compute num els
2600
- num_elements = 1
2601
- for d in shape:
2602
- num_elements *= d
3069
+ # use the CUDA default stream for synchronous behaviour with other streams
3070
+ with warp.ScopedStream(arr.device.null_stream):
3071
+ arr.zero_()
2603
3072
 
2604
- num_bytes = num_elements * warp.types.type_size_in_bytes(dtype)
3073
+ return arr
2605
3074
 
2606
- device = get_device(device)
2607
3075
 
2608
- ptr = None
2609
- grad_ptr = None
3076
+ def zeros_like(
3077
+ src: warp.array, device: Devicelike = None, requires_grad: bool = None, pinned: bool = None
3078
+ ) -> warp.array:
3079
+ """Return a zero-initialized array with the same type and dimensions as another array
2610
3080
 
2611
- if num_bytes > 0:
2612
- if device.is_capturing:
2613
- raise RuntimeError(f"Cannot allocate memory while graph capture is active on device {device}.")
2614
-
2615
- ptr = device.allocator.alloc(num_bytes, pinned=pinned)
2616
- if ptr is None:
2617
- raise RuntimeError("Memory allocation failed on device: {} for {} bytes".format(device, num_bytes))
2618
-
2619
- # use the CUDA default stream for synchronous behaviour with other streams
2620
- with warp.ScopedStream(device.null_stream):
2621
- device.memset(ptr, 0, num_bytes)
2622
-
2623
- if requires_grad:
2624
- # allocate gradient array
2625
- grad_ptr = device.allocator.alloc(num_bytes, pinned=pinned)
2626
- if grad_ptr is None:
2627
- raise RuntimeError("Memory allocation failed on device: {} for {} bytes".format(device, num_bytes))
2628
- with warp.ScopedStream(device.null_stream):
2629
- device.memset(grad_ptr, 0, num_bytes)
2630
-
2631
- # construct array
2632
- return warp.types.array(
2633
- dtype=dtype,
2634
- shape=shape,
2635
- capacity=num_bytes,
2636
- ptr=ptr,
2637
- grad_ptr=grad_ptr,
2638
- device=device,
2639
- owner=True,
2640
- requires_grad=requires_grad,
2641
- pinned=pinned,
2642
- )
3081
+ Args:
3082
+ src: The template array to use for shape, data type, and device
3083
+ device: The device where the new array will be created (defaults to src.device)
3084
+ requires_grad: Whether the array will be tracked for back propagation
3085
+ pinned: Whether the array uses pinned host memory (only applicable to CPU arrays)
3086
+
3087
+ Returns:
3088
+ A warp.array object representing the allocation
3089
+ """
2643
3090
 
3091
+ arr = empty_like(src, device=device, requires_grad=requires_grad, pinned=pinned)
2644
3092
 
2645
- def zeros_like(src: warp.array, requires_grad: bool = None, pinned: bool = None) -> warp.array:
2646
- """Return a zero-initialized array with the same type and dimension of another array
3093
+ arr.zero_()
3094
+
3095
+ return arr
3096
+
3097
+
3098
+ def full(
3099
+ shape: Tuple = None,
3100
+ value=0,
3101
+ dtype=Any,
3102
+ device: Devicelike = None,
3103
+ requires_grad: bool = False,
3104
+ pinned: bool = False,
3105
+ **kwargs,
3106
+ ) -> warp.array:
3107
+ """Return an array with all elements initialized to the given value
2647
3108
 
2648
3109
  Args:
2649
- src: The template array to use for length, data type, and device
3110
+ shape: Array dimensions
3111
+ value: Element value
3112
+ dtype: Type of each element, e.g.: float, warp.vec3, warp.mat33, etc
3113
+ device: Device that array will live on
2650
3114
  requires_grad: Whether the array will be tracked for back propagation
2651
3115
  pinned: Whether the array uses pinned host memory (only applicable to CPU arrays)
2652
3116
 
@@ -2654,24 +3118,78 @@ def zeros_like(src: warp.array, requires_grad: bool = None, pinned: bool = None)
2654
3118
  A warp.array object representing the allocation
2655
3119
  """
2656
3120
 
2657
- if requires_grad is None:
2658
- if hasattr(src, "requires_grad"):
2659
- requires_grad = src.requires_grad
3121
+ if dtype == Any:
3122
+ # determine dtype from value
3123
+ value_type = type(value)
3124
+ if value_type == int:
3125
+ dtype = warp.int32
3126
+ elif value_type == float:
3127
+ dtype = warp.float32
3128
+ elif value_type in warp.types.scalar_types or hasattr(value_type, "_wp_scalar_type_"):
3129
+ dtype = value_type
3130
+ elif isinstance(value, warp.codegen.StructInstance):
3131
+ dtype = value._cls
3132
+ elif hasattr(value, "__len__"):
3133
+ # a sequence, assume it's a vector or matrix value
3134
+ try:
3135
+ # try to convert to a numpy array first
3136
+ na = np.array(value, copy=False)
3137
+ except Exception as e:
3138
+ raise ValueError(f"Failed to interpret the value as a vector or matrix: {e}")
3139
+
3140
+ # determine the scalar type
3141
+ scalar_type = warp.types.np_dtype_to_warp_type.get(na.dtype)
3142
+ if scalar_type is None:
3143
+ raise ValueError(f"Failed to convert {na.dtype} to a Warp data type")
3144
+
3145
+ # determine if vector or matrix
3146
+ if na.ndim == 1:
3147
+ dtype = warp.types.vector(na.size, scalar_type)
3148
+ elif na.ndim == 2:
3149
+ dtype = warp.types.matrix(na.shape, scalar_type)
3150
+ else:
3151
+ raise ValueError("Values with more than two dimensions are not supported")
2660
3152
  else:
2661
- requires_grad = False
3153
+ raise ValueError(f"Invalid value type for Warp array: {value_type}")
2662
3154
 
2663
- if pinned is None:
2664
- pinned = src.pinned
3155
+ arr = empty(shape=shape, dtype=dtype, device=device, requires_grad=requires_grad, pinned=pinned, **kwargs)
3156
+
3157
+ # use the CUDA default stream for synchronous behaviour with other streams
3158
+ with warp.ScopedStream(arr.device.null_stream):
3159
+ arr.fill_(value)
3160
+
3161
+ return arr
3162
+
3163
+
3164
+ def full_like(
3165
+ src: warp.array, value: Any, device: Devicelike = None, requires_grad: bool = None, pinned: bool = None
3166
+ ) -> warp.array:
3167
+ """Return an array with all elements initialized to the given value, with the same type and dimensions as another array
3168
+
3169
+ Args:
3170
+ src: The template array to use for shape, data type, and device
3171
+ value: Element value
3172
+ device: The device where the new array will be created (defaults to src.device)
3173
+ requires_grad: Whether the array will be tracked for back propagation
3174
+ pinned: Whether the array uses pinned host memory (only applicable to CPU arrays)
3175
+
3176
+ Returns:
3177
+ A warp.array object representing the allocation
3178
+ """
3179
+
3180
+ arr = empty_like(src, device=device, requires_grad=requires_grad, pinned=pinned)
3181
+
3182
+ arr.fill_(value)
2665
3183
 
2666
- arr = zeros(shape=src.shape, dtype=src.dtype, device=src.device, requires_grad=requires_grad, pinned=pinned)
2667
3184
  return arr
2668
3185
 
2669
3186
 
2670
- def clone(src: warp.array, requires_grad: bool = None, pinned: bool = None) -> warp.array:
3187
+ def clone(src: warp.array, device: Devicelike = None, requires_grad: bool = None, pinned: bool = None) -> warp.array:
2671
3188
  """Clone an existing array, allocates a copy of the src memory
2672
3189
 
2673
3190
  Args:
2674
3191
  src: The source array to copy
3192
+ device: The device where the new array will be created (defaults to src.device)
2675
3193
  requires_grad: Whether the array will be tracked for back propagation
2676
3194
  pinned: Whether the array uses pinned host memory (only applicable to CPU arrays)
2677
3195
 
@@ -2679,19 +3197,11 @@ def clone(src: warp.array, requires_grad: bool = None, pinned: bool = None) -> w
2679
3197
  A warp.array object representing the allocation
2680
3198
  """
2681
3199
 
2682
- if requires_grad is None:
2683
- if hasattr(src, "requires_grad"):
2684
- requires_grad = src.requires_grad
2685
- else:
2686
- requires_grad = False
2687
-
2688
- if pinned is None:
2689
- pinned = src.pinned
3200
+ arr = empty_like(src, device=device, requires_grad=requires_grad, pinned=pinned)
2690
3201
 
2691
- dest = empty(shape=src.shape, dtype=src.dtype, device=src.device, requires_grad=requires_grad, pinned=pinned)
2692
- copy(dest, src)
3202
+ warp.copy(arr, src)
2693
3203
 
2694
- return dest
3204
+ return arr
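``clone()``, ``empty_like()``, ``zeros_like()``, and ``full_like()`` now accept a ``device`` argument, so the copy can be materialized on a different device than the source. A short sketch (device names are illustrative and assume a CUDA device is present):

    import warp as wp

    wp.init()
    a = wp.array([1.0, 2.0, 3.0], dtype=float, device="cpu")
    b = wp.clone(a, device="cuda:0")       # copy of `a` allocated on the GPU
    c = wp.empty_like(a, device="cuda:0")  # uninitialized, same shape and dtype
    d = wp.zeros_like(a, device="cuda:0")  # zero-initialized, same shape and dtype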
2695
3205
 
2696
3206
 
2697
3207
  def empty(
@@ -2705,7 +3215,7 @@ def empty(
2705
3215
  """Returns an uninitialized array
2706
3216
 
2707
3217
  Args:
2708
- n: Number of elements
3218
+ shape: Array dimensions
2709
3219
  dtype: Type of each element, e.g.: `warp.vec3`, `warp.mat33`, etc
2710
3220
  device: Device that array will live on
2711
3221
  requires_grad: Whether the array will be tracked for back propagation
@@ -2715,15 +3225,26 @@ def empty(
2715
3225
  A warp.array object representing the allocation
2716
3226
  """
2717
3227
 
2718
- # todo: implement uninitialized allocation
2719
- return zeros(shape, dtype, device, requires_grad=requires_grad, pinned=pinned, **kwargs)
3228
+ # backwards compatibility for case where users called wp.empty(n=length, ...)
3229
+ if "n" in kwargs:
3230
+ shape = (kwargs["n"],)
3231
+ del kwargs["n"]
3232
+
3233
+ # ensure shape is specified, even if creating a zero-sized array
3234
+ if shape is None:
3235
+ shape = 0
3236
+
3237
+ return warp.array(shape=shape, dtype=dtype, device=device, requires_grad=requires_grad, pinned=pinned, **kwargs)
2720
3238
 
2721
3239
 
2722
- def empty_like(src: warp.array, requires_grad: bool = None, pinned: bool = None) -> warp.array:
3240
+ def empty_like(
3241
+ src: warp.array, device: Devicelike = None, requires_grad: bool = None, pinned: bool = None
3242
+ ) -> warp.array:
2723
3243
  """Return an uninitialized array with the same type and dimension of another array
2724
3244
 
2725
3245
  Args:
2726
- src: The template array to use for length, data type, and device
3246
+ src: The template array to use for shape, data type, and device
3247
+ device: The device where the new array will be created (defaults to src.device)
2727
3248
  requires_grad: Whether the array will be tracked for back propagation
2728
3249
  pinned: Whether the array uses pinned host memory (only applicable to CPU arrays)
2729
3250
 
@@ -2731,6 +3252,9 @@ def empty_like(src: warp.array, requires_grad: bool = None, pinned: bool = None)
2731
3252
  A warp.array object representing the allocation
2732
3253
  """
2733
3254
 
3255
+ if device is None:
3256
+ device = src.device
3257
+
2734
3258
  if requires_grad is None:
2735
3259
  if hasattr(src, "requires_grad"):
2736
3260
  requires_grad = src.requires_grad
@@ -2738,14 +3262,246 @@ def empty_like(src: warp.array, requires_grad: bool = None, pinned: bool = None)
2738
3262
  requires_grad = False
2739
3263
 
2740
3264
  if pinned is None:
2741
- pinned = src.pinned
3265
+ if hasattr(src, "pinned"):
3266
+ pinned = src.pinned
3267
+ else:
3268
+ pinned = False
2742
3269
 
2743
- arr = empty(shape=src.shape, dtype=src.dtype, device=src.device, requires_grad=requires_grad, pinned=pinned)
3270
+ arr = empty(shape=src.shape, dtype=src.dtype, device=device, requires_grad=requires_grad, pinned=pinned)
2744
3271
  return arr
2745
3272
 
2746
3273
 
2747
- def from_numpy(arr, dtype, device: Devicelike = None, requires_grad=False):
2748
- return warp.array(data=arr, dtype=dtype, device=device, requires_grad=requires_grad)
3274
+ def from_numpy(
3275
+ arr: np.ndarray,
3276
+ dtype: Optional[type] = None,
3277
+ shape: Optional[Sequence[int]] = None,
3278
+ device: Optional[Devicelike] = None,
3279
+ requires_grad: bool = False,
3280
+ ) -> warp.array:
3281
+ if dtype is None:
3282
+ base_type = warp.types.np_dtype_to_warp_type.get(arr.dtype)
3283
+ if base_type is None:
3284
+ raise RuntimeError("Unsupported NumPy data type '{}'.".format(arr.dtype))
3285
+
3286
+ dim_count = len(arr.shape)
3287
+ if dim_count == 2:
3288
+ dtype = warp.types.vector(length=arr.shape[1], dtype=base_type)
3289
+ elif dim_count == 3:
3290
+ dtype = warp.types.matrix(shape=(arr.shape[1], arr.shape[2]), dtype=base_type)
3291
+ else:
3292
+ dtype = base_type
3293
+
3294
+ return warp.array(
3295
+ data=arr,
3296
+ dtype=dtype,
3297
+ shape=shape,
3298
+ owner=False,
3299
+ device=device,
3300
+ requires_grad=requires_grad,
3301
+ )
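``from_numpy()`` now infers a Warp dtype from the NumPy array when none is given: 2-D input maps to a vector dtype and 3-D input to a matrix dtype, with the scalar type taken from the NumPy dtype. A short sketch, assuming the function is re-exported as ``wp.from_numpy`` (otherwise it is reachable as ``warp.context.from_numpy``):

    import numpy as np
    import warp as wp

    wp.init()
    pts = np.zeros((100, 3), dtype=np.float32)
    v = wp.from_numpy(pts)                                     # vector(length=3, dtype=float32)
    m = wp.from_numpy(np.zeros((10, 3, 3), dtype=np.float32))  # matrix(shape=(3, 3), dtype=float32)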
3302
+
3303
+
3304
+ # given a kernel destination argument type and a value, convert
3305
+ # to a c-type that can be passed to a kernel
3306
+ def pack_arg(kernel, arg_type, arg_name, value, device, adjoint=False):
3307
+ if warp.types.is_array(arg_type):
3308
+ if value is None:
3309
+ # allow for NULL arrays
3310
+ return arg_type.__ctype__()
3311
+
3312
+ else:
3313
+ # check for array type
3314
+ # - in forward passes, array types have to match
3315
+ # - in backward passes, indexed array gradients are regular arrays
3316
+ if adjoint:
3317
+ array_matches = isinstance(value, warp.array)
3318
+ else:
3319
+ array_matches = type(value) is type(arg_type)
3320
+
3321
+ if not array_matches:
3322
+ adj = "adjoint " if adjoint else ""
3323
+ raise RuntimeError(
3324
+ f"Error launching kernel '{kernel.key}', {adj}argument '{arg_name}' expects an array of type {type(arg_type)}, but passed value has type {type(value)}."
3325
+ )
3326
+
3327
+ # check subtype
3328
+ if not warp.types.types_equal(value.dtype, arg_type.dtype):
3329
+ adj = "adjoint " if adjoint else ""
3330
+ raise RuntimeError(
3331
+ f"Error launching kernel '{kernel.key}', {adj}argument '{arg_name}' expects an array with dtype={arg_type.dtype} but passed array has dtype={value.dtype}."
3332
+ )
3333
+
3334
+ # check dimensions
3335
+ if value.ndim != arg_type.ndim:
3336
+ adj = "adjoint " if adjoint else ""
3337
+ raise RuntimeError(
3338
+ f"Error launching kernel '{kernel.key}', {adj}argument '{arg_name}' expects an array with {arg_type.ndim} dimension(s) but the passed array has {value.ndim} dimension(s)."
3339
+ )
3340
+
3341
+ # check device
3342
+ # if a.device != device and not device.can_access(a.device):
3343
+ if value.device != device:
3344
+ raise RuntimeError(
3345
+ f"Error launching kernel '{kernel.key}', trying to launch on device='{device}', but input array for argument '{arg_name}' is on device={value.device}."
3346
+ )
3347
+
3348
+ return value.__ctype__()
3349
+
3350
+ elif isinstance(arg_type, warp.codegen.Struct):
3351
+ assert value is not None
3352
+ return value.__ctype__()
3353
+
3354
+ # try to convert to a value type (vec3, mat33, etc)
3355
+ elif issubclass(arg_type, ctypes.Array):
3356
+ if warp.types.types_equal(type(value), arg_type):
3357
+ return value
3358
+ else:
3359
+ # try constructing the required value from the argument (handles tuple / list, Gf.Vec3 case)
3360
+ try:
3361
+ return arg_type(value)
3362
+ except Exception:
3363
+ raise ValueError(f"Failed to convert argument for param {arg_name} to {type_str(arg_type)}")
3364
+
3365
+ elif isinstance(value, bool):
3366
+ return ctypes.c_bool(value)
3367
+
3368
+ elif isinstance(value, arg_type):
3369
+ try:
3370
+ # try to pack as a scalar type
3371
+ if arg_type is warp.types.float16:
3372
+ return arg_type._type_(warp.types.float_to_half_bits(value.value))
3373
+ else:
3374
+ return arg_type._type_(value.value)
3375
+ except Exception:
3376
+ raise RuntimeError(
3377
+ "Error launching kernel, unable to pack kernel parameter type "
3378
+ f"{type(value)} for param {arg_name}, expected {arg_type}"
3379
+ )
3380
+
3381
+ else:
3382
+ try:
3383
+ # try to pack as a scalar type
3384
+ if arg_type is warp.types.float16:
3385
+ return arg_type._type_(warp.types.float_to_half_bits(value))
3386
+ else:
3387
+ return arg_type._type_(value)
3388
+ except Exception as e:
3389
+ print(e)
3390
+ raise RuntimeError(
3391
+ "Error launching kernel, unable to pack kernel parameter type "
3392
+ f"{type(value)} for param {arg_name}, expected {arg_type}"
3393
+ )
3394
+
3395
+
3396
+ # represents all data required for a kernel launch
3397
+ # so that launches can be replayed quickly; to create one, use `wp.launch(..., record_cmd=True)`
3398
+ class Launch:
3399
+ def __init__(self, kernel, device, hooks=None, params=None, params_addr=None, bounds=None, max_blocks=0):
3400
+ # if not specified look up hooks
3401
+ if not hooks:
3402
+ module = kernel.module
3403
+ if not module.load(device):
3404
+ return
3405
+
3406
+ hooks = module.get_kernel_hooks(kernel, device)
3407
+
3408
+ # if not specified set a zero bound
3409
+ if not bounds:
3410
+ bounds = warp.types.launch_bounds_t(0)
3411
+
3412
+ # if not specified then build a list of default value params for args
3413
+ if not params:
3414
+ params = []
3415
+ params.append(bounds)
3416
+
3417
+ for a in kernel.adj.args:
3418
+ if isinstance(a.type, warp.types.array):
3419
+ params.append(a.type.__ctype__())
3420
+ elif isinstance(a.type, warp.codegen.Struct):
3421
+ params.append(a.type().__ctype__())
3422
+ else:
3423
+ params.append(pack_arg(kernel, a.type, a.label, 0, device, False))
3424
+
3425
+ kernel_args = [ctypes.c_void_p(ctypes.addressof(x)) for x in params]
3426
+ kernel_params = (ctypes.c_void_p * len(kernel_args))(*kernel_args)
3427
+
3428
+ params_addr = kernel_params
3429
+
3430
+ self.kernel = kernel
3431
+ self.hooks = hooks
3432
+ self.params = params
3433
+ self.params_addr = params_addr
3434
+ self.device = device
3435
+ self.bounds = bounds
3436
+ self.max_blocks = max_blocks
3437
+
3438
+ def set_dim(self, dim):
3439
+ self.bounds = warp.types.launch_bounds_t(dim)
3440
+
3441
+ # launch bounds always at index 0
3442
+ self.params[0] = self.bounds
3443
+
3444
+ # for CUDA kernels we need to update the address to each arg
3445
+ if self.params_addr:
3446
+ self.params_addr[0] = ctypes.c_void_p(ctypes.addressof(self.bounds))
3447
+
3448
+ # set kernel param at an index, will convert to ctype as necessary
3449
+ def set_param_at_index(self, index, value):
3450
+ arg_type = self.kernel.adj.args[index].type
3451
+ arg_name = self.kernel.adj.args[index].label
3452
+
3453
+ carg = pack_arg(self.kernel, arg_type, arg_name, value, self.device, False)
3454
+
3455
+ self.params[index + 1] = carg
3456
+
3457
+ # for CUDA kernels we need to update the address to each arg
3458
+ if self.params_addr:
3459
+ self.params_addr[index + 1] = ctypes.c_void_p(ctypes.addressof(carg))
3460
+
3461
+ # set kernel param at an index without any type conversion
3462
+ # args must be passed as ctypes or basic int / float types
3463
+ def set_param_at_index_from_ctype(self, index, value):
3464
+ if isinstance(value, ctypes.Structure):
3465
+ # not sure how to directly assign struct->struct without reallocating using ctypes
3466
+ self.params[index + 1] = value
3467
+
3468
+ # for CUDA kernels we need to update the address to each arg
3469
+ if self.params_addr:
3470
+ self.params_addr[index + 1] = ctypes.c_void_p(ctypes.addressof(value))
3471
+
3472
+ else:
3473
+ self.params[index + 1].__init__(value)
3474
+
3475
+ # set kernel param by argument name
3476
+ def set_param_by_name(self, name, value):
3477
+ for i, arg in enumerate(self.kernel.adj.args):
3478
+ if arg.label == name:
3479
+ self.set_param_at_index(i, value)
3480
+
3481
+ # set kernel param by argument name with no type conversions
3482
+ def set_param_by_name_from_ctype(self, name, value):
3483
+ # lookup argument index
3484
+ for i, arg in enumerate(self.kernel.adj.args):
3485
+ if arg.label == name:
3486
+ self.set_param_at_index_from_ctype(i, value)
3487
+
3488
+ # set all params
3489
+ def set_params(self, values):
3490
+ for i, v in enumerate(values):
3491
+ self.set_param_at_index(i, v)
3492
+
3493
+ # set all params without performing type-conversions
3494
+ def set_params_from_ctypes(self, values):
3495
+ for i, v in enumerate(values):
3496
+ self.set_param_at_index_from_ctype(i, v)
3497
+
3498
+ def launch(self) -> Any:
3499
+ if self.device.is_cpu:
3500
+ self.hooks.forward(*self.params)
3501
+ else:
3502
+ runtime.core.cuda_launch_kernel(
3503
+ self.device.context, self.hooks.forward, self.bounds.size, self.max_blocks, self.params_addr
3504
+ )
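A usage sketch of the new ``record_cmd=True`` path, which returns a ``Launch`` object that can be re-issued and re-parameterized without repeating argument packing (the ``scale`` kernel is a made-up example, not part of Warp):

    import warp as wp

    wp.init()

    @wp.kernel
    def scale(a: wp.array(dtype=float), s: float):
        i = wp.tid()
        a[i] = a[i] * s

    a = wp.zeros(1024, dtype=float)
    cmd = wp.launch(scale, dim=1024, inputs=[a, 2.0], record_cmd=True)
    cmd.launch()                     # replay the recorded launch
    cmd.set_param_by_name("s", 3.0)  # update a single argument between replays
    cmd.launch()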
2749
3505
 
2750
3506
 
2751
3507
  def launch(
@@ -2759,6 +3515,8 @@ def launch(
2759
3515
  stream: Stream = None,
2760
3516
  adjoint=False,
2761
3517
  record_tape=True,
3518
+ record_cmd=False,
3519
+ max_blocks=0,
2762
3520
  ):
2763
3521
  """Launch a Warp kernel on the target device
2764
3522
 
@@ -2774,6 +3532,10 @@ def launch(
2774
3532
  device: The device to launch on (optional)
2775
3533
  stream: The stream to launch on (optional)
2776
3534
  adjoint: Whether to run forward or backward pass (typically use False)
3535
+ record_tape: When True, the launch will be recorded onto the global wp.Tape() object when present
3536
+ record_cmd: When True, the launch is returned as a ``Launch`` command object; the launch does not occur until the user calls ``cmd.launch()``
3537
+ max_blocks: The maximum number of CUDA thread blocks to use. Only has an effect for CUDA kernel launches.
3538
+ If negative or zero, the maximum hardware value will be used.
2777
3539
  """
2778
3540
 
2779
3541
  assert_initialized()
@@ -2785,7 +3547,7 @@ def launch(
2785
3547
  device = runtime.get_device(device)
2786
3548
 
2787
3549
  # check function is a Kernel
2788
- if isinstance(kernel, Kernel) == False:
3550
+ if not isinstance(kernel, Kernel):
2789
3551
  raise RuntimeError("Error launching kernel, can only launch functions decorated with @wp.kernel.")
2790
3552
 
2791
3553
  # debugging aid
@@ -2806,85 +3568,7 @@ def launch(
2806
3568
  arg_type = kernel.adj.args[i].type
2807
3569
  arg_name = kernel.adj.args[i].label
2808
3570
 
2809
- if warp.types.is_array(arg_type):
2810
- if a is None:
2811
- # allow for NULL arrays
2812
- params.append(arg_type.__ctype__())
2813
-
2814
- else:
2815
- # check for array type
2816
- # - in forward passes, array types have to match
2817
- # - in backward passes, indexed array gradients are regular arrays
2818
- if adjoint:
2819
- array_matches = type(a) == warp.array
2820
- else:
2821
- array_matches = type(a) == type(arg_type)
2822
-
2823
- if not array_matches:
2824
- adj = "adjoint " if adjoint else ""
2825
- raise RuntimeError(
2826
- f"Error launching kernel '{kernel.key}', {adj}argument '{arg_name}' expects an array of type {type(arg_type)}, but passed value has type {type(a)}."
2827
- )
2828
-
2829
- # check subtype
2830
- if not warp.types.types_equal(a.dtype, arg_type.dtype):
2831
- adj = "adjoint " if adjoint else ""
2832
- raise RuntimeError(
2833
- f"Error launching kernel '{kernel.key}', {adj}argument '{arg_name}' expects an array with dtype={arg_type.dtype} but passed array has dtype={a.dtype}."
2834
- )
2835
-
2836
- # check dimensions
2837
- if a.ndim != arg_type.ndim:
2838
- adj = "adjoint " if adjoint else ""
2839
- raise RuntimeError(
2840
- f"Error launching kernel '{kernel.key}', {adj}argument '{arg_name}' expects an array with {arg_type.ndim} dimension(s) but the passed array has {a.ndim} dimension(s)."
2841
- )
2842
-
2843
- # check device
2844
- # if a.device != device and not device.can_access(a.device):
2845
- if a.device != device:
2846
- raise RuntimeError(
2847
- f"Error launching kernel '{kernel.key}', trying to launch on device='{device}', but input array for argument '{arg_name}' is on device={a.device}."
2848
- )
2849
-
2850
- params.append(a.__ctype__())
2851
-
2852
- elif isinstance(arg_type, warp.codegen.Struct):
2853
- assert a is not None
2854
- params.append(a.__ctype__())
2855
-
2856
- # try to convert to a value type (vec3, mat33, etc)
2857
- elif issubclass(arg_type, ctypes.Array):
2858
- if warp.types.types_equal(type(a), arg_type):
2859
- params.append(a)
2860
- else:
2861
- # try constructing the required value from the argument (handles tuple / list, Gf.Vec3 case)
2862
- try:
2863
- params.append(arg_type(a))
2864
- except:
2865
- raise ValueError(f"Failed to convert argument for param {arg_name} to {type_str(arg_type)}")
2866
-
2867
- elif isinstance(a, bool):
2868
- params.append(ctypes.c_bool(a))
2869
-
2870
- elif isinstance(a, arg_type):
2871
- try:
2872
- # try to pack as a scalar type
2873
- params.append(arg_type._type_(a.value))
2874
- except:
2875
- raise RuntimeError(
2876
- f"Error launching kernel, unable to pack kernel parameter type {type(a)} for param {arg_name}, expected {arg_type}"
2877
- )
2878
-
2879
- else:
2880
- try:
2881
- # try to pack as a scalar type
2882
- params.append(arg_type._type_(a))
2883
- except Exception as e:
2884
- print(e)
2885
- raise RuntimeError(
2886
- f"Error launching kernel, unable to pack kernel parameter type {type(a)} for param {arg_name}, expected {arg_type}"
2887
- )
3571
+ params.append(pack_arg(kernel, arg_type, arg_name, a, device, adjoint))
2888
3572
 
2889
3573
  fwd_args = inputs + outputs
2890
3574
  adj_args = adj_inputs + adj_outputs
@@ -2926,7 +3610,13 @@ def launch(
2926
3610
  f"Failed to find forward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
2927
3611
  )
2928
3612
 
2929
- hooks.forward(*params)
3613
+ if record_cmd:
3614
+ launch = Launch(
3615
+ kernel=kernel, hooks=hooks, params=params, params_addr=None, bounds=bounds, device=device
3616
+ )
3617
+ return launch
3618
+ else:
3619
+ hooks.forward(*params)
2930
3620
 
2931
3621
  else:
2932
3622
  kernel_args = [ctypes.c_void_p(ctypes.addressof(x)) for x in params]
@@ -2939,7 +3629,9 @@ def launch(
2939
3629
  f"Failed to find backward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
2940
3630
  )
2941
3631
 
2942
- runtime.core.cuda_launch_kernel(device.context, hooks.backward, bounds.size, kernel_params)
3632
+ runtime.core.cuda_launch_kernel(
3633
+ device.context, hooks.backward, bounds.size, max_blocks, kernel_params
3634
+ )
2943
3635
 
2944
3636
  else:
2945
3637
  if hooks.forward is None:
@@ -2947,7 +3639,22 @@ def launch(
2947
3639
  f"Failed to find forward kernel '{kernel.key}' from module '{kernel.module.name}' for device '{device}'"
2948
3640
  )
2949
3641
 
2950
- runtime.core.cuda_launch_kernel(device.context, hooks.forward, bounds.size, kernel_params)
3642
+ if record_cmd:
3643
+ launch = Launch(
3644
+ kernel=kernel,
3645
+ hooks=hooks,
3646
+ params=params,
3647
+ params_addr=kernel_params,
3648
+ bounds=bounds,
3649
+ device=device,
3650
+ )
3651
+ return launch
3652
+
3653
+ else:
3654
+ # launch
3655
+ runtime.core.cuda_launch_kernel(
3656
+ device.context, hooks.forward, bounds.size, max_blocks, kernel_params
3657
+ )
2951
3658
 
2952
3659
  try:
2953
3660
  runtime.verify_cuda_device(device)
@@ -2957,7 +3664,7 @@ def launch(
2957
3664
 
2958
3665
  # record on tape if one is active
2959
3666
  if runtime.tape and record_tape:
2960
- runtime.tape.record_launch(kernel, dim, inputs, outputs, device)
3667
+ runtime.tape.record_launch(kernel, dim, max_blocks, inputs, outputs, device)
2961
3668
 
2962
3669
 
2963
3670
  def synchronize():
@@ -2967,7 +3674,7 @@ def synchronize():
2967
3674
  or memory copies have completed.
2968
3675
  """
2969
3676
 
2970
- if is_cuda_available():
3677
+ if is_cuda_driver_initialized():
2971
3678
  # save the original context to avoid side effects
2972
3679
  saved_context = runtime.core.cuda_context_get_current()
2973
3680
 
@@ -3017,7 +3724,7 @@ def synchronize_stream(stream_or_device=None):
3017
3724
  runtime.core.cuda_stream_synchronize(stream.device.context, stream.cuda_stream)
3018
3725
 
3019
3726
 
3020
- def force_load(device: Union[Device, str] = None, modules: List[Module] = None):
3727
+ def force_load(device: Union[Device, str, List[Device], List[str]] = None, modules: List[Module] = None):
3021
3728
  """Force user-defined kernels to be compiled and loaded
3022
3729
 
3023
3730
  Args:
@@ -3025,12 +3732,14 @@ def force_load(device: Union[Device, str] = None, modules: List[Module] = None):
3025
3732
  modules: List of modules to load. If None, load all imported modules.
3026
3733
  """
3027
3734
 
3028
- if is_cuda_available():
3735
+ if is_cuda_driver_initialized():
3029
3736
  # save original context to avoid side effects
3030
3737
  saved_context = runtime.core.cuda_context_get_current()
3031
3738
 
3032
3739
  if device is None:
3033
3740
  devices = get_devices()
3741
+ elif isinstance(device, list):
3742
+ devices = [get_device(device_item) for device_item in device]
3034
3743
  else:
3035
3744
  devices = [get_device(device)]
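``force_load()`` now also accepts a list of devices. A short usage sketch (assumes a CUDA device is present):

    import warp as wp

    wp.init()
    # compile and load all imported modules for both the CPU and the first CUDA device
    wp.force_load(["cpu", "cuda:0"])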
3036
3745
 
@@ -3122,7 +3831,7 @@ def get_module_options(module: Optional[Any] = None) -> Dict[str, Any]:
3122
3831
  return get_module(m.__name__).options
3123
3832
 
3124
3833
 
3125
- def capture_begin(device: Devicelike = None, stream=None, force_module_load=True):
3834
+ def capture_begin(device: Devicelike = None, stream=None, force_module_load=None):
3126
3835
  """Begin capture of a CUDA graph
3127
3836
 
3128
3837
  Captures all subsequent kernel launches and memory operations on CUDA devices.
@@ -3136,7 +3845,10 @@ def capture_begin(device: Devicelike = None, stream=None, force_module_load=True
3136
3845
 
3137
3846
  """
3138
3847
 
3139
- if warp.config.verify_cuda == True:
3848
+ if force_module_load is None:
3849
+ force_module_load = warp.config.graph_capture_module_load_default
3850
+
3851
+ if warp.config.verify_cuda:
3140
3852
  raise RuntimeError("Cannot use CUDA error verification during graph capture")
3141
3853
 
3142
3854
  if stream is not None:
@@ -3151,6 +3863,9 @@ def capture_begin(device: Devicelike = None, stream=None, force_module_load=True
3151
3863
 
3152
3864
  device.is_capturing = True
3153
3865
 
3866
+ # disable garbage collection to avoid older allocations getting collected during graph capture
3867
+ gc.disable()
3868
+
3154
3869
  with warp.ScopedStream(stream):
3155
3870
  runtime.core.cuda_graph_begin_capture(device.context)
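A usage sketch of graph capture with these changes: modules are loaded according to ``warp.config.graph_capture_module_load_default`` unless ``force_module_load`` is passed, garbage collection is paused while capturing, and allocations are disallowed on a capturing device, so arrays should be created up front (requires a CUDA device; the ``inc`` kernel is a made-up example):

    import warp as wp

    wp.init()

    @wp.kernel
    def inc(a: wp.array(dtype=float)):
        i = wp.tid()
        a[i] = a[i] + 1.0

    # allocate before capture: alloc/free on a capturing device raises RuntimeError
    a = wp.zeros(1024, dtype=float, device="cuda:0")

    wp.capture_begin(device="cuda:0")
    wp.launch(inc, dim=1024, inputs=[a], device="cuda:0")
    graph = wp.capture_end(device="cuda:0")
    wp.capture_launch(graph)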
3156
3871
 
@@ -3174,6 +3889,9 @@ def capture_end(device: Devicelike = None, stream=None) -> Graph:
3174
3889
 
3175
3890
  device.is_capturing = False
3176
3891
 
3892
+ # re-enable GC
3893
+ gc.enable()
3894
+
3177
3895
  if graph is None:
3178
3896
  raise RuntimeError(
3179
3897
  "Error occurred during CUDA graph capture. This could be due to an unintended allocation or CPU/GPU synchronization event."
@@ -3226,7 +3944,14 @@ def copy(
3226
3944
  if count == 0:
3227
3945
  return
3228
3946
 
3229
- has_grad = hasattr(src, "grad_ptr") and hasattr(dest, "grad_ptr") and src.grad_ptr and dest.grad_ptr
3947
+ # copying non-contiguous arrays requires that they are on the same device
3948
+ if not (src.is_contiguous and dest.is_contiguous) and src.device != dest.device:
3949
+ if dest.is_contiguous:
3950
+ # make a contiguous copy of the source array
3951
+ src = src.contiguous()
3952
+ else:
3953
+ # make a copy of the source array on the destination device
3954
+ src = src.to(dest.device)
3230
3955
 
3231
3956
  if src.is_contiguous and dest.is_contiguous:
3232
3957
  bytes_to_copy = count * warp.types.type_size_in_bytes(src.dtype)
@@ -3240,10 +3965,6 @@ def copy(
3240
3965
  src_ptr = src.ptr + src_offset_in_bytes
3241
3966
  dst_ptr = dest.ptr + dst_offset_in_bytes
3242
3967
 
3243
- if has_grad:
3244
- src_grad_ptr = src.grad_ptr + src_offset_in_bytes
3245
- dst_grad_ptr = dest.grad_ptr + dst_offset_in_bytes
3246
-
3247
3968
  if src_offset_in_bytes + bytes_to_copy > src_size_in_bytes:
3248
3969
  raise RuntimeError(
3249
3970
  f"Trying to copy source buffer with size ({bytes_to_copy}) from offset ({src_offset_in_bytes}) is larger than source size ({src_size_in_bytes})"
@@ -3256,8 +3977,6 @@ def copy(

         if src.device.is_cpu and dest.device.is_cpu:
             runtime.core.memcpy_h2h(dst_ptr, src_ptr, bytes_to_copy)
-            if has_grad:
-                runtime.core.memcpy_h2h(dst_grad_ptr, src_grad_ptr, bytes_to_copy)
         else:
             # figure out the CUDA context/stream for the copy
             if stream is not None:
@@ -3270,32 +3989,19 @@ def copy(
             with warp.ScopedStream(stream):
                 if src.device.is_cpu and dest.device.is_cuda:
                     runtime.core.memcpy_h2d(copy_device.context, dst_ptr, src_ptr, bytes_to_copy)
-                    if has_grad:
-                        runtime.core.memcpy_h2d(copy_device.context, dst_grad_ptr, src_grad_ptr, bytes_to_copy)
                 elif src.device.is_cuda and dest.device.is_cpu:
                     runtime.core.memcpy_d2h(copy_device.context, dst_ptr, src_ptr, bytes_to_copy)
-                    if has_grad:
-                        runtime.core.memcpy_d2h(copy_device.context, dst_grad_ptr, src_grad_ptr, bytes_to_copy)
                 elif src.device.is_cuda and dest.device.is_cuda:
                     if src.device == dest.device:
                         runtime.core.memcpy_d2d(copy_device.context, dst_ptr, src_ptr, bytes_to_copy)
-                        if has_grad:
-                            runtime.core.memcpy_d2d(copy_device.context, dst_grad_ptr, src_grad_ptr, bytes_to_copy)
                     else:
                         runtime.core.memcpy_peer(copy_device.context, dst_ptr, src_ptr, bytes_to_copy)
-                        if has_grad:
-                            runtime.core.memcpy_peer(copy_device.context, dst_grad_ptr, src_grad_ptr, bytes_to_copy)
                 else:
                     raise RuntimeError("Unexpected source and destination combination")

     else:
         # handle non-contiguous and indexed arrays

-        if src.device != dest.device:
-            raise RuntimeError(
-                f"Copies between non-contiguous arrays must be on the same device, got {dest.device} and {src.device}"
-            )
-
         if src.shape != dest.shape:
             raise RuntimeError("Incompatible array shapes")

@@ -3305,18 +4011,22 @@ def copy(
         if src_elem_size != dst_elem_size:
             raise RuntimeError("Incompatible array data types")

-        def array_type(a):
-            if isinstance(a, warp.types.array):
-                return warp.types.ARRAY_TYPE_REGULAR
-            elif isinstance(a, warp.types.indexedarray):
-                return warp.types.ARRAY_TYPE_INDEXED
+        # can't copy to/from fabric arrays of arrays, because they are jagged arrays of arbitrary lengths
+        # TODO?
+        if (
+            isinstance(src, (warp.fabricarray, warp.indexedfabricarray))
+            and src.ndim > 1
+            or isinstance(dest, (warp.fabricarray, warp.indexedfabricarray))
+            and dest.ndim > 1
+        ):
+            raise RuntimeError("Copying to/from Fabric arrays of arrays is not supported")

         src_desc = src.__ctype__()
         dst_desc = dest.__ctype__()
         src_ptr = ctypes.pointer(src_desc)
         dst_ptr = ctypes.pointer(dst_desc)
-        src_type = array_type(src)
-        dst_type = array_type(dest)
+        src_type = warp.types.array_type_id(src)
+        dst_type = warp.types.array_type_id(dest)

         if src.device.is_cuda:
             with warp.ScopedStream(stream):
@@ -3324,6 +4034,10 @@ def copy(
         else:
             runtime.core.array_copy_host(dst_ptr, src_ptr, dst_type, src_type, src_elem_size)

+    # copy gradient, if needed
+    if hasattr(src, "grad") and src.grad is not None and hasattr(dest, "grad") and dest.grad is not None:
+        copy(dest.grad, src.grad, stream=stream)
+

 def type_str(t):
     if t is None:
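Taken together, the copy() changes above drop the old grad_ptr plumbing in favour of a recursive gradient copy, stage non-contiguous sources through contiguous() or a device transfer instead of rejecting cross-device copies, and reject Fabric arrays of arrays. A short sketch of the user-visible effect (sizes and devices are illustrative):

import warp as wp

wp.init()

# arrays with gradients living on different devices
src = wp.zeros(1024, dtype=float, device="cuda:0", requires_grad=True)
dest = wp.zeros(1024, dtype=float, device="cpu", requires_grad=True)

# copies the data and, because both sides expose a non-None .grad array,
# recursively copies dest.grad <- src.grad as well
wp.copy(dest, src)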
@@ -3342,6 +4056,10 @@ def type_str(t):
         return f"Array[{type_str(t.dtype)}]"
     elif isinstance(t, warp.indexedarray):
         return f"IndexedArray[{type_str(t.dtype)}]"
+    elif isinstance(t, warp.fabricarray):
+        return f"FabricArray[{type_str(t.dtype)}]"
+    elif isinstance(t, warp.indexedfabricarray):
+        return f"IndexedFabricArray[{type_str(t.dtype)}]"
     elif hasattr(t, "_wp_generic_type_str_"):
         generic_type = t._wp_generic_type_str_

@@ -3368,7 +4086,7 @@ def type_str(t):
     return t.__name__


-def print_function(f, file, noentry=False):
+def print_function(f, file, noentry=False):  # pragma: no cover
     """Writes a function definition to a file for use in reST documentation

     Args:
@@ -3392,7 +4110,7 @@ def print_function(f, file, noentry=False):
         # todo: construct a default value for each of the functions args
         # so we can generate the return type for overloaded functions
         return_type = " -> " + type_str(f.value_func(None, None, None))
-    except:
+    except Exception:
         pass

     print(f".. function:: {f.key}({args}){return_type}", file=file)
@@ -3413,7 +4131,7 @@ def print_function(f, file, noentry=False):
     return True


-def print_builtins(file):
+def export_functions_rst(file):  # pragma: no cover
     header = (
         "..\n"
         "   Autogenerated File - Do not edit. Run build_docs.py to generate.\n"
@@ -3433,6 +4151,8 @@ def print_builtins(file):

     for t in warp.types.scalar_types:
         print(f".. class:: {t.__name__}", file=file)
+    # Manually add wp.bool since it's inconvenient to add to wp.types.scalar_types:
+    print(f".. class:: {warp.types.bool.__name__}", file=file)

     print("\n\nVector Types", file=file)
     print("------------", file=file)
@@ -3443,14 +4163,22 @@ def print_builtins(file):
     print("\nGeneric Types", file=file)
     print("-------------", file=file)

-    print(f".. class:: Int", file=file)
-    print(f".. class:: Float", file=file)
-    print(f".. class:: Scalar", file=file)
-    print(f".. class:: Vector", file=file)
-    print(f".. class:: Matrix", file=file)
-    print(f".. class:: Quaternion", file=file)
-    print(f".. class:: Transformation", file=file)
-    print(f".. class:: Array", file=file)
+    print(".. class:: Int", file=file)
+    print(".. class:: Float", file=file)
+    print(".. class:: Scalar", file=file)
+    print(".. class:: Vector", file=file)
+    print(".. class:: Matrix", file=file)
+    print(".. class:: Quaternion", file=file)
+    print(".. class:: Transformation", file=file)
+    print(".. class:: Array", file=file)
+
+    print("\nQuery Types", file=file)
+    print("-------------", file=file)
+    print(".. autoclass:: bvh_query_t", file=file)
+    print(".. autoclass:: hash_grid_query_t", file=file)
+    print(".. autoclass:: mesh_query_aabb_t", file=file)
+    print(".. autoclass:: mesh_query_point_t", file=file)
+    print(".. autoclass:: mesh_query_ray_t", file=file)

     # build dictionary of all functions by group
     groups = {}
@@ -3485,7 +4213,7 @@ def print_builtins(file):
     print(".. [1] Note: function gradients not implemented for backpropagation.", file=file)


-def export_stubs(file):
+def export_stubs(file):  # pragma: no cover
     """Generates stub file for auto-complete of builtin functions"""

     import textwrap
@@ -3517,6 +4245,8 @@ def export_stubs(file):
     print("Quaternion = Generic[Float]", file=file)
     print("Transformation = Generic[Float]", file=file)
     print("Array = Generic[DType]", file=file)
+    print("FabricArray = Generic[DType]", file=file)
+    print("IndexedFabricArray = Generic[DType]", file=file)

     # prepend __init__.py
     with open(os.path.join(os.path.dirname(file.name), "__init__.py")) as header_file:
@@ -3533,7 +4263,7 @@ def export_stubs(file):

             return_str = ""

-            if f.export == False or f.hidden == True:  # or f.generic:
+            if not f.export or f.hidden:  # or f.generic:
                 continue

             try:
@@ -3543,29 +4273,42 @@ def export_stubs(file):
                 if return_type:
                     return_str = " -> " + type_str(return_type)

-            except:
+            except Exception:
                 pass

             print("@over", file=file)
             print(f"def {f.key}({args}){return_str}:", file=file)
-            print(f'    """', file=file)
+            print('    """', file=file)
             print(textwrap.indent(text=f.doc, prefix="    "), file=file)
-            print(f'    """', file=file)
-            print(f"    ...\n\n", file=file)
+            print('    """', file=file)
+            print("    ...\n\n", file=file)


-def export_builtins(file):
-    def ctype_str(t):
+def export_builtins(file: io.TextIOBase):  # pragma: no cover
+    def ctype_arg_str(t):
         if isinstance(t, int):
             return "int"
         elif isinstance(t, float):
             return "float"
+        elif t in warp.types.vector_types:
+            return f"{t.__name__}&"
         else:
             return t.__name__

+    def ctype_ret_str(t):
+        if isinstance(t, int):
+            return "int"
+        elif isinstance(t, float):
+            return "float"
+        else:
+            return t.__name__
+
+    file.write("namespace wp {\n\n")
+    file.write('extern "C" {\n\n')
+
     for k, g in builtin_functions.items():
         for f in g.overloads:
-            if f.export == False or f.generic:
+            if not f.export or f.generic:
                 continue

             simple = True
@@ -3579,7 +4322,7 @@ def export_builtins(file):
             if not simple or f.variadic:
                 continue

-            args = ", ".join(f"{ctype_str(v)} {k}" for k, v in f.input_types.items())
+            args = ", ".join(f"{ctype_arg_str(v)} {k}" for k, v in f.input_types.items())
             params = ", ".join(f.input_types.keys())

             return_type = ""
@@ -3587,25 +4330,25 @@ def export_builtins(file):
             try:
                 # todo: construct a default value for each of the functions args
                 # so we can generate the return type for overloaded functions
-                return_type = ctype_str(f.value_func(None, None, None))
-            except:
+                return_type = ctype_ret_str(f.value_func(None, None, None))
+            except Exception:
                 continue

             if return_type.startswith("Tuple"):
                 continue

             if args == "":
-                print(
-                    f"WP_API void {f.mangled_name}({return_type}* ret) {{ *ret = wp::{f.key}({params}); }}", file=file
-                )
+                file.write(f"WP_API void {f.mangled_name}({return_type}* ret) {{ *ret = wp::{f.key}({params}); }}\n")
             elif return_type == "None":
-                print(f"WP_API void {f.mangled_name}({args}) {{ wp::{f.key}({params}); }}", file=file)
+                file.write(f"WP_API void {f.mangled_name}({args}) {{ wp::{f.key}({params}); }}\n")
             else:
-                print(
-                    f"WP_API void {f.mangled_name}({args}, {return_type}* ret) {{ *ret = wp::{f.key}({params}); }}",
-                    file=file,
+                file.write(
+                    f"WP_API void {f.mangled_name}({args}, {return_type}* ret) {{ *ret = wp::{f.key}({params}); }}\n"
                 )

+    file.write('\n} // extern "C"\n\n')
+    file.write("} // namespace wp\n")
+

  # initialize global runtime
  runtime = None