PyPI - warp-lang - Versions diffs - 1.9.0__py3-none-manylinux_2_34_aarch64.whl → 1.10.0__py3-none-manylinux_2_34_aarch64.whl - Mend

warp-lang 1.9.0__py3-none-manylinux_2_34_aarch64.whl → 1.10.0__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (350) hide show

warp/__init__.py +301 -287
warp/__init__.pyi +2302 -307
warp/_src/__init__.py +14 -0
warp/_src/autograd.py +1077 -0
warp/_src/build.py +620 -0
warp/_src/build_dll.py +642 -0
warp/{builtins.py → _src/builtins.py} +1546 -224
warp/_src/codegen.py +4361 -0
warp/{config.py → _src/config.py} +178 -169
warp/_src/constants.py +59 -0
warp/_src/context.py +8352 -0
warp/_src/dlpack.py +464 -0
warp/_src/fabric.py +362 -0
warp/_src/fem/__init__.py +14 -0
warp/_src/fem/adaptivity.py +510 -0
warp/_src/fem/cache.py +689 -0
warp/_src/fem/dirichlet.py +190 -0
warp/{fem → _src/fem}/domain.py +42 -30
warp/_src/fem/field/__init__.py +131 -0
warp/_src/fem/field/field.py +703 -0
warp/{fem → _src/fem}/field/nodal_field.py +32 -15
warp/{fem → _src/fem}/field/restriction.py +3 -1
warp/{fem → _src/fem}/field/virtual.py +55 -27
warp/_src/fem/geometry/__init__.py +32 -0
warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +79 -163
warp/_src/fem/geometry/closest_point.py +99 -0
warp/{fem → _src/fem}/geometry/deformed_geometry.py +16 -22
warp/{fem → _src/fem}/geometry/element.py +34 -10
warp/{fem → _src/fem}/geometry/geometry.py +50 -20
warp/{fem → _src/fem}/geometry/grid_2d.py +14 -23
warp/{fem → _src/fem}/geometry/grid_3d.py +14 -23
warp/{fem → _src/fem}/geometry/hexmesh.py +42 -63
warp/{fem → _src/fem}/geometry/nanogrid.py +256 -247
warp/{fem → _src/fem}/geometry/partition.py +123 -63
warp/{fem → _src/fem}/geometry/quadmesh.py +28 -45
warp/{fem → _src/fem}/geometry/tetmesh.py +42 -63
warp/{fem → _src/fem}/geometry/trimesh.py +28 -45
warp/{fem → _src/fem}/integrate.py +166 -158
warp/_src/fem/linalg.py +385 -0
warp/_src/fem/operator.py +398 -0
warp/_src/fem/polynomial.py +231 -0
warp/{fem → _src/fem}/quadrature/pic_quadrature.py +17 -20
warp/{fem → _src/fem}/quadrature/quadrature.py +97 -47
warp/_src/fem/space/__init__.py +248 -0
warp/{fem → _src/fem}/space/basis_function_space.py +22 -11
warp/_src/fem/space/basis_space.py +681 -0
warp/{fem → _src/fem}/space/dof_mapper.py +5 -3
warp/{fem → _src/fem}/space/function_space.py +16 -13
warp/{fem → _src/fem}/space/grid_2d_function_space.py +6 -7
warp/{fem → _src/fem}/space/grid_3d_function_space.py +6 -4
warp/{fem → _src/fem}/space/hexmesh_function_space.py +6 -10
warp/{fem → _src/fem}/space/nanogrid_function_space.py +5 -9
warp/{fem → _src/fem}/space/partition.py +119 -60
warp/{fem → _src/fem}/space/quadmesh_function_space.py +6 -10
warp/{fem → _src/fem}/space/restriction.py +68 -33
warp/_src/fem/space/shape/__init__.py +152 -0
warp/{fem → _src/fem}/space/shape/cube_shape_function.py +11 -9
warp/{fem → _src/fem}/space/shape/shape_function.py +10 -9
warp/{fem → _src/fem}/space/shape/square_shape_function.py +8 -6
warp/{fem → _src/fem}/space/shape/tet_shape_function.py +5 -3
warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +5 -3
warp/{fem → _src/fem}/space/tetmesh_function_space.py +5 -9
warp/_src/fem/space/topology.py +461 -0
warp/{fem → _src/fem}/space/trimesh_function_space.py +5 -9
warp/_src/fem/types.py +114 -0
warp/_src/fem/utils.py +488 -0
warp/_src/jax.py +188 -0
warp/_src/jax_experimental/__init__.py +14 -0
warp/_src/jax_experimental/custom_call.py +389 -0
warp/_src/jax_experimental/ffi.py +1286 -0
warp/_src/jax_experimental/xla_ffi.py +658 -0
warp/_src/marching_cubes.py +710 -0
warp/_src/math.py +416 -0
warp/_src/optim/__init__.py +14 -0
warp/_src/optim/adam.py +165 -0
warp/_src/optim/linear.py +1608 -0
warp/_src/optim/sgd.py +114 -0
warp/_src/paddle.py +408 -0
warp/_src/render/__init__.py +14 -0
warp/_src/render/imgui_manager.py +291 -0
warp/_src/render/render_opengl.py +3638 -0
warp/_src/render/render_usd.py +939 -0
warp/_src/render/utils.py +162 -0
warp/_src/sparse.py +2718 -0
warp/_src/tape.py +1208 -0
warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
warp/_src/torch.py +393 -0
warp/_src/types.py +5888 -0
warp/_src/utils.py +1695 -0
warp/autograd.py +12 -1054
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +8 -588
warp/build_dll.py +6 -471
warp/codegen.py +6 -4246
warp/constants.py +6 -39
warp/context.py +12 -7851
warp/dlpack.py +6 -444
warp/examples/distributed/example_jacobi_mpi.py +4 -5
warp/examples/fem/example_adaptive_grid.py +1 -1
warp/examples/fem/example_apic_fluid.py +1 -1
warp/examples/fem/example_burgers.py +8 -8
warp/examples/fem/example_diffusion.py +1 -1
warp/examples/fem/example_distortion_energy.py +1 -1
warp/examples/fem/example_mixed_elasticity.py +2 -2
warp/examples/fem/example_navier_stokes.py +1 -1
warp/examples/fem/example_nonconforming_contact.py +7 -7
warp/examples/fem/example_stokes.py +1 -1
warp/examples/fem/example_stokes_transfer.py +1 -1
warp/examples/fem/utils.py +2 -2
warp/examples/interop/example_jax_callable.py +1 -1
warp/examples/interop/example_jax_ffi_callback.py +1 -1
warp/examples/interop/example_jax_kernel.py +3 -2
warp/examples/tile/example_tile_mcgp.py +191 -0
warp/fabric.py +6 -337
warp/fem/__init__.py +159 -97
warp/fem/adaptivity.py +7 -489
warp/fem/cache.py +9 -648
warp/fem/dirichlet.py +6 -184
warp/fem/field/__init__.py +8 -109
warp/fem/field/field.py +7 -652
warp/fem/geometry/__init__.py +7 -18
warp/fem/geometry/closest_point.py +11 -77
warp/fem/linalg.py +18 -366
warp/fem/operator.py +11 -369
warp/fem/polynomial.py +9 -209
warp/fem/space/__init__.py +5 -211
warp/fem/space/basis_space.py +6 -662
warp/fem/space/shape/__init__.py +41 -118
warp/fem/space/topology.py +6 -437
warp/fem/types.py +6 -81
warp/fem/utils.py +11 -444
warp/jax.py +8 -165
warp/jax_experimental/__init__.py +14 -1
warp/jax_experimental/custom_call.py +8 -342
warp/jax_experimental/ffi.py +17 -853
warp/jax_experimental/xla_ffi.py +5 -596
warp/marching_cubes.py +5 -689
warp/math.py +16 -393
warp/native/array.h +385 -37
warp/native/builtin.h +316 -39
warp/native/bvh.cpp +43 -9
warp/native/bvh.cu +62 -27
warp/native/bvh.h +310 -309
warp/native/clang/clang.cpp +102 -97
warp/native/coloring.cpp +0 -1
warp/native/crt.h +208 -0
warp/native/exports.h +156 -0
warp/native/hashgrid.cu +2 -0
warp/native/intersect.h +24 -1
warp/native/intersect_tri.h +44 -35
warp/native/mat.h +1456 -276
warp/native/mesh.cpp +4 -4
warp/native/mesh.cu +4 -2
warp/native/mesh.h +176 -61
warp/native/quat.h +0 -52
warp/native/scan.cu +2 -0
warp/native/sort.cu +22 -13
warp/native/sort.h +2 -0
warp/native/sparse.cu +7 -3
warp/native/spatial.h +12 -0
warp/native/tile.h +837 -70
warp/native/tile_radix_sort.h +3 -3
warp/native/tile_reduce.h +394 -46
warp/native/tile_scan.h +4 -4
warp/native/vec.h +469 -53
warp/native/version.h +23 -0
warp/native/volume.cpp +1 -1
warp/native/volume.cu +1 -0
warp/native/volume.h +1 -1
warp/native/volume_builder.cu +2 -0
warp/native/warp.cpp +60 -32
warp/native/warp.cu +581 -280
warp/native/warp.h +14 -11
warp/optim/__init__.py +6 -3
warp/optim/adam.py +6 -145
warp/optim/linear.py +14 -1585
warp/optim/sgd.py +6 -94
warp/paddle.py +6 -388
warp/render/__init__.py +8 -4
warp/render/imgui_manager.py +7 -267
warp/render/render_opengl.py +6 -3616
warp/render/render_usd.py +6 -918
warp/render/utils.py +6 -142
warp/sparse.py +37 -2563
warp/tape.py +6 -1188
warp/tests/__main__.py +1 -1
warp/tests/cuda/test_async.py +4 -4
warp/tests/cuda/test_conditional_captures.py +1 -1
warp/tests/cuda/test_multigpu.py +1 -1
warp/tests/cuda/test_streams.py +58 -1
warp/tests/geometry/test_bvh.py +157 -22
warp/tests/geometry/test_hash_grid.py +38 -0
warp/tests/geometry/test_marching_cubes.py +0 -1
warp/tests/geometry/test_mesh.py +5 -3
warp/tests/geometry/test_mesh_query_aabb.py +5 -12
warp/tests/geometry/test_mesh_query_point.py +5 -2
warp/tests/geometry/test_mesh_query_ray.py +15 -3
warp/tests/geometry/test_volume_write.py +5 -5
warp/tests/interop/test_dlpack.py +18 -17
warp/tests/interop/test_jax.py +1382 -79
warp/tests/interop/test_paddle.py +1 -1
warp/tests/test_adam.py +0 -1
warp/tests/test_arithmetic.py +9 -9
warp/tests/test_array.py +580 -100
warp/tests/test_array_reduce.py +3 -3
warp/tests/test_atomic.py +12 -8
warp/tests/test_atomic_bitwise.py +209 -0
warp/tests/test_atomic_cas.py +4 -4
warp/tests/test_bool.py +2 -2
warp/tests/test_builtins_resolution.py +5 -571
warp/tests/test_codegen.py +34 -15
warp/tests/test_conditional.py +1 -1
warp/tests/test_context.py +6 -6
warp/tests/test_copy.py +242 -161
warp/tests/test_ctypes.py +3 -3
warp/tests/test_devices.py +24 -2
warp/tests/test_examples.py +16 -84
warp/tests/test_fabricarray.py +35 -35
warp/tests/test_fast_math.py +0 -2
warp/tests/test_fem.py +60 -14
warp/tests/test_fixedarray.py +3 -3
warp/tests/test_func.py +8 -5
warp/tests/test_generics.py +1 -1
warp/tests/test_indexedarray.py +24 -24
warp/tests/test_intersect.py +39 -9
warp/tests/test_large.py +1 -1
warp/tests/test_lerp.py +3 -1
warp/tests/test_linear_solvers.py +1 -1
warp/tests/test_map.py +49 -4
warp/tests/test_mat.py +52 -62
warp/tests/test_mat_constructors.py +4 -5
warp/tests/test_mat_lite.py +1 -1
warp/tests/test_mat_scalar_ops.py +121 -121
warp/tests/test_math.py +34 -0
warp/tests/test_module_aot.py +4 -4
warp/tests/test_modules_lite.py +28 -2
warp/tests/test_print.py +11 -11
warp/tests/test_quat.py +93 -58
warp/tests/test_runlength_encode.py +1 -1
warp/tests/test_scalar_ops.py +38 -10
warp/tests/test_smoothstep.py +1 -1
warp/tests/test_sparse.py +126 -15
warp/tests/test_spatial.py +105 -87
warp/tests/test_special_values.py +6 -6
warp/tests/test_static.py +7 -7
warp/tests/test_struct.py +13 -2
warp/tests/test_triangle_closest_point.py +48 -1
warp/tests/test_tuple.py +96 -0
warp/tests/test_types.py +82 -9
warp/tests/test_utils.py +52 -52
warp/tests/test_vec.py +29 -29
warp/tests/test_vec_constructors.py +5 -5
warp/tests/test_vec_scalar_ops.py +97 -97
warp/tests/test_version.py +75 -0
warp/tests/tile/test_tile.py +239 -0
warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
warp/tests/tile/test_tile_cholesky.py +7 -4
warp/tests/tile/test_tile_load.py +26 -2
warp/tests/tile/test_tile_mathdx.py +3 -3
warp/tests/tile/test_tile_matmul.py +1 -1
warp/tests/tile/test_tile_mlp.py +2 -4
warp/tests/tile/test_tile_reduce.py +214 -13
warp/tests/unittest_suites.py +6 -14
warp/tests/unittest_utils.py +10 -9
warp/tests/walkthrough_debug.py +3 -1
warp/torch.py +6 -373
warp/types.py +29 -5750
warp/utils.py +10 -1659
{warp_lang-1.9.0.dist-info → warp_lang-1.10.0.dist-info}/METADATA +47 -103
warp_lang-1.10.0.dist-info/RECORD +468 -0
warp_lang-1.10.0.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
warp_lang-1.10.0.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
warp_lang-1.10.0.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
warp_lang-1.10.0.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
warp_lang-1.10.0.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
warp_lang-1.10.0.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
warp_lang-1.10.0.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
warp_lang-1.10.0.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
warp_lang-1.10.0.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
warp_lang-1.10.0.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
warp_lang-1.10.0.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
warp_lang-1.10.0.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
warp_lang-1.10.0.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
warp_lang-1.10.0.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
warp_lang-1.10.0.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
warp/examples/assets/cartpole.urdf +0 -110
warp/examples/assets/crazyflie.usd +0 -0
warp/examples/assets/nv_ant.xml +0 -92
warp/examples/assets/nv_humanoid.xml +0 -183
warp/examples/assets/quadruped.urdf +0 -268
warp/examples/optim/example_bounce.py +0 -266
warp/examples/optim/example_cloth_throw.py +0 -228
warp/examples/optim/example_drone.py +0 -870
warp/examples/optim/example_inverse_kinematics.py +0 -182
warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
warp/examples/optim/example_softbody_properties.py +0 -400
warp/examples/optim/example_spring_cage.py +0 -245
warp/examples/optim/example_trajectory.py +0 -227
warp/examples/sim/example_cartpole.py +0 -143
warp/examples/sim/example_cloth.py +0 -225
warp/examples/sim/example_cloth_self_contact.py +0 -316
warp/examples/sim/example_granular.py +0 -130
warp/examples/sim/example_granular_collision_sdf.py +0 -202
warp/examples/sim/example_jacobian_ik.py +0 -244
warp/examples/sim/example_particle_chain.py +0 -124
warp/examples/sim/example_quadruped.py +0 -203
warp/examples/sim/example_rigid_chain.py +0 -203
warp/examples/sim/example_rigid_contact.py +0 -195
warp/examples/sim/example_rigid_force.py +0 -133
warp/examples/sim/example_rigid_gyroscopic.py +0 -115
warp/examples/sim/example_rigid_soft_contact.py +0 -140
warp/examples/sim/example_soft_body.py +0 -196
warp/examples/tile/example_tile_walker.py +0 -327
warp/sim/__init__.py +0 -74
warp/sim/articulation.py +0 -793
warp/sim/collide.py +0 -2570
warp/sim/graph_coloring.py +0 -307
warp/sim/import_mjcf.py +0 -791
warp/sim/import_snu.py +0 -227
warp/sim/import_urdf.py +0 -579
warp/sim/import_usd.py +0 -898
warp/sim/inertia.py +0 -357
warp/sim/integrator.py +0 -245
warp/sim/integrator_euler.py +0 -2000
warp/sim/integrator_featherstone.py +0 -2101
warp/sim/integrator_vbd.py +0 -2487
warp/sim/integrator_xpbd.py +0 -3295
warp/sim/model.py +0 -4821
warp/sim/particles.py +0 -121
warp/sim/render.py +0 -431
warp/sim/utils.py +0 -431
warp/tests/sim/disabled_kinematics.py +0 -244
warp/tests/sim/test_cloth.py +0 -863
warp/tests/sim/test_collision.py +0 -743
warp/tests/sim/test_coloring.py +0 -347
warp/tests/sim/test_inertia.py +0 -161
warp/tests/sim/test_model.py +0 -226
warp/tests/sim/test_sim_grad.py +0 -287
warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
warp/tests/sim/test_sim_kinematics.py +0 -98
warp/thirdparty/__init__.py +0 -0
warp_lang-1.9.0.dist-info/RECORD +0 -456
/warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
/warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
/warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
/warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
{warp_lang-1.9.0.dist-info → warp_lang-1.10.0.dist-info}/WHEEL +0 -0
{warp_lang-1.9.0.dist-info → warp_lang-1.10.0.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.9.0.dist-info → warp_lang-1.10.0.dist-info}/top_level.txt +0 -0

warp/native/warp.cu CHANGED Viewed

@@ -19,6 +19,7 @@
 #include "scan.h"
 #include "cuda_util.h"
 #include "error.h"
+#include "sort.h"
 #include <cstdlib>
 #include <fstream>
@@ -37,6 +38,7 @@
 #include <iterator>
 #include <list>
 #include <map>
+#include <mutex>
 #include <string>
 #include <unordered_map>
 #include <unordered_set>
@@ -175,11 +177,20 @@ struct ContextInfo
     CUmodule conditional_module = NULL;
 };
+// Information used for freeing allocations.
+struct FreeInfo
+{
+    void* context = NULL;
+    void* ptr = NULL;
+    bool is_async = false;
+};
 struct CaptureInfo
 {
     CUstream stream = NULL;  // the main stream where capture begins and ends
     uint64_t id = 0;  // unique capture id from CUDA
     bool external = false;  // whether this is an external capture
+    std::vector<FreeInfo> tmp_allocs;  // temporary allocations owned by the graph (e.g., staged array fill values)
 };
 struct StreamInfo
@@ -188,9 +199,13 @@ struct StreamInfo
     CaptureInfo* capture = NULL;  // capture info (only if started on this stream)
 };
-struct GraphInfo
+// Extra resources tied to a graph, freed after the graph is released by CUDA.
+// Used with the on_graph_destroy() callback.
+struct GraphDestroyCallbackInfo
 {
-    std::vector<void*> unfreed_allocs;
+    void* context = NULL;  // graph CUDA context
+    std::vector<void*> unfreed_allocs;  // graph allocations not freed by the graph
+    std::vector<FreeInfo> tmp_allocs;  // temporary allocations owned by the graph (e.g., staged array fill values)
 };
 // Information for graph allocations that are not freed by the graph.
@@ -206,19 +221,19 @@ struct GraphAllocInfo
     bool graph_destroyed = false;  // whether graph instance was destroyed
 };
-// Information used when deferring deallocations.
-struct FreeInfo
+// Information used when deferring module unloading.
+struct ModuleInfo
 {
     void* context = NULL;
-    void* ptr = NULL;
-    bool is_async = false;
+    void* module = NULL;
 };
-// Information used when deferring module unloading.
-struct ModuleInfo
+// Information used when deferring graph destruction.
+struct GraphDestroyInfo
 {
     void* context = NULL;
-    void* module = NULL;
+    void* graph = NULL;
+    void* graph_exec = NULL;
 };
 static std::unordered_map<CUfunction, std::string> g_kernel_names;
@@ -252,6 +267,15 @@ static std::vector<FreeInfo> g_deferred_free_list;
 // Call unload_deferred_modules() to release.
 static std::vector<ModuleInfo> g_deferred_module_list;
+// Graphs that cannot be destroyed immediately get queued here.
+// Call destroy_deferred_graphs() to release.
+static std::vector<GraphDestroyInfo> g_deferred_graph_list;
+// Data from on_graph_destroy() callbacks that run on a different thread.
+static std::vector<GraphDestroyCallbackInfo*> g_deferred_graph_destroy_list;
+static std::mutex g_graph_destroy_mutex;
 void wp_cuda_set_context_restore_policy(bool always_restore)
 {
     ContextGuard::always_restore = always_restore;
@@ -337,7 +361,7 @@ int cuda_init()
 }
-static inline CUcontext get_current_context()
+CUcontext get_current_context()
 {
     CUcontext ctx;
     if (check_cu(cuCtxGetCurrent_f(&ctx)))
@@ -407,6 +431,114 @@ static inline StreamInfo* get_stream_info(CUstream stream)
         return NULL;
 }
+static inline CaptureInfo* get_capture_info(CUstream stream)
+{
+    if (!g_captures.empty() && wp_cuda_stream_is_capturing(stream))
+    {
+        uint64_t capture_id = get_capture_id(stream);
+        auto capture_iter = g_captures.find(capture_id);
+        if (capture_iter != g_captures.end())
+            return capture_iter->second;
+    }
+    return NULL;
+}
+// helper function to copy a value to device memory in a graph-friendly way
+static bool capturable_tmp_alloc(void* context, const void* data, size_t size, void** devptr_ret, bool* free_devptr_ret)
+{
+    ContextGuard guard(context);
+    CUstream stream = get_current_stream();
+    CaptureInfo* capture_info = get_capture_info(stream);
+    int device_ordinal = wp_cuda_context_get_device_ordinal(context);
+    void* devptr = NULL;
+    bool free_devptr = true;
+    if (capture_info)
+    {
+        // ongoing graph capture - need to stage the fill value so that it persists with the graph
+        if (CUDA_VERSION >= 12040 && wp_cuda_driver_version() >= 12040)
+        {
+            // pause the capture so that the alloc/memcpy won't be captured
+            void* graph = NULL;
+            if (!wp_cuda_graph_pause_capture(WP_CURRENT_CONTEXT, stream, &graph))
+                return false;
+            // copy value to device memory
+            devptr = wp_alloc_device(WP_CURRENT_CONTEXT, size);
+            if (!devptr)
+            {
+                fprintf(stderr, "Warp error: Failed to allocate %llu bytes on device 'cuda:%d' (in function %s)\n", (unsigned long long)size, device_ordinal, __FUNCTION__);
+                return false;
+            }
+            if (!check_cuda(cudaMemcpyAsync(devptr, data, size, cudaMemcpyHostToDevice, stream)))
+                return false;
+            // graph takes ownership of the value storage
+            FreeInfo free_info;
+            free_info.context = context ? context : get_current_context();
+            free_info.ptr = devptr;
+            free_info.is_async = wp_cuda_device_is_mempool_supported(device_ordinal);
+            // allocation will be freed when graph is destroyed
+            capture_info->tmp_allocs.push_back(free_info);
+            // resume the capture
+            if (!wp_cuda_graph_resume_capture(WP_CURRENT_CONTEXT, stream, graph))
+                return false;
+            free_devptr = false;  // memory is owned by the graph, doesn't need to be freed
+        }
+        else
+        {
+            // older CUDA can't pause/resume the capture, so stage in CPU memory
+            void* hostptr = wp_alloc_host(size);
+            if (!hostptr)
+            {
+                fprintf(stderr, "Warp error: Failed to allocate %llu bytes on device 'cpu' (in function %s)\n", (unsigned long long)size, __FUNCTION__);
+                return false;
+            }
+            memcpy(hostptr, data, size);
+            // the device allocation and h2d copy will be captured in the graph
+            devptr = wp_alloc_device(WP_CURRENT_CONTEXT, size);
+            if (!devptr)
+            {
+                fprintf(stderr, "Warp error: Failed to allocate %llu bytes on device 'cuda:%d' (in function %s)\n", (unsigned long long)size, device_ordinal, __FUNCTION__);
+                return false;
+            }
+            if (!check_cuda(cudaMemcpyAsync(devptr, hostptr, size, cudaMemcpyHostToDevice, stream)))
+                return false;
+            // graph takes ownership of the value storage
+            FreeInfo free_info;
+            free_info.context = NULL;
+            free_info.ptr = hostptr;
+            free_info.is_async = false;
+            // allocation will be freed when graph is destroyed
+            capture_info->tmp_allocs.push_back(free_info);
+        }
+    }
+    else
+    {
+        // not capturing, copy the value to device memory
+        devptr = wp_alloc_device(WP_CURRENT_CONTEXT, size);
+        if (!devptr)
+        {
+            fprintf(stderr, "Warp error: Failed to allocate %llu bytes on device 'cuda:%d' (in function %s)\n", (unsigned long long)size, device_ordinal, __FUNCTION__);
+            return false;
+        }
+        if (!check_cuda(cudaMemcpyAsync(devptr, data, size, cudaMemcpyHostToDevice, stream)))
+            return false;
+    }
+    *devptr_ret = devptr;
+    *free_devptr_ret = free_devptr;
+    return true;
+}
 static void deferred_free(void* ptr, void* context, bool is_async)
 {
     FreeInfo free_info;
@@ -494,34 +626,124 @@ static int unload_deferred_modules(void* context = NULL)
     return num_unloaded_modules;
 }
-static void CUDART_CB on_graph_destroy(void* user_data)
+static int destroy_deferred_graphs(void* context = NULL)
 {
-    if (!user_data)
-        return;
+    if (g_deferred_graph_list.empty() || !g_captures.empty())
+        return 0;
+    int num_destroyed_graphs = 0;
+    for (auto it = g_deferred_graph_list.begin(); it != g_deferred_graph_list.end(); /*noop*/)
+    {
+        // destroy the graph if it matches the given context or if the context is unspecified
+        const GraphDestroyInfo& graph_info = *it;
+        if (graph_info.context == context || !context)
+        {
+            if (graph_info.graph)
+            {
+                check_cuda(cudaGraphDestroy((cudaGraph_t)graph_info.graph));
+            }
+            if (graph_info.graph_exec)
+            {
+                check_cuda(cudaGraphExecDestroy((cudaGraphExec_t)graph_info.graph_exec));
+            }
+            ++num_destroyed_graphs;
+            it = g_deferred_graph_list.erase(it);
+        }
+        else
+        {
+            ++it;
+        }
+    }
+    return num_destroyed_graphs;
+}
+static int process_deferred_graph_destroy_callbacks(void* context = NULL)
+{
+    int num_freed = 0;
-    GraphInfo* graph_info = static_cast<GraphInfo*>(user_data);
+    std::lock_guard<std::mutex> lock(g_graph_destroy_mutex);
-    for (void* ptr : graph_info->unfreed_allocs)
+    for (auto it = g_deferred_graph_destroy_list.begin(); it != g_deferred_graph_destroy_list.end(); /*noop*/)
     {
-        auto alloc_iter = g_graph_allocs.find(ptr);
-        if (alloc_iter != g_graph_allocs.end())
+        GraphDestroyCallbackInfo* graph_info = *it;
+        if (graph_info->context == context || !context)
         {
-            GraphAllocInfo& alloc_info = alloc_iter->second;
-            if (alloc_info.ref_exists)
+            // handle unfreed graph allocations (may have outstanding user references)
+            for (void* ptr : graph_info->unfreed_allocs)
             {
-                // unreference from graph so the pointer will be deallocated when the user reference goes away
-                alloc_info.graph_destroyed = true;
+                auto alloc_iter = g_graph_allocs.find(ptr);
+                if (alloc_iter != g_graph_allocs.end())
+                {
+                    GraphAllocInfo& alloc_info = alloc_iter->second;
+                    if (alloc_info.ref_exists)
+                    {
+                        // unreference from graph so the pointer will be deallocated when the user reference goes away
+                        alloc_info.graph_destroyed = true;
+                    }
+                    else
+                    {
+                        // the pointer can be freed, no references remain
+                        wp_free_device_async(alloc_info.context, ptr);
+                        g_graph_allocs.erase(alloc_iter);
+                    }
+                }
             }
-            else
+            // handle temporary allocations owned by the graph (no user references)
+            for (const FreeInfo& tmp_info : graph_info->tmp_allocs)
             {
-                // the pointer can be freed, but we can't call CUDA functions in this callback, so defer it
-                deferred_free(ptr, alloc_info.context, true);
-                g_graph_allocs.erase(alloc_iter);
+                if (tmp_info.context)
+                {
+                    // GPU alloc
+                    if (tmp_info.is_async)
+                    {
+                        wp_free_device_async(tmp_info.context, tmp_info.ptr);
+                    }
+                    else
+                    {
+                        wp_free_device_default(tmp_info.context, tmp_info.ptr);
+                    }
+                }
+                else
+                {
+                    // CPU alloc
+                    wp_free_host(tmp_info.ptr);
+                }
             }
+            ++num_freed;
+            delete graph_info;
+            it = g_deferred_graph_destroy_list.erase(it);
+        }
+        else
+        {
+            ++it;
         }
     }
-    delete graph_info;
+    return num_freed;
+}
+static int run_deferred_actions(void* context = NULL)
+{
+    int num_actions = 0;
+    num_actions += free_deferred_allocs(context);
+    num_actions += unload_deferred_modules(context);
+    num_actions += destroy_deferred_graphs(context);
+    num_actions += process_deferred_graph_destroy_callbacks(context);
+    return num_actions;
+}
+// Callback used when a graph is destroyed.
+// NOTE: this runs on an internal CUDA thread and requires synchronization.
+static void CUDART_CB on_graph_destroy(void* user_data)
+{
+    if (user_data)
+    {
+        std::lock_guard<std::mutex> lock(g_graph_destroy_mutex);
+        g_deferred_graph_destroy_list.push_back(static_cast<GraphDestroyCallbackInfo*>(user_data));
+    }
 }
 static inline const char* get_cuda_kernel_name(void* kernel)
@@ -973,30 +1195,36 @@ void wp_memtile_device(void* context, void* dst, const void* src, size_t srcsize
     else
     {
         // generic version
+        void* value_devptr = NULL;  // fill value in device memory
+        bool free_devptr = true;  // whether we need to free the memory
-        // copy value to device memory
-        // TODO: use a persistent stream-local staging buffer to avoid allocs?
-        void* src_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, srcsize);
-        check_cuda(cudaMemcpyAsync(src_devptr, src, srcsize, cudaMemcpyHostToDevice, get_current_stream()));
-        wp_launch_device(WP_CURRENT_CONTEXT, memtile_kernel, n, (dst, src_devptr, srcsize, n));
+        // prepare the fill value in a graph-friendly way
+        if (!capturable_tmp_alloc(WP_CURRENT_CONTEXT, src, srcsize, &value_devptr, &free_devptr))
+        {
+            fprintf(stderr, "Warp fill error: failed to copy value to device memory\n");
+            return;
+        }
-        wp_free_device(WP_CURRENT_CONTEXT, src_devptr);
+        wp_launch_device(WP_CURRENT_CONTEXT, memtile_kernel, n, (dst, value_devptr, srcsize, n));
+        if (free_devptr)
+        {
+            wp_free_device(WP_CURRENT_CONTEXT, value_devptr);
+        }
     }
 }
 static __global__ void array_copy_1d_kernel(void* dst, const void* src,
-                                        int dst_stride, int src_stride,
+                                        size_t dst_stride, size_t src_stride,
                                         const int* dst_indices, const int* src_indices,
-                                        int n, int elem_size)
+                                        size_t n, size_t elem_size)
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t i = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (i < n)
     {
-        int src_idx = src_indices ? src_indices[i] : i;
-        int dst_idx = dst_indices ? dst_indices[i] : i;
+        size_t src_idx = src_indices ? src_indices[i] : i;
+        size_t dst_idx = dst_indices ? dst_indices[i] : i;
         const char* p = (const char*)src + src_idx * src_stride;
         char* q = (char*)dst + dst_idx * dst_stride;
         memcpy(q, p, elem_size);
@@ -1004,20 +1232,20 @@ static __global__ void array_copy_1d_kernel(void* dst, const void* src,
 }
 static __global__ void array_copy_2d_kernel(void* dst, const void* src,
-                                        wp::vec_t<2, int> dst_strides, wp::vec_t<2, int> src_strides,
+                                        wp::vec_t<2, size_t> dst_strides, wp::vec_t<2, size_t> src_strides,
                                         wp::vec_t<2, const int*> dst_indices, wp::vec_t<2, const int*> src_indices,
-                                        wp::vec_t<2, int> shape, int elem_size)
+                                        wp::vec_t<2, size_t> shape, size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int i = tid / n;
-    int j = tid % n;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t i = tid / n;
+    size_t j = tid % n;
     if (i < shape[0] /*&& j < shape[1]*/)
     {
-        int src_idx0 = src_indices[0] ? src_indices[0][i] : i;
-        int dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
-        int src_idx1 = src_indices[1] ? src_indices[1][j] : j;
-        int dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
+        size_t src_idx0 = src_indices[0] ? src_indices[0][i] : i;
+        size_t dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
+        size_t src_idx1 = src_indices[1] ? src_indices[1][j] : j;
+        size_t dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
         const char* p = (const char*)src + src_idx0 * src_strides[0] + src_idx1 * src_strides[1];
         char* q = (char*)dst + dst_idx0 * dst_strides[0] + dst_idx1 * dst_strides[1];
         memcpy(q, p, elem_size);
@@ -1025,24 +1253,24 @@ static __global__ void array_copy_2d_kernel(void* dst, const void* src,
 }
 static __global__ void array_copy_3d_kernel(void* dst, const void* src,
-                                        wp::vec_t<3, int> dst_strides, wp::vec_t<3, int> src_strides,
+                                        wp::vec_t<3, size_t> dst_strides, wp::vec_t<3, size_t> src_strides,
                                         wp::vec_t<3, const int*> dst_indices, wp::vec_t<3, const int*> src_indices,
-                                        wp::vec_t<3, int> shape, int elem_size)
-{
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int o = shape[2];
-    int i = tid / (n * o);
-    int j = tid % (n * o) / o;
-    int k = tid % o;
+                                        wp::vec_t<3, size_t> shape, size_t elem_size)
+{
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t o = shape[2];
+    size_t i = tid / (n * o);
+    size_t j = tid % (n * o) / o;
+    size_t k = tid % o;
     if (i < shape[0] && j < shape[1] /*&& k < shape[2]*/)
     {
-        int src_idx0 = src_indices[0] ? src_indices[0][i] : i;
-        int dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
-        int src_idx1 = src_indices[1] ? src_indices[1][j] : j;
-        int dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
-        int src_idx2 = src_indices[2] ? src_indices[2][k] : k;
-        int dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
+        size_t src_idx0 = src_indices[0] ? src_indices[0][i] : i;
+        size_t dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
+        size_t src_idx1 = src_indices[1] ? src_indices[1][j] : j;
+        size_t dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
+        size_t src_idx2 = src_indices[2] ? src_indices[2][k] : k;
+        size_t dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
         const char* p = (const char*)src + src_idx0 * src_strides[0]
                                          + src_idx1 * src_strides[1]
                                          + src_idx2 * src_strides[2];
@@ -1054,28 +1282,28 @@ static __global__ void array_copy_3d_kernel(void* dst, const void* src,
 }
 static __global__ void array_copy_4d_kernel(void* dst, const void* src,
-                                        wp::vec_t<4, int> dst_strides, wp::vec_t<4, int> src_strides,
+                                        wp::vec_t<4, size_t> dst_strides, wp::vec_t<4, size_t> src_strides,
                                         wp::vec_t<4, const int*> dst_indices, wp::vec_t<4, const int*> src_indices,
-                                        wp::vec_t<4, int> shape, int elem_size)
-{
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int o = shape[2];
-    int p = shape[3];
-    int i = tid / (n * o * p);
-    int j = tid % (n * o * p) / (o * p);
-    int k = tid % (o * p) / p;
-    int l = tid % p;
+                                        wp::vec_t<4, size_t> shape, size_t elem_size)
+{
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t o = shape[2];
+    size_t p = shape[3];
+    size_t i = tid / (n * o * p);
+    size_t j = tid % (n * o * p) / (o * p);
+    size_t k = tid % (o * p) / p;
+    size_t l = tid % p;
     if (i < shape[0] && j < shape[1] && k < shape[2] /*&& l < shape[3]*/)
     {
-        int src_idx0 = src_indices[0] ? src_indices[0][i] : i;
-        int dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
-        int src_idx1 = src_indices[1] ? src_indices[1][j] : j;
-        int dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
-        int src_idx2 = src_indices[2] ? src_indices[2][k] : k;
-        int dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
-        int src_idx3 = src_indices[3] ? src_indices[3][l] : l;
-        int dst_idx3 = dst_indices[3] ? dst_indices[3][l] : l;
+        size_t src_idx0 = src_indices[0] ? src_indices[0][i] : i;
+        size_t dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
+        size_t src_idx1 = src_indices[1] ? src_indices[1][j] : j;
+        size_t dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
+        size_t src_idx2 = src_indices[2] ? src_indices[2][k] : k;
+        size_t dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
+        size_t src_idx3 = src_indices[3] ? src_indices[3][l] : l;
+        size_t dst_idx3 = dst_indices[3] ? dst_indices[3][l] : l;
         const char* p = (const char*)src + src_idx0 * src_strides[0]
                                          + src_idx1 * src_strides[1]
                                          + src_idx2 * src_strides[2]
@@ -1090,14 +1318,14 @@ static __global__ void array_copy_4d_kernel(void* dst, const void* src,
 static __global__ void array_copy_from_fabric_kernel(wp::fabricarray_t<void> src,
-                                                     void* dst_data, int dst_stride, const int* dst_indices,
-                                                     int elem_size)
+                                                     void* dst_data, size_t dst_stride, const int* dst_indices,
+                                                     size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < src.size)
     {
-        int dst_idx = dst_indices ? dst_indices[tid] : tid;
+        size_t dst_idx = dst_indices ? dst_indices[tid] : tid;
         void* dst_ptr = (char*)dst_data + dst_idx * dst_stride;
         const void* src_ptr = fabricarray_element_ptr(src, tid, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
@@ -1105,15 +1333,15 @@ static __global__ void array_copy_from_fabric_kernel(wp::fabricarray_t<void> src
 }
 static __global__ void array_copy_from_fabric_indexed_kernel(wp::indexedfabricarray_t<void> src,
-                                                             void* dst_data, int dst_stride, const int* dst_indices,
-                                                             int elem_size)
+                                                             void* dst_data, size_t dst_stride, const int* dst_indices,
+                                                             size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < src.size)
     {
-        int src_index = src.indices[tid];
-        int dst_idx = dst_indices ? dst_indices[tid] : tid;
+        size_t src_index = src.indices[tid];
+        size_t dst_idx = dst_indices ? dst_indices[tid] : tid;
         void* dst_ptr = (char*)dst_data + dst_idx * dst_stride;
         const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
@@ -1121,14 +1349,14 @@ static __global__ void array_copy_from_fabric_indexed_kernel(wp::indexedfabricar
 }
 static __global__ void array_copy_to_fabric_kernel(wp::fabricarray_t<void> dst,
-                                                   const void* src_data, int src_stride, const int* src_indices,
-                                                   int elem_size)
+                                                   const void* src_data, size_t src_stride, const int* src_indices,
+                                                   size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
-        int src_idx = src_indices ? src_indices[tid] : tid;
+        size_t src_idx = src_indices ? src_indices[tid] : tid;
         const void* src_ptr = (const char*)src_data + src_idx * src_stride;
         void* dst_ptr = fabricarray_element_ptr(dst, tid, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
@@ -1136,25 +1364,25 @@ static __global__ void array_copy_to_fabric_kernel(wp::fabricarray_t<void> dst,
 }
 static __global__ void array_copy_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst,
-                                                           const void* src_data, int src_stride, const int* src_indices,
-                                                           int elem_size)
+                                                           const void* src_data, size_t src_stride, const int* src_indices,
+                                                           size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
-        int src_idx = src_indices ? src_indices[tid] : tid;
+        size_t src_idx = src_indices ? src_indices[tid] : tid;
         const void* src_ptr = (const char*)src_data + src_idx * src_stride;
-        int dst_idx = dst.indices[tid];
+        size_t dst_idx = dst.indices[tid];
         void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_idx, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
     }
 }
-static __global__ void array_copy_fabric_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::fabricarray_t<void> src, int elem_size)
+static __global__ void array_copy_fabric_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::fabricarray_t<void> src, size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
@@ -1165,27 +1393,27 @@ static __global__ void array_copy_fabric_to_fabric_kernel(wp::fabricarray_t<void
 }
-static __global__ void array_copy_fabric_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::fabricarray_t<void> src, int elem_size)
+static __global__ void array_copy_fabric_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::fabricarray_t<void> src, size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
         const void* src_ptr = fabricarray_element_ptr(src, tid, elem_size);
-        int dst_index = dst.indices[tid];
+        size_t dst_index = dst.indices[tid];
         void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_index, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
     }
 }
-static __global__ void array_copy_fabric_indexed_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, int elem_size)
+static __global__ void array_copy_fabric_indexed_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
-        int src_index = src.indices[tid];
+        size_t src_index = src.indices[tid];
         const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
         void* dst_ptr = fabricarray_element_ptr(dst, tid, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
@@ -1193,14 +1421,14 @@ static __global__ void array_copy_fabric_indexed_to_fabric_kernel(wp::fabricarra
 }
-static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, int elem_size)
+static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
-        int src_index = src.indices[tid];
-        int dst_index = dst.indices[tid];
+        size_t src_index = src.indices[tid];
+        size_t dst_index = dst.indices[tid];
         const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
         void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_index, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
@@ -1439,9 +1667,9 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
     }
     case 2:
     {
-        wp::vec_t<2, int> shape_v(src_shape[0], src_shape[1]);
-        wp::vec_t<2, int> src_strides_v(src_strides[0], src_strides[1]);
-        wp::vec_t<2, int> dst_strides_v(dst_strides[0], dst_strides[1]);
+        wp::vec_t<2, size_t> shape_v(src_shape[0], src_shape[1]);
+        wp::vec_t<2, size_t> src_strides_v(src_strides[0], src_strides[1]);
+        wp::vec_t<2, size_t> dst_strides_v(dst_strides[0], dst_strides[1]);
         wp::vec_t<2, const int*> src_indices_v(src_indices[0], src_indices[1]);
         wp::vec_t<2, const int*> dst_indices_v(dst_indices[0], dst_indices[1]);
@@ -1453,9 +1681,9 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
     }
     case 3:
     {
-        wp::vec_t<3, int> shape_v(src_shape[0], src_shape[1], src_shape[2]);
-        wp::vec_t<3, int> src_strides_v(src_strides[0], src_strides[1], src_strides[2]);
-        wp::vec_t<3, int> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2]);
+        wp::vec_t<3, size_t> shape_v(src_shape[0], src_shape[1], src_shape[2]);
+        wp::vec_t<3, size_t> src_strides_v(src_strides[0], src_strides[1], src_strides[2]);
+        wp::vec_t<3, size_t> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2]);
         wp::vec_t<3, const int*> src_indices_v(src_indices[0], src_indices[1], src_indices[2]);
         wp::vec_t<3, const int*> dst_indices_v(dst_indices[0], dst_indices[1], dst_indices[2]);
@@ -1467,9 +1695,9 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
     }
     case 4:
     {
-        wp::vec_t<4, int> shape_v(src_shape[0], src_shape[1], src_shape[2], src_shape[3]);
-        wp::vec_t<4, int> src_strides_v(src_strides[0], src_strides[1], src_strides[2], src_strides[3]);
-        wp::vec_t<4, int> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2], dst_strides[3]);
+        wp::vec_t<4, size_t> shape_v(src_shape[0], src_shape[1], src_shape[2], src_shape[3]);
+        wp::vec_t<4, size_t> src_strides_v(src_strides[0], src_strides[1], src_strides[2], src_strides[3]);
+        wp::vec_t<4, size_t> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2], dst_strides[3]);
         wp::vec_t<4, const int*> src_indices_v(src_indices[0], src_indices[1], src_indices[2], src_indices[3]);
         wp::vec_t<4, const int*> dst_indices_v(dst_indices[0], dst_indices[1], dst_indices[2], dst_indices[3]);
@@ -1489,94 +1717,94 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
 static __global__ void array_fill_1d_kernel(void* data,
-                                            int n,
-                                            int stride,
+                                            size_t n,
+                                            size_t stride,
                                             const int* indices,
                                             const void* value,
-                                            int value_size)
+                                            size_t value_size)
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t i = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (i < n)
     {
-        int idx = indices ? indices[i] : i;
+        size_t idx = indices ? indices[i] : i;
         char* p = (char*)data + idx * stride;
         memcpy(p, value, value_size);
     }
 }
 static __global__ void array_fill_2d_kernel(void* data,
-                                            wp::vec_t<2, int> shape,
-                                            wp::vec_t<2, int> strides,
+                                            wp::vec_t<2, size_t> shape,
+                                            wp::vec_t<2, size_t> strides,
                                             wp::vec_t<2, const int*> indices,
                                             const void* value,
-                                            int value_size)
+                                            size_t value_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int i = tid / n;
-    int j = tid % n;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t i = tid / n;
+    size_t j = tid % n;
     if (i < shape[0] /*&& j < shape[1]*/)
     {
-        int idx0 = indices[0] ? indices[0][i] : i;
-        int idx1 = indices[1] ? indices[1][j] : j;
+        size_t idx0 = indices[0] ? indices[0][i] : i;
+        size_t idx1 = indices[1] ? indices[1][j] : j;
         char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1];
         memcpy(p, value, value_size);
     }
 }
 static __global__ void array_fill_3d_kernel(void* data,
-                                            wp::vec_t<3, int> shape,
-                                            wp::vec_t<3, int> strides,
+                                            wp::vec_t<3, size_t> shape,
+                                            wp::vec_t<3, size_t> strides,
                                             wp::vec_t<3, const int*> indices,
                                             const void* value,
-                                            int value_size)
-{
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int o = shape[2];
-    int i = tid / (n * o);
-    int j = tid % (n * o) / o;
-    int k = tid % o;
+                                            size_t value_size)
+{
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t o = shape[2];
+    size_t i = tid / (n * o);
+    size_t j = tid % (n * o) / o;
+    size_t k = tid % o;
     if (i < shape[0] && j < shape[1] /*&& k < shape[2]*/)
     {
-        int idx0 = indices[0] ? indices[0][i] : i;
-        int idx1 = indices[1] ? indices[1][j] : j;
-        int idx2 = indices[2] ? indices[2][k] : k;
+        size_t idx0 = indices[0] ? indices[0][i] : i;
+        size_t idx1 = indices[1] ? indices[1][j] : j;
+        size_t idx2 = indices[2] ? indices[2][k] : k;
         char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1] + idx2 * strides[2];
         memcpy(p, value, value_size);
     }
 }
 static __global__ void array_fill_4d_kernel(void* data,
-                                            wp::vec_t<4, int> shape,
-                                            wp::vec_t<4, int> strides,
+                                            wp::vec_t<4, size_t> shape,
+                                            wp::vec_t<4, size_t> strides,
                                             wp::vec_t<4, const int*> indices,
                                             const void* value,
-                                            int value_size)
-{
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int o = shape[2];
-    int p = shape[3];
-    int i = tid / (n * o * p);
-    int j = tid % (n * o * p) / (o * p);
-    int k = tid % (o * p) / p;
-    int l = tid % p;
+                                            size_t value_size)
+{
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t o = shape[2];
+    size_t p = shape[3];
+    size_t i = tid / (n * o * p);
+    size_t j = tid % (n * o * p) / (o * p);
+    size_t k = tid % (o * p) / p;
+    size_t l = tid % p;
     if (i < shape[0] && j < shape[1] && k < shape[2] /*&& l < shape[3]*/)
     {
-        int idx0 = indices[0] ? indices[0][i] : i;
-        int idx1 = indices[1] ? indices[1][j] : j;
-        int idx2 = indices[2] ? indices[2][k] : k;
-        int idx3 = indices[3] ? indices[3][l] : l;
+        size_t idx0 = indices[0] ? indices[0][i] : i;
+        size_t idx1 = indices[1] ? indices[1][j] : j;
+        size_t idx2 = indices[2] ? indices[2][k] : k;
+        size_t idx3 = indices[3] ? indices[3][l] : l;
         char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1] + idx2 * strides[2] + idx3 * strides[3];
         memcpy(p, value, value_size);
     }
 }
-static __global__ void array_fill_fabric_kernel(wp::fabricarray_t<void> fa, const void* value, int value_size)
+static __global__ void array_fill_fabric_kernel(wp::fabricarray_t<void> fa, const void* value, size_t value_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < fa.size)
     {
         void* dst_ptr = fabricarray_element_ptr(fa, tid, value_size);
@@ -1585,9 +1813,9 @@ static __global__ void array_fill_fabric_kernel(wp::fabricarray_t<void> fa, cons
 }
-static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t<void> ifa, const void* value, int value_size)
+static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t<void> ifa, const void* value, size_t value_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < ifa.size)
     {
         size_t idx = size_t(ifa.indices[tid]);
@@ -1654,67 +1882,76 @@ WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, con
     ContextGuard guard(context);
-    // copy value to device memory
-    // TODO: use a persistent stream-local staging buffer to avoid allocs?
-    void* value_devptr = wp_alloc_device(WP_CURRENT_CONTEXT, value_size);
-    check_cuda(cudaMemcpyAsync(value_devptr, value_ptr, value_size, cudaMemcpyHostToDevice, get_current_stream()));
+    void* value_devptr = NULL;  // fill value in device memory
+    bool free_devptr = true;  // whether we need to free the memory
+    // prepare the fill value in a graph-friendly way
+    if (!capturable_tmp_alloc(WP_CURRENT_CONTEXT, value_ptr, value_size, &value_devptr, &free_devptr))
+    {
+        fprintf(stderr, "Warp fill error: failed to copy value to device memory\n");
+        return;
+    }
-    // handle fabric arrays
     if (fa)
     {
+        // handle fabric arrays
         wp_launch_device(WP_CURRENT_CONTEXT, array_fill_fabric_kernel, n,
                          (*fa, value_devptr, value_size));
-        return;
     }
     else if (ifa)
     {
+        // handle indexed fabric arrays
         wp_launch_device(WP_CURRENT_CONTEXT, array_fill_fabric_indexed_kernel, n,
                          (*ifa, value_devptr, value_size));
-        return;
-    }
-    // handle regular or indexed arrays
-    switch (ndim)
-    {
-    case 1:
-    {
-        wp_launch_device(WP_CURRENT_CONTEXT, array_fill_1d_kernel, n,
-                         (data, shape[0], strides[0], indices[0], value_devptr, value_size));
-        break;
-    }
-    case 2:
-    {
-        wp::vec_t<2, int> shape_v(shape[0], shape[1]);
-        wp::vec_t<2, int> strides_v(strides[0], strides[1]);
-        wp::vec_t<2, const int*> indices_v(indices[0], indices[1]);
-        wp_launch_device(WP_CURRENT_CONTEXT, array_fill_2d_kernel, n,
-                         (data, shape_v, strides_v, indices_v, value_devptr, value_size));
-        break;
     }
-    case 3:
+    else
     {
-        wp::vec_t<3, int> shape_v(shape[0], shape[1], shape[2]);
-        wp::vec_t<3, int> strides_v(strides[0], strides[1], strides[2]);
-        wp::vec_t<3, const int*> indices_v(indices[0], indices[1], indices[2]);
-        wp_launch_device(WP_CURRENT_CONTEXT, array_fill_3d_kernel, n,
-                         (data, shape_v, strides_v, indices_v, value_devptr, value_size));
-        break;
+        // handle regular or indexed arrays
+        switch (ndim)
+        {
+        case 1:
+        {
+            wp_launch_device(WP_CURRENT_CONTEXT, array_fill_1d_kernel, n,
+                            (data, shape[0], strides[0], indices[0], value_devptr, value_size));
+            break;
+        }
+        case 2:
+        {
+            wp::vec_t<2, size_t> shape_v(shape[0], shape[1]);
+            wp::vec_t<2, size_t> strides_v(strides[0], strides[1]);
+            wp::vec_t<2, const int*> indices_v(indices[0], indices[1]);
+            wp_launch_device(WP_CURRENT_CONTEXT, array_fill_2d_kernel, n,
+                            (data, shape_v, strides_v, indices_v, value_devptr, value_size));
+            break;
+        }
+        case 3:
+        {
+            wp::vec_t<3, size_t> shape_v(shape[0], shape[1], shape[2]);
+            wp::vec_t<3, size_t> strides_v(strides[0], strides[1], strides[2]);
+            wp::vec_t<3, const int*> indices_v(indices[0], indices[1], indices[2]);
+            wp_launch_device(WP_CURRENT_CONTEXT, array_fill_3d_kernel, n,
+                            (data, shape_v, strides_v, indices_v, value_devptr, value_size));
+            break;
+        }
+        case 4:
+        {
+            wp::vec_t<4, size_t> shape_v(shape[0], shape[1], shape[2], shape[3]);
+            wp::vec_t<4, size_t> strides_v(strides[0], strides[1], strides[2], strides[3]);
+            wp::vec_t<4, const int*> indices_v(indices[0], indices[1], indices[2], indices[3]);
+            wp_launch_device(WP_CURRENT_CONTEXT, array_fill_4d_kernel, n,
+                            (data, shape_v, strides_v, indices_v, value_devptr, value_size));
+            break;
+        }
+        default:
+            fprintf(stderr, "Warp fill error: invalid array dimensionality (%d)\n", ndim);
+            break;
+        }
     }
-    case 4:
+    if (free_devptr)
     {
-        wp::vec_t<4, int> shape_v(shape[0], shape[1], shape[2], shape[3]);
-        wp::vec_t<4, int> strides_v(strides[0], strides[1], strides[2], strides[3]);
-        wp::vec_t<4, const int*> indices_v(indices[0], indices[1], indices[2], indices[3]);
-        wp_launch_device(WP_CURRENT_CONTEXT, array_fill_4d_kernel, n,
-                         (data, shape_v, strides_v, indices_v, value_devptr, value_size));
-        break;
+        wp_free_device(WP_CURRENT_CONTEXT, value_devptr);
     }
-    default:
-        fprintf(stderr, "Warp fill error: invalid array dimensionality (%d)\n", ndim);
-        return;
-    }
-    wp_free_device(WP_CURRENT_CONTEXT, value_devptr);
 }
 void wp_array_scan_int_device(uint64_t in, uint64_t out, int len, bool inclusive)
@@ -2071,14 +2308,15 @@ void wp_cuda_context_synchronize(void* context)
     check_cu(cuCtxSynchronize_f());
-    if (free_deferred_allocs(context ? context : get_current_context()) > 0)
+    if (!context)
+        context = get_current_context();
+    if (run_deferred_actions(context) > 0)
     {
-        // ensure deferred asynchronous deallocations complete
+        // ensure deferred asynchronous operations complete
         check_cu(cuCtxSynchronize_f());
     }
-    unload_deferred_modules(context);
     // check_cuda(cudaDeviceGraphMemTrim(wp_cuda_context_get_device_ordinal(context)));
 }
@@ -2448,6 +2686,9 @@ void wp_cuda_stream_destroy(void* context, void* stream)
     wp_cuda_stream_unregister(context, stream);
+    // release temporary radix sort buffer associated with this stream
+    radix_sort_release(context, stream);
     check_cu(cuStreamDestroy_f(static_cast<CUstream>(stream)));
 }
@@ -2510,15 +2751,36 @@ void wp_cuda_stream_synchronize(void* stream)
     check_cu(cuStreamSynchronize_f(static_cast<CUstream>(stream)));
 }
-void wp_cuda_stream_wait_event(void* stream, void* event)
+void wp_cuda_stream_wait_event(void* stream, void* event, bool external)
 {
-    check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
+    // the external flag can only be used during graph capture
+    if (external && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
+    {
+        // wait for an external event during graph capture
+        check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), CU_EVENT_WAIT_EXTERNAL));
+    }
+    else
+    {
+        check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), CU_EVENT_WAIT_DEFAULT));
+    }
 }
-void wp_cuda_stream_wait_stream(void* stream, void* other_stream, void* event)
+void wp_cuda_stream_wait_stream(void* stream, void* other_stream, void* event, bool external)
 {
-    check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(other_stream)));
-    check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
+    unsigned record_flags = CU_EVENT_RECORD_DEFAULT;
+    unsigned wait_flags = CU_EVENT_WAIT_DEFAULT;
+    // the external flag can only be used during graph capture
+    if (external && !g_captures.empty())
+    {
+        if (wp_cuda_stream_is_capturing(other_stream))
+            record_flags = CU_EVENT_RECORD_EXTERNAL;
+        if (wp_cuda_stream_is_capturing(stream))
+            wait_flags = CU_EVENT_WAIT_EXTERNAL;
+    }
+    check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(other_stream), record_flags));
+    check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), wait_flags));
 }
 int wp_cuda_stream_is_capturing(void* stream)
@@ -2571,11 +2833,12 @@ int wp_cuda_event_query(void* event)
     return res;
 }
-void wp_cuda_event_record(void* event, void* stream, bool timing)
+void wp_cuda_event_record(void* event, void* stream, bool external)
 {
-    if (timing && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
+    // the external flag can only be used during graph capture
+    if (external && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
     {
-        // record timing event during graph capture
+        // record external event during graph capture (e.g., for timing or when explicitly specified by the user)
         check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(stream), CU_EVENT_RECORD_EXTERNAL));
     }
     else
@@ -2625,7 +2888,7 @@ bool wp_cuda_graph_begin_capture(void* context, void* stream, int external)
     else
     {
         // start the capture
-        if (!check_cuda(cudaStreamBeginCapture(cuda_stream, cudaStreamCaptureModeGlobal)))
+        if (!check_cuda(cudaStreamBeginCapture(cuda_stream, cudaStreamCaptureModeThreadLocal)))
             return false;
     }
@@ -2669,6 +2932,7 @@ bool wp_cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
     // get capture info
     bool external = capture->external;
     uint64_t capture_id = capture->id;
+    std::vector<FreeInfo> tmp_allocs = capture->tmp_allocs;
     // clear capture info
     stream_info->capture = NULL;
@@ -2738,15 +3002,17 @@ bool wp_cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
             unfreed_allocs.push_back(it->first);
     }
-    if (!unfreed_allocs.empty())
+    if (!unfreed_allocs.empty() || !tmp_allocs.empty())
     {
         // Create a user object that will notify us when the instantiated graph is destroyed.
         // This works for external captures also, since we wouldn't otherwise know when
         // the externally-created graph instance gets deleted.
         // This callback is guaranteed to arrive after the graph has finished executing on the device,
         // not necessarily when cudaGraphExecDestroy() is called.
-        GraphInfo* graph_info = new GraphInfo;
+        GraphDestroyCallbackInfo* graph_info = new GraphDestroyCallbackInfo;
+        graph_info->context = context ? context : get_current_context();
         graph_info->unfreed_allocs = unfreed_allocs;
+        graph_info->tmp_allocs = tmp_allocs;
         cudaUserObject_t user_object;
         check_cuda(cudaUserObjectCreate(&user_object, graph_info, on_graph_destroy, 1, cudaUserObjectNoDestructorSync));
         check_cuda(cudaGraphRetainUserObject(graph, user_object, 1, cudaGraphUserObjectMove));
@@ -2770,8 +3036,7 @@ bool wp_cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
     // process deferred free list if no more captures are ongoing
     if (g_captures.empty())
     {
-        free_deferred_allocs();
-        unload_deferred_modules();
+        run_deferred_actions();
     }
     if (graph_ret)
@@ -2811,11 +3076,12 @@ bool wp_cuda_graph_create_exec(void* context, void* stream, void* graph, void**
 // Support for conditional graph nodes available with CUDA 12.4+.
 #if CUDA_VERSION >= 12040
-// CUBIN data for compiled conditional modules, loaded on demand, keyed on device architecture
-static std::map<int, void*> g_conditional_cubins;
+// CUBIN or PTX data for compiled conditional modules, loaded on demand, keyed on device architecture
+using ModuleKey = std::pair<int, bool>; // <arch, use_ptx>
+static std::map<ModuleKey, void*> g_conditional_modules;
 // Compile module with conditional helper kernels
-static void* compile_conditional_module(int arch)
+static void* compile_conditional_module(int arch, bool use_ptx)
 {
     static const char* kernel_source = R"(
         typedef __device_builtin__ unsigned long long cudaGraphConditionalHandle;
@@ -2844,8 +3110,9 @@ static void* compile_conditional_module(int arch)
     )";
     // avoid recompilation
-    auto it = g_conditional_cubins.find(arch);
-    if (it != g_conditional_cubins.end())
+    ModuleKey key = {arch, use_ptx};
+    auto it = g_conditional_modules.find(key);
+    if (it != g_conditional_modules.end())
         return it->second;
     nvrtcProgram prog;
@@ -2853,11 +3120,23 @@ static void* compile_conditional_module(int arch)
         return NULL;
     char arch_opt[128];
-    snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=sm_%d", arch);
+    if (use_ptx)
+        snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=compute_%d", arch);
+    else
+        snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=sm_%d", arch);
     std::vector<const char*> opts;
     opts.push_back(arch_opt);
+    const bool print_debug = (std::getenv("WARP_DEBUG") != nullptr);
+    if (print_debug)
+    {
+        printf("NVRTC options (conditional module, arch=%d, use_ptx=%s):\n", arch, use_ptx ? "true" : "false");
+        for(auto o: opts) {
+            printf("%s\n", o);
+        }
+    }
     if (!check_nvrtc(nvrtcCompileProgram(prog, int(opts.size()), opts.data())))
     {
         size_t log_size;
@@ -2874,23 +3153,37 @@ static void* compile_conditional_module(int arch)
     // get output
     char* output = NULL;
     size_t output_size = 0;
-    check_nvrtc(nvrtcGetCUBINSize(prog, &output_size));
-    if (output_size > 0)
+    if (use_ptx)
+    {
+        check_nvrtc(nvrtcGetPTXSize(prog, &output_size));
+        if (output_size > 0)
+        {
+            output = new char[output_size];
+            if (check_nvrtc(nvrtcGetPTX(prog, output)))
+                g_conditional_modules[key] = output;
+        }
+    }
+    else
     {
-        output = new char[output_size];
-        if (check_nvrtc(nvrtcGetCUBIN(prog, output)))
-            g_conditional_cubins[arch] = output;
+        check_nvrtc(nvrtcGetCUBINSize(prog, &output_size));
+        if (output_size > 0)
+        {
+            output = new char[output_size];
+            if (check_nvrtc(nvrtcGetCUBIN(prog, output)))
+                g_conditional_modules[key] = output;
+        }
     }
     nvrtcDestroyProgram(&prog);
-    // return CUBIN data
+    // return CUBIN or PTX data
     return output;
 }
 // Load module with conditional helper kernels
-static CUmodule load_conditional_module(void* context)
+static CUmodule load_conditional_module(void* context, int arch, bool use_ptx)
 {
     ContextInfo* context_info = get_context_info(context);
     if (!context_info)
@@ -2900,17 +3193,15 @@ static CUmodule load_conditional_module(void* context)
     if (context_info->conditional_module)
         return context_info->conditional_module;
-    int arch = context_info->device_info->arch;
     // compile if needed
-    void* compiled_module = compile_conditional_module(arch);
+    void* compiled_module = compile_conditional_module(arch, use_ptx);
     if (!compiled_module)
     {
         fprintf(stderr, "Warp error: Failed to compile conditional kernels\n");
         return NULL;
     }
-    // load module
+    // load module (handles both PTX and CUBIN data automatically)
     CUmodule module = NULL;
     if (!check_cu(cuModuleLoadDataEx_f(&module, compiled_module, 0, NULL, NULL)))
     {
@@ -2923,10 +3214,10 @@ static CUmodule load_conditional_module(void* context)
     return module;
 }
-static CUfunction get_conditional_kernel(void* context, const char* name)
+static CUfunction get_conditional_kernel(void* context, int arch, bool use_ptx, const char* name)
 {
     // load module if needed
-    CUmodule module = load_conditional_module(context);
+    CUmodule module = load_conditional_module(context, arch, use_ptx);
     if (!module)
         return NULL;
@@ -2966,7 +3257,7 @@ bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
                                   leaf_nodes.data(),
                                   nullptr,
                                   leaf_nodes.size(),
-                                  cudaStreamCaptureModeGlobal)))
+                                  cudaStreamCaptureModeThreadLocal)))
         return false;
     return true;
@@ -2976,7 +3267,7 @@ bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
 // https://developer.nvidia.com/blog/dynamic-control-flow-in-cuda-graphs-with-conditional-nodes/
 // condition is a gpu pointer
 // if_graph_ret and else_graph_ret should be NULL if not needed
-bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
+bool wp_cuda_graph_insert_if_else(void* context, void* stream, int arch, bool use_ptx, int* condition, void** if_graph_ret, void** else_graph_ret)
 {
     bool has_if = if_graph_ret != NULL;
     bool has_else = else_graph_ret != NULL;
@@ -3019,9 +3310,9 @@ bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, v
         // (need to negate the condition if only the else branch is used)
         CUfunction kernel;
         if (has_if)
-            kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
+            kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
         else
-            kernel = get_conditional_kernel(context, "set_conditional_else_handle_kernel");
+            kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_else_handle_kernel");
         if (!kernel)
         {
@@ -3072,7 +3363,7 @@ bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, v
         check_cuda(cudaGraphConditionalHandleCreate(&if_handle, cuda_graph));
         check_cuda(cudaGraphConditionalHandleCreate(&else_handle, cuda_graph));
-        CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_else_handles_kernel");
+        CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_else_handles_kernel");
         if (!kernel)
         {
             wp::set_error_string("Failed to get built-in conditional kernel");
@@ -3273,7 +3564,7 @@ bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_g
     return true;
 }
-bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
+bool wp_cuda_graph_insert_while(void* context, void* stream, int arch, bool use_ptx, int* condition, void** body_graph_ret, uint64_t* handle_ret)
 {
     // if there's no body, it's a no-op
     if (!body_graph_ret)
@@ -3303,7 +3594,7 @@ bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, voi
         return false;
     // launch a kernel to set the condition handle from condition pointer
-    CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
+    CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
     if (!kernel)
     {
         wp::set_error_string("Failed to get built-in conditional kernel");
@@ -3339,14 +3630,14 @@ bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, voi
     return true;
 }
-bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
+bool wp_cuda_graph_set_condition(void* context, void* stream, int arch, bool use_ptx, int* condition, uint64_t handle)
 {
     ContextGuard guard(context);
     CUstream cuda_stream = static_cast<CUstream>(stream);
     // launch a kernel to set the condition handle from condition pointer
-    CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
+    CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
     if (!kernel)
     {
         wp::set_error_string("Failed to get built-in conditional kernel");
@@ -3378,19 +3669,19 @@ bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
     return false;
 }
-bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
+bool wp_cuda_graph_insert_if_else(void* context, void* stream, int arch, bool use_ptx, int* condition, void** if_graph_ret, void** else_graph_ret)
 {
     // Stub variant: no conditional node is inserted. It only records an error string saying
     // that a build with CUDA Toolkit 12.4+ is required, then reports failure to the caller.
     // The arch/use_ptx parameters added to the signature by this change are not used here.
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
-bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
+bool wp_cuda_graph_insert_while(void* context, void* stream, int arch, bool use_ptx, int* condition, void** body_graph_ret, uint64_t* handle_ret)
 {
     // Stub variant: no while-node is inserted and none of the output pointers are written.
     // It only records an error string saying that a build with CUDA Toolkit 12.4+ is required,
     // then returns false. The added arch/use_ptx parameters are not used here.
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
 }
-bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
+bool wp_cuda_graph_set_condition(void* context, void* stream, int arch, bool use_ptx, int* condition, uint64_t handle)
 {
     wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
     return false;
@@ -3425,16 +3716,38 @@ bool wp_cuda_graph_launch(void* graph_exec, void* stream)
 // Destroy a CUDA graph (cudaGraph_t passed as void*).
 // Old behavior: always destroy immediately under a ContextGuard for `context`.
 // New behavior: destroy immediately only when g_captures is empty; otherwise queue a
 // GraphDestroyInfo (with `context` resolved to the current context when NULL) on
 // g_deferred_graph_list and return true without calling cudaGraphDestroy.
 // Returns the check_cuda() result on the immediate path, and true on the deferred path.
 // NOTE(review): presumably the deferred list is drained once no captures remain - confirm
 // against the code that consumes g_deferred_graph_list, which is not shown here.
 bool wp_cuda_graph_destroy(void* context, void* graph)
 {
-    ContextGuard guard(context);
-    return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
+    // ensure there are no graph captures in progress
+    if (g_captures.empty())
+    {
+        ContextGuard guard(context);
+        return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
+    }
+    else
+    {
+        GraphDestroyInfo info;
+        info.context = context ? context : get_current_context();
+        info.graph = graph;
+        g_deferred_graph_list.push_back(info);
+        return true;
+    }
 }
 // Destroy an instantiated CUDA graph (cudaGraphExec_t passed as void*).
 // Old behavior: always destroy immediately under a ContextGuard for `context`.
 // New behavior: destroy immediately only when g_captures is empty; otherwise queue a
 // GraphDestroyInfo carrying `graph_exec` (with `context` resolved to the current context
 // when NULL) on g_deferred_graph_list and return true without calling cudaGraphExecDestroy.
 // Returns the check_cuda() result on the immediate path, and true on the deferred path.
 // NOTE(review): the deferred branch sets only info.graph_exec; whether info.graph has a
 // default initializer depends on the GraphDestroyInfo declaration, which is not shown - confirm.
 bool wp_cuda_graph_exec_destroy(void* context, void* graph_exec)
 {
-    ContextGuard guard(context);
-    return check_cuda(cudaGraphExecDestroy((cudaGraphExec_t)graph_exec));
+    // ensure there are no graph captures in progress
+    if (g_captures.empty())
+    {
+        ContextGuard guard(context);
+        return check_cuda(cudaGraphExecDestroy((cudaGraphExec_t)graph_exec));
+    }
+    else
+    {
+        GraphDestroyInfo info;
+        info.context = context ? context : get_current_context();
+        info.graph_exec = graph_exec;
+        g_deferred_graph_list.push_back(info);
+        return true;
+    }
 }
 bool write_file(const char* data, size_t size, std::string filename, const char* mode)
@@ -4287,17 +4600,5 @@ void wp_cuda_timing_end(timing_result_t* results, int size)
     g_cuda_timing_state = parent_state;
 }
-// impl. files
-#include "bvh.cu"
-#include "mesh.cu"
-#include "sort.cu"
-#include "hashgrid.cu"
-#include "reduce.cu"
-#include "runlength_encode.cu"
-#include "scan.cu"
-#include "sparse.cu"
-#include "volume.cu"
-#include "volume_builder.cu"
 //#include "spline.inl"
 //#include "volume.inl"