PyPI - warp-lang - Versions diffs - 1.9.1__py3-none-manylinux_2_34_aarch64.whl → 1.10.0rc2__py3-none-manylinux_2_34_aarch64.whl - Mend

warp-lang 1.9.1__py3-none-manylinux_2_34_aarch64.whl → 1.10.0rc2__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (346)

warp/__init__.py +301 -287
warp/__init__.pyi +794 -305
warp/_src/__init__.py +14 -0
warp/_src/autograd.py +1075 -0
warp/_src/build.py +618 -0
warp/_src/build_dll.py +640 -0
warp/{builtins.py → _src/builtins.py} +1382 -377
warp/_src/codegen.py +4359 -0
warp/{config.py → _src/config.py} +178 -169
warp/_src/constants.py +57 -0
warp/_src/context.py +8294 -0
warp/_src/dlpack.py +462 -0
warp/_src/fabric.py +355 -0
warp/_src/fem/__init__.py +14 -0
warp/_src/fem/adaptivity.py +508 -0
warp/_src/fem/cache.py +687 -0
warp/_src/fem/dirichlet.py +188 -0
warp/{fem → _src/fem}/domain.py +40 -30
warp/_src/fem/field/__init__.py +131 -0
warp/_src/fem/field/field.py +701 -0
warp/{fem → _src/fem}/field/nodal_field.py +30 -15
warp/{fem → _src/fem}/field/restriction.py +1 -1
warp/{fem → _src/fem}/field/virtual.py +53 -27
warp/_src/fem/geometry/__init__.py +32 -0
warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +77 -163
warp/_src/fem/geometry/closest_point.py +97 -0
warp/{fem → _src/fem}/geometry/deformed_geometry.py +14 -22
warp/{fem → _src/fem}/geometry/element.py +32 -10
warp/{fem → _src/fem}/geometry/geometry.py +48 -20
warp/{fem → _src/fem}/geometry/grid_2d.py +12 -23
warp/{fem → _src/fem}/geometry/grid_3d.py +12 -23
warp/{fem → _src/fem}/geometry/hexmesh.py +40 -63
warp/{fem → _src/fem}/geometry/nanogrid.py +255 -248
warp/{fem → _src/fem}/geometry/partition.py +121 -63
warp/{fem → _src/fem}/geometry/quadmesh.py +26 -45
warp/{fem → _src/fem}/geometry/tetmesh.py +40 -63
warp/{fem → _src/fem}/geometry/trimesh.py +26 -45
warp/{fem → _src/fem}/integrate.py +164 -158
warp/_src/fem/linalg.py +383 -0
warp/_src/fem/operator.py +396 -0
warp/_src/fem/polynomial.py +229 -0
warp/{fem → _src/fem}/quadrature/pic_quadrature.py +15 -20
warp/{fem → _src/fem}/quadrature/quadrature.py +95 -47
warp/_src/fem/space/__init__.py +248 -0
warp/{fem → _src/fem}/space/basis_function_space.py +20 -11
warp/_src/fem/space/basis_space.py +679 -0
warp/{fem → _src/fem}/space/dof_mapper.py +3 -3
warp/{fem → _src/fem}/space/function_space.py +14 -13
warp/{fem → _src/fem}/space/grid_2d_function_space.py +4 -7
warp/{fem → _src/fem}/space/grid_3d_function_space.py +4 -4
warp/{fem → _src/fem}/space/hexmesh_function_space.py +4 -10
warp/{fem → _src/fem}/space/nanogrid_function_space.py +3 -9
warp/{fem → _src/fem}/space/partition.py +117 -60
warp/{fem → _src/fem}/space/quadmesh_function_space.py +4 -10
warp/{fem → _src/fem}/space/restriction.py +66 -33
warp/_src/fem/space/shape/__init__.py +152 -0
warp/{fem → _src/fem}/space/shape/cube_shape_function.py +9 -9
warp/{fem → _src/fem}/space/shape/shape_function.py +8 -9
warp/{fem → _src/fem}/space/shape/square_shape_function.py +6 -6
warp/{fem → _src/fem}/space/shape/tet_shape_function.py +3 -3
warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +3 -3
warp/{fem → _src/fem}/space/tetmesh_function_space.py +3 -9
warp/_src/fem/space/topology.py +459 -0
warp/{fem → _src/fem}/space/trimesh_function_space.py +3 -9
warp/_src/fem/types.py +112 -0
warp/_src/fem/utils.py +486 -0
warp/_src/jax.py +186 -0
warp/_src/jax_experimental/__init__.py +14 -0
warp/_src/jax_experimental/custom_call.py +387 -0
warp/_src/jax_experimental/ffi.py +1284 -0
warp/_src/jax_experimental/xla_ffi.py +656 -0
warp/_src/marching_cubes.py +708 -0
warp/_src/math.py +414 -0
warp/_src/optim/__init__.py +14 -0
warp/_src/optim/adam.py +163 -0
warp/_src/optim/linear.py +1606 -0
warp/_src/optim/sgd.py +112 -0
warp/_src/paddle.py +406 -0
warp/_src/render/__init__.py +14 -0
warp/_src/render/imgui_manager.py +289 -0
warp/_src/render/render_opengl.py +3636 -0
warp/_src/render/render_usd.py +937 -0
warp/_src/render/utils.py +160 -0
warp/_src/sparse.py +2716 -0
warp/_src/tape.py +1206 -0
warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
warp/_src/torch.py +391 -0
warp/_src/types.py +5870 -0
warp/_src/utils.py +1693 -0
warp/autograd.py +12 -1054
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +8 -588
warp/build_dll.py +6 -721
warp/codegen.py +6 -4251
warp/constants.py +6 -39
warp/context.py +12 -8062
warp/dlpack.py +6 -444
warp/examples/distributed/example_jacobi_mpi.py +4 -5
warp/examples/fem/example_adaptive_grid.py +1 -1
warp/examples/fem/example_apic_fluid.py +1 -1
warp/examples/fem/example_burgers.py +8 -8
warp/examples/fem/example_diffusion.py +1 -1
warp/examples/fem/example_distortion_energy.py +1 -1
warp/examples/fem/example_mixed_elasticity.py +2 -2
warp/examples/fem/example_navier_stokes.py +1 -1
warp/examples/fem/example_nonconforming_contact.py +7 -7
warp/examples/fem/example_stokes.py +1 -1
warp/examples/fem/example_stokes_transfer.py +1 -1
warp/examples/fem/utils.py +2 -2
warp/examples/interop/example_jax_callable.py +1 -1
warp/examples/interop/example_jax_ffi_callback.py +1 -1
warp/examples/interop/example_jax_kernel.py +1 -1
warp/examples/tile/example_tile_mcgp.py +191 -0
warp/fabric.py +6 -337
warp/fem/__init__.py +159 -97
warp/fem/adaptivity.py +7 -489
warp/fem/cache.py +9 -648
warp/fem/dirichlet.py +6 -184
warp/fem/field/__init__.py +8 -109
warp/fem/field/field.py +7 -652
warp/fem/geometry/__init__.py +7 -18
warp/fem/geometry/closest_point.py +11 -77
warp/fem/linalg.py +18 -366
warp/fem/operator.py +11 -369
warp/fem/polynomial.py +9 -209
warp/fem/space/__init__.py +5 -211
warp/fem/space/basis_space.py +6 -662
warp/fem/space/shape/__init__.py +41 -118
warp/fem/space/topology.py +6 -437
warp/fem/types.py +6 -81
warp/fem/utils.py +11 -444
warp/jax.py +8 -165
warp/jax_experimental/__init__.py +14 -1
warp/jax_experimental/custom_call.py +8 -365
warp/jax_experimental/ffi.py +17 -873
warp/jax_experimental/xla_ffi.py +5 -605
warp/marching_cubes.py +5 -689
warp/math.py +16 -393
warp/native/array.h +385 -37
warp/native/builtin.h +314 -37
warp/native/bvh.cpp +43 -9
warp/native/bvh.cu +62 -27
warp/native/bvh.h +310 -309
warp/native/clang/clang.cpp +102 -97
warp/native/coloring.cpp +0 -1
warp/native/crt.h +208 -0
warp/native/exports.h +156 -0
warp/native/hashgrid.cu +2 -0
warp/native/intersect.h +24 -1
warp/native/intersect_tri.h +44 -35
warp/native/mat.h +1456 -276
warp/native/mesh.cpp +4 -4
warp/native/mesh.cu +4 -2
warp/native/mesh.h +176 -61
warp/native/quat.h +0 -52
warp/native/scan.cu +2 -0
warp/native/sparse.cu +7 -3
warp/native/spatial.h +12 -0
warp/native/tile.h +681 -89
warp/native/tile_radix_sort.h +1 -1
warp/native/tile_reduce.h +394 -46
warp/native/tile_scan.h +4 -4
warp/native/vec.h +469 -0
warp/native/version.h +23 -0
warp/native/volume.cpp +1 -1
warp/native/volume.cu +1 -0
warp/native/volume.h +1 -1
warp/native/volume_builder.cu +2 -0
warp/native/warp.cpp +57 -29
warp/native/warp.cu +253 -171
warp/native/warp.h +11 -8
warp/optim/__init__.py +6 -3
warp/optim/adam.py +6 -145
warp/optim/linear.py +14 -1585
warp/optim/sgd.py +6 -94
warp/paddle.py +6 -388
warp/render/__init__.py +8 -4
warp/render/imgui_manager.py +7 -267
warp/render/render_opengl.py +6 -3618
warp/render/render_usd.py +6 -919
warp/render/utils.py +6 -142
warp/sparse.py +37 -2563
warp/tape.py +6 -1188
warp/tests/__main__.py +1 -1
warp/tests/cuda/test_async.py +4 -4
warp/tests/cuda/test_conditional_captures.py +1 -1
warp/tests/cuda/test_multigpu.py +1 -1
warp/tests/cuda/test_streams.py +58 -1
warp/tests/geometry/test_bvh.py +157 -22
warp/tests/geometry/test_marching_cubes.py +0 -1
warp/tests/geometry/test_mesh.py +5 -3
warp/tests/geometry/test_mesh_query_aabb.py +5 -12
warp/tests/geometry/test_mesh_query_point.py +5 -2
warp/tests/geometry/test_mesh_query_ray.py +15 -3
warp/tests/geometry/test_volume_write.py +5 -5
warp/tests/interop/test_dlpack.py +14 -14
warp/tests/interop/test_jax.py +772 -49
warp/tests/interop/test_paddle.py +1 -1
warp/tests/test_adam.py +0 -1
warp/tests/test_arithmetic.py +9 -9
warp/tests/test_array.py +527 -100
warp/tests/test_array_reduce.py +3 -3
warp/tests/test_atomic.py +12 -8
warp/tests/test_atomic_bitwise.py +209 -0
warp/tests/test_atomic_cas.py +4 -4
warp/tests/test_bool.py +2 -2
warp/tests/test_builtins_resolution.py +5 -571
warp/tests/test_codegen.py +33 -14
warp/tests/test_conditional.py +1 -1
warp/tests/test_context.py +6 -6
warp/tests/test_copy.py +242 -161
warp/tests/test_ctypes.py +3 -3
warp/tests/test_devices.py +24 -2
warp/tests/test_examples.py +16 -84
warp/tests/test_fabricarray.py +35 -35
warp/tests/test_fast_math.py +0 -2
warp/tests/test_fem.py +56 -10
warp/tests/test_fixedarray.py +3 -3
warp/tests/test_func.py +8 -5
warp/tests/test_generics.py +1 -1
warp/tests/test_indexedarray.py +24 -24
warp/tests/test_intersect.py +39 -9
warp/tests/test_large.py +1 -1
warp/tests/test_lerp.py +3 -1
warp/tests/test_linear_solvers.py +1 -1
warp/tests/test_map.py +35 -4
warp/tests/test_mat.py +52 -62
warp/tests/test_mat_constructors.py +4 -5
warp/tests/test_mat_lite.py +1 -1
warp/tests/test_mat_scalar_ops.py +121 -121
warp/tests/test_math.py +34 -0
warp/tests/test_module_aot.py +4 -4
warp/tests/test_modules_lite.py +28 -2
warp/tests/test_print.py +11 -11
warp/tests/test_quat.py +93 -58
warp/tests/test_runlength_encode.py +1 -1
warp/tests/test_scalar_ops.py +38 -10
warp/tests/test_smoothstep.py +1 -1
warp/tests/test_sparse.py +126 -15
warp/tests/test_spatial.py +105 -87
warp/tests/test_special_values.py +6 -6
warp/tests/test_static.py +7 -7
warp/tests/test_struct.py +13 -2
warp/tests/test_triangle_closest_point.py +48 -1
warp/tests/test_types.py +27 -15
warp/tests/test_utils.py +52 -52
warp/tests/test_vec.py +29 -29
warp/tests/test_vec_constructors.py +5 -5
warp/tests/test_vec_scalar_ops.py +97 -97
warp/tests/test_version.py +75 -0
warp/tests/tile/test_tile.py +178 -0
warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
warp/tests/tile/test_tile_cholesky.py +7 -4
warp/tests/tile/test_tile_load.py +26 -2
warp/tests/tile/test_tile_mathdx.py +3 -3
warp/tests/tile/test_tile_matmul.py +1 -1
warp/tests/tile/test_tile_mlp.py +2 -4
warp/tests/tile/test_tile_reduce.py +214 -13
warp/tests/unittest_suites.py +6 -14
warp/tests/unittest_utils.py +10 -9
warp/tests/walkthrough_debug.py +3 -1
warp/torch.py +6 -373
warp/types.py +29 -5764
warp/utils.py +10 -1659
{warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/METADATA +46 -99
warp_lang-1.10.0rc2.dist-info/RECORD +468 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
warp_lang-1.10.0rc2.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
warp/examples/assets/cartpole.urdf +0 -110
warp/examples/assets/crazyflie.usd +0 -0
warp/examples/assets/nv_ant.xml +0 -92
warp/examples/assets/nv_humanoid.xml +0 -183
warp/examples/assets/quadruped.urdf +0 -268
warp/examples/optim/example_bounce.py +0 -266
warp/examples/optim/example_cloth_throw.py +0 -228
warp/examples/optim/example_drone.py +0 -870
warp/examples/optim/example_inverse_kinematics.py +0 -182
warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
warp/examples/optim/example_softbody_properties.py +0 -400
warp/examples/optim/example_spring_cage.py +0 -245
warp/examples/optim/example_trajectory.py +0 -227
warp/examples/sim/example_cartpole.py +0 -143
warp/examples/sim/example_cloth.py +0 -225
warp/examples/sim/example_cloth_self_contact.py +0 -316
warp/examples/sim/example_granular.py +0 -130
warp/examples/sim/example_granular_collision_sdf.py +0 -202
warp/examples/sim/example_jacobian_ik.py +0 -244
warp/examples/sim/example_particle_chain.py +0 -124
warp/examples/sim/example_quadruped.py +0 -203
warp/examples/sim/example_rigid_chain.py +0 -203
warp/examples/sim/example_rigid_contact.py +0 -195
warp/examples/sim/example_rigid_force.py +0 -133
warp/examples/sim/example_rigid_gyroscopic.py +0 -115
warp/examples/sim/example_rigid_soft_contact.py +0 -140
warp/examples/sim/example_soft_body.py +0 -196
warp/examples/tile/example_tile_walker.py +0 -327
warp/sim/__init__.py +0 -74
warp/sim/articulation.py +0 -793
warp/sim/collide.py +0 -2570
warp/sim/graph_coloring.py +0 -307
warp/sim/import_mjcf.py +0 -791
warp/sim/import_snu.py +0 -227
warp/sim/import_urdf.py +0 -579
warp/sim/import_usd.py +0 -898
warp/sim/inertia.py +0 -357
warp/sim/integrator.py +0 -245
warp/sim/integrator_euler.py +0 -2000
warp/sim/integrator_featherstone.py +0 -2101
warp/sim/integrator_vbd.py +0 -2487
warp/sim/integrator_xpbd.py +0 -3295
warp/sim/model.py +0 -4821
warp/sim/particles.py +0 -121
warp/sim/render.py +0 -431
warp/sim/utils.py +0 -431
warp/tests/sim/disabled_kinematics.py +0 -244
warp/tests/sim/test_cloth.py +0 -863
warp/tests/sim/test_collision.py +0 -743
warp/tests/sim/test_coloring.py +0 -347
warp/tests/sim/test_inertia.py +0 -161
warp/tests/sim/test_model.py +0 -226
warp/tests/sim/test_sim_grad.py +0 -287
warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
warp/tests/sim/test_sim_kinematics.py +0 -98
warp/thirdparty/__init__.py +0 -0
warp_lang-1.9.1.dist-info/RECORD +0 -456
/warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
/warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
/warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
/warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
{warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/WHEEL +0 -0
{warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/top_level.txt +0 -0

warp/native/warp.cu CHANGED

@@ -222,6 +222,14 @@ struct ModuleInfo
     void* module = NULL;
 };
+// Information used when deferring graph destruction.
+struct GraphDestroyInfo
+{
+    void* context = NULL;
+    void* graph = NULL;
+    void* graph_exec = NULL;
+};
 static std::unordered_map<CUfunction, std::string> g_kernel_names;
 // cached info for all devices, indexed by ordinal
@@ -253,6 +261,11 @@ static std::vector<FreeInfo> g_deferred_free_list;
 // Call unload_deferred_modules() to release.
 static std::vector<ModuleInfo> g_deferred_module_list;
+// Graphs that cannot be destroyed immediately get queued here.
+// Call destroy_deferred_graphs() to release.
+static std::vector<GraphDestroyInfo> g_deferred_graph_list;
 void wp_cuda_set_context_restore_policy(bool always_restore)
 {
     ContextGuard::always_restore = always_restore;
@@ -338,7 +351,7 @@ int cuda_init()
 }
-static inline CUcontext get_current_context()
+CUcontext get_current_context()
 {
     CUcontext ctx;
     if (check_cu(cuCtxGetCurrent_f(&ctx)))
@@ -495,6 +508,38 @@ static int unload_deferred_modules(void* context = NULL)
     return num_unloaded_modules;
 }
+static int destroy_deferred_graphs(void* context = NULL)
+{
+    if (g_deferred_graph_list.empty() || !g_captures.empty())
+        return 0;
+    int num_destroyed_graphs = 0;
+    for (auto it = g_deferred_graph_list.begin(); it != g_deferred_graph_list.end(); /*noop*/)
+    {
+        // destroy the graph if it matches the given context or if the context is unspecified
+        const GraphDestroyInfo& graph_info = *it;
+        if (graph_info.context == context || !context)
+        {
+            if (graph_info.graph)
+            {
+                check_cuda(cudaGraphDestroy((cudaGraph_t)graph_info.graph));
+            }
+            if (graph_info.graph_exec)
+            {
+                check_cuda(cudaGraphExecDestroy((cudaGraphExec_t)graph_info.graph_exec));
+            }
+            ++num_destroyed_graphs;
+            it = g_deferred_graph_list.erase(it);
+        }
+        else
+        {
+            ++it;
+        }
+    }
+    return num_destroyed_graphs;
+}
 static void CUDART_CB on_graph_destroy(void* user_data)
 {
     if (!user_data)
@@ -989,15 +1034,15 @@ void wp_memtile_device(void* context, void* dst, const void* src, size_t srcsize
 static __global__ void array_copy_1d_kernel(void* dst, const void* src,
-                                        int dst_stride, int src_stride,
+                                        size_t dst_stride, size_t src_stride,
                                         const int* dst_indices, const int* src_indices,
-                                        int n, int elem_size)
+                                        size_t n, size_t elem_size)
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t i = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (i < n)
     {
-        int src_idx = src_indices ? src_indices[i] : i;
-        int dst_idx = dst_indices ? dst_indices[i] : i;
+        size_t src_idx = src_indices ? src_indices[i] : i;
+        size_t dst_idx = dst_indices ? dst_indices[i] : i;
         const char* p = (const char*)src + src_idx * src_stride;
         char* q = (char*)dst + dst_idx * dst_stride;
         memcpy(q, p, elem_size);
@@ -1005,20 +1050,20 @@ static __global__ void array_copy_1d_kernel(void* dst, const void* src,
 }
 static __global__ void array_copy_2d_kernel(void* dst, const void* src,
-                                        wp::vec_t<2, int> dst_strides, wp::vec_t<2, int> src_strides,
+                                        wp::vec_t<2, size_t> dst_strides, wp::vec_t<2, size_t> src_strides,
                                         wp::vec_t<2, const int*> dst_indices, wp::vec_t<2, const int*> src_indices,
-                                        wp::vec_t<2, int> shape, int elem_size)
+                                        wp::vec_t<2, size_t> shape, size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int i = tid / n;
-    int j = tid % n;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t i = tid / n;
+    size_t j = tid % n;
     if (i < shape[0] /*&& j < shape[1]*/)
     {
-        int src_idx0 = src_indices[0] ? src_indices[0][i] : i;
-        int dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
-        int src_idx1 = src_indices[1] ? src_indices[1][j] : j;
-        int dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
+        size_t src_idx0 = src_indices[0] ? src_indices[0][i] : i;
+        size_t dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
+        size_t src_idx1 = src_indices[1] ? src_indices[1][j] : j;
+        size_t dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
         const char* p = (const char*)src + src_idx0 * src_strides[0] + src_idx1 * src_strides[1];
         char* q = (char*)dst + dst_idx0 * dst_strides[0] + dst_idx1 * dst_strides[1];
         memcpy(q, p, elem_size);
@@ -1026,24 +1071,24 @@ static __global__ void array_copy_2d_kernel(void* dst, const void* src,
 }
 static __global__ void array_copy_3d_kernel(void* dst, const void* src,
-                                        wp::vec_t<3, int> dst_strides, wp::vec_t<3, int> src_strides,
+                                        wp::vec_t<3, size_t> dst_strides, wp::vec_t<3, size_t> src_strides,
                                         wp::vec_t<3, const int*> dst_indices, wp::vec_t<3, const int*> src_indices,
-                                        wp::vec_t<3, int> shape, int elem_size)
-{
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int o = shape[2];
-    int i = tid / (n * o);
-    int j = tid % (n * o) / o;
-    int k = tid % o;
+                                        wp::vec_t<3, size_t> shape, size_t elem_size)
+{
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t o = shape[2];
+    size_t i = tid / (n * o);
+    size_t j = tid % (n * o) / o;
+    size_t k = tid % o;
     if (i < shape[0] && j < shape[1] /*&& k < shape[2]*/)
     {
-        int src_idx0 = src_indices[0] ? src_indices[0][i] : i;
-        int dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
-        int src_idx1 = src_indices[1] ? src_indices[1][j] : j;
-        int dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
-        int src_idx2 = src_indices[2] ? src_indices[2][k] : k;
-        int dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
+        size_t src_idx0 = src_indices[0] ? src_indices[0][i] : i;
+        size_t dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
+        size_t src_idx1 = src_indices[1] ? src_indices[1][j] : j;
+        size_t dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
+        size_t src_idx2 = src_indices[2] ? src_indices[2][k] : k;
+        size_t dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
         const char* p = (const char*)src + src_idx0 * src_strides[0]
                                          + src_idx1 * src_strides[1]
                                          + src_idx2 * src_strides[2];
@@ -1055,28 +1100,28 @@ static __global__ void array_copy_3d_kernel(void* dst, const void* src,
 }
 static __global__ void array_copy_4d_kernel(void* dst, const void* src,
-                                        wp::vec_t<4, int> dst_strides, wp::vec_t<4, int> src_strides,
+                                        wp::vec_t<4, size_t> dst_strides, wp::vec_t<4, size_t> src_strides,
                                         wp::vec_t<4, const int*> dst_indices, wp::vec_t<4, const int*> src_indices,
-                                        wp::vec_t<4, int> shape, int elem_size)
-{
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int o = shape[2];
-    int p = shape[3];
-    int i = tid / (n * o * p);
-    int j = tid % (n * o * p) / (o * p);
-    int k = tid % (o * p) / p;
-    int l = tid % p;
+                                        wp::vec_t<4, size_t> shape, size_t elem_size)
+{
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t o = shape[2];
+    size_t p = shape[3];
+    size_t i = tid / (n * o * p);
+    size_t j = tid % (n * o * p) / (o * p);
+    size_t k = tid % (o * p) / p;
+    size_t l = tid % p;
     if (i < shape[0] && j < shape[1] && k < shape[2] /*&& l < shape[3]*/)
     {
-        int src_idx0 = src_indices[0] ? src_indices[0][i] : i;
-        int dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
-        int src_idx1 = src_indices[1] ? src_indices[1][j] : j;
-        int dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
-        int src_idx2 = src_indices[2] ? src_indices[2][k] : k;
-        int dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
-        int src_idx3 = src_indices[3] ? src_indices[3][l] : l;
-        int dst_idx3 = dst_indices[3] ? dst_indices[3][l] : l;
+        size_t src_idx0 = src_indices[0] ? src_indices[0][i] : i;
+        size_t dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
+        size_t src_idx1 = src_indices[1] ? src_indices[1][j] : j;
+        size_t dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
+        size_t src_idx2 = src_indices[2] ? src_indices[2][k] : k;
+        size_t dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
+        size_t src_idx3 = src_indices[3] ? src_indices[3][l] : l;
+        size_t dst_idx3 = dst_indices[3] ? dst_indices[3][l] : l;
         const char* p = (const char*)src + src_idx0 * src_strides[0]
                                          + src_idx1 * src_strides[1]
                                          + src_idx2 * src_strides[2]
@@ -1091,14 +1136,14 @@ static __global__ void array_copy_4d_kernel(void* dst, const void* src,
 static __global__ void array_copy_from_fabric_kernel(wp::fabricarray_t<void> src,
-                                                     void* dst_data, int dst_stride, const int* dst_indices,
-                                                     int elem_size)
+                                                     void* dst_data, size_t dst_stride, const int* dst_indices,
+                                                     size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < src.size)
     {
-        int dst_idx = dst_indices ? dst_indices[tid] : tid;
+        size_t dst_idx = dst_indices ? dst_indices[tid] : tid;
         void* dst_ptr = (char*)dst_data + dst_idx * dst_stride;
         const void* src_ptr = fabricarray_element_ptr(src, tid, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
@@ -1106,15 +1151,15 @@ static __global__ void array_copy_from_fabric_kernel(wp::fabricarray_t<void> src
 }
 static __global__ void array_copy_from_fabric_indexed_kernel(wp::indexedfabricarray_t<void> src,
-                                                             void* dst_data, int dst_stride, const int* dst_indices,
-                                                             int elem_size)
+                                                             void* dst_data, size_t dst_stride, const int* dst_indices,
+                                                             size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < src.size)
     {
-        int src_index = src.indices[tid];
-        int dst_idx = dst_indices ? dst_indices[tid] : tid;
+        size_t src_index = src.indices[tid];
+        size_t dst_idx = dst_indices ? dst_indices[tid] : tid;
         void* dst_ptr = (char*)dst_data + dst_idx * dst_stride;
         const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
@@ -1122,14 +1167,14 @@ static __global__ void array_copy_from_fabric_indexed_kernel(wp::indexedfabricar
 }
 static __global__ void array_copy_to_fabric_kernel(wp::fabricarray_t<void> dst,
-                                                   const void* src_data, int src_stride, const int* src_indices,
-                                                   int elem_size)
+                                                   const void* src_data, size_t src_stride, const int* src_indices,
+                                                   size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
-        int src_idx = src_indices ? src_indices[tid] : tid;
+        size_t src_idx = src_indices ? src_indices[tid] : tid;
         const void* src_ptr = (const char*)src_data + src_idx * src_stride;
         void* dst_ptr = fabricarray_element_ptr(dst, tid, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
@@ -1137,25 +1182,25 @@ static __global__ void array_copy_to_fabric_kernel(wp::fabricarray_t<void> dst,
 }
 static __global__ void array_copy_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst,
-                                                           const void* src_data, int src_stride, const int* src_indices,
-                                                           int elem_size)
+                                                           const void* src_data, size_t src_stride, const int* src_indices,
+                                                           size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
-        int src_idx = src_indices ? src_indices[tid] : tid;
+        size_t src_idx = src_indices ? src_indices[tid] : tid;
         const void* src_ptr = (const char*)src_data + src_idx * src_stride;
-        int dst_idx = dst.indices[tid];
+        size_t dst_idx = dst.indices[tid];
         void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_idx, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
     }
 }
-static __global__ void array_copy_fabric_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::fabricarray_t<void> src, int elem_size)
+static __global__ void array_copy_fabric_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::fabricarray_t<void> src, size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
@@ -1166,27 +1211,27 @@ static __global__ void array_copy_fabric_to_fabric_kernel(wp::fabricarray_t<void
 }
-static __global__ void array_copy_fabric_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::fabricarray_t<void> src, int elem_size)
+static __global__ void array_copy_fabric_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::fabricarray_t<void> src, size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
         const void* src_ptr = fabricarray_element_ptr(src, tid, elem_size);
-        int dst_index = dst.indices[tid];
+        size_t dst_index = dst.indices[tid];
         void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_index, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
     }
 }
-static __global__ void array_copy_fabric_indexed_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, int elem_size)
+static __global__ void array_copy_fabric_indexed_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
-        int src_index = src.indices[tid];
+        size_t src_index = src.indices[tid];
         const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
         void* dst_ptr = fabricarray_element_ptr(dst, tid, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
@@ -1194,14 +1239,14 @@ static __global__ void array_copy_fabric_indexed_to_fabric_kernel(wp::fabricarra
 }
-static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, int elem_size)
+static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, size_t elem_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < dst.size)
     {
-        int src_index = src.indices[tid];
-        int dst_index = dst.indices[tid];
+        size_t src_index = src.indices[tid];
+        size_t dst_index = dst.indices[tid];
         const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
         void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_index, elem_size);
         memcpy(dst_ptr, src_ptr, elem_size);
@@ -1440,9 +1485,9 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
     }
     case 2:
     {
-        wp::vec_t<2, int> shape_v(src_shape[0], src_shape[1]);
-        wp::vec_t<2, int> src_strides_v(src_strides[0], src_strides[1]);
-        wp::vec_t<2, int> dst_strides_v(dst_strides[0], dst_strides[1]);
+        wp::vec_t<2, size_t> shape_v(src_shape[0], src_shape[1]);
+        wp::vec_t<2, size_t> src_strides_v(src_strides[0], src_strides[1]);
+        wp::vec_t<2, size_t> dst_strides_v(dst_strides[0], dst_strides[1]);
         wp::vec_t<2, const int*> src_indices_v(src_indices[0], src_indices[1]);
         wp::vec_t<2, const int*> dst_indices_v(dst_indices[0], dst_indices[1]);
@@ -1454,9 +1499,9 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
     }
     case 3:
     {
-        wp::vec_t<3, int> shape_v(src_shape[0], src_shape[1], src_shape[2]);
-        wp::vec_t<3, int> src_strides_v(src_strides[0], src_strides[1], src_strides[2]);
-        wp::vec_t<3, int> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2]);
+        wp::vec_t<3, size_t> shape_v(src_shape[0], src_shape[1], src_shape[2]);
+        wp::vec_t<3, size_t> src_strides_v(src_strides[0], src_strides[1], src_strides[2]);
+        wp::vec_t<3, size_t> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2]);
         wp::vec_t<3, const int*> src_indices_v(src_indices[0], src_indices[1], src_indices[2]);
         wp::vec_t<3, const int*> dst_indices_v(dst_indices[0], dst_indices[1], dst_indices[2]);
@@ -1468,9 +1513,9 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
     }
     case 4:
     {
-        wp::vec_t<4, int> shape_v(src_shape[0], src_shape[1], src_shape[2], src_shape[3]);
-        wp::vec_t<4, int> src_strides_v(src_strides[0], src_strides[1], src_strides[2], src_strides[3]);
-        wp::vec_t<4, int> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2], dst_strides[3]);
+        wp::vec_t<4, size_t> shape_v(src_shape[0], src_shape[1], src_shape[2], src_shape[3]);
+        wp::vec_t<4, size_t> src_strides_v(src_strides[0], src_strides[1], src_strides[2], src_strides[3]);
+        wp::vec_t<4, size_t> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2], dst_strides[3]);
         wp::vec_t<4, const int*> src_indices_v(src_indices[0], src_indices[1], src_indices[2], src_indices[3]);
         wp::vec_t<4, const int*> dst_indices_v(dst_indices[0], dst_indices[1], dst_indices[2], dst_indices[3]);
@@ -1490,94 +1535,94 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
 static __global__ void array_fill_1d_kernel(void* data,
-                                            int n,
-                                            int stride,
+                                            size_t n,
+                                            size_t stride,
                                             const int* indices,
                                             const void* value,
-                                            int value_size)
+                                            size_t value_size)
 {
-    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t i = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (i < n)
     {
-        int idx = indices ? indices[i] : i;
+        size_t idx = indices ? indices[i] : i;
         char* p = (char*)data + idx * stride;
         memcpy(p, value, value_size);
     }
 }
 static __global__ void array_fill_2d_kernel(void* data,
-                                            wp::vec_t<2, int> shape,
-                                            wp::vec_t<2, int> strides,
+                                            wp::vec_t<2, size_t> shape,
+                                            wp::vec_t<2, size_t> strides,
                                             wp::vec_t<2, const int*> indices,
                                             const void* value,
-                                            int value_size)
+                                            size_t value_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int i = tid / n;
-    int j = tid % n;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t i = tid / n;
+    size_t j = tid % n;
     if (i < shape[0] /*&& j < shape[1]*/)
     {
-        int idx0 = indices[0] ? indices[0][i] : i;
-        int idx1 = indices[1] ? indices[1][j] : j;
+        size_t idx0 = indices[0] ? indices[0][i] : i;
+        size_t idx1 = indices[1] ? indices[1][j] : j;
         char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1];
         memcpy(p, value, value_size);
     }
 }
 static __global__ void array_fill_3d_kernel(void* data,
-                                            wp::vec_t<3, int> shape,
-                                            wp::vec_t<3, int> strides,
+                                            wp::vec_t<3, size_t> shape,
+                                            wp::vec_t<3, size_t> strides,
                                             wp::vec_t<3, const int*> indices,
                                             const void* value,
-                                            int value_size)
-{
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int o = shape[2];
-    int i = tid / (n * o);
-    int j = tid % (n * o) / o;
-    int k = tid % o;
+                                            size_t value_size)
+{
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t o = shape[2];
+    size_t i = tid / (n * o);
+    size_t j = tid % (n * o) / o;
+    size_t k = tid % o;
     if (i < shape[0] && j < shape[1] /*&& k < shape[2]*/)
     {
-        int idx0 = indices[0] ? indices[0][i] : i;
-        int idx1 = indices[1] ? indices[1][j] : j;
-        int idx2 = indices[2] ? indices[2][k] : k;
+        size_t idx0 = indices[0] ? indices[0][i] : i;
+        size_t idx1 = indices[1] ? indices[1][j] : j;
+        size_t idx2 = indices[2] ? indices[2][k] : k;
         char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1] + idx2 * strides[2];
         memcpy(p, value, value_size);
     }
 }
 static __global__ void array_fill_4d_kernel(void* data,
-                                            wp::vec_t<4, int> shape,
-                                            wp::vec_t<4, int> strides,
+                                            wp::vec_t<4, size_t> shape,
+                                            wp::vec_t<4, size_t> strides,
                                             wp::vec_t<4, const int*> indices,
                                             const void* value,
-                                            int value_size)
-{
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
-    int n = shape[1];
-    int o = shape[2];
-    int p = shape[3];
-    int i = tid / (n * o * p);
-    int j = tid % (n * o * p) / (o * p);
-    int k = tid % (o * p) / p;
-    int l = tid % p;
+                                            size_t value_size)
+{
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
+    size_t n = shape[1];
+    size_t o = shape[2];
+    size_t p = shape[3];
+    size_t i = tid / (n * o * p);
+    size_t j = tid % (n * o * p) / (o * p);
+    size_t k = tid % (o * p) / p;
+    size_t l = tid % p;
     if (i < shape[0] && j < shape[1] && k < shape[2] /*&& l < shape[3]*/)
     {
-        int idx0 = indices[0] ? indices[0][i] : i;
-        int idx1 = indices[1] ? indices[1][j] : j;
-        int idx2 = indices[2] ? indices[2][k] : k;
-        int idx3 = indices[3] ? indices[3][l] : l;
+        size_t idx0 = indices[0] ? indices[0][i] : i;
+        size_t idx1 = indices[1] ? indices[1][j] : j;
+        size_t idx2 = indices[2] ? indices[2][k] : k;
+        size_t idx3 = indices[3] ? indices[3][l] : l;
         char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1] + idx2 * strides[2] + idx3 * strides[3];
         memcpy(p, value, value_size);
     }
 }
-static __global__ void array_fill_fabric_kernel(wp::fabricarray_t<void> fa, const void* value, int value_size)
+static __global__ void array_fill_fabric_kernel(wp::fabricarray_t<void> fa, const void* value, size_t value_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < fa.size)
     {
         void* dst_ptr = fabricarray_element_ptr(fa, tid, value_size);
@@ -1586,9 +1631,9 @@ static __global__ void array_fill_fabric_kernel(wp::fabricarray_t<void> fa, cons
 }
-static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t<void> ifa, const void* value, int value_size)
+static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t<void> ifa, const void* value, size_t value_size)
 {
-    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
     if (tid < ifa.size)
     {
         size_t idx = size_t(ifa.indices[tid]);
@@ -1685,8 +1730,8 @@ WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, con
     }
     case 2:
     {
-        wp::vec_t<2, int> shape_v(shape[0], shape[1]);
-        wp::vec_t<2, int> strides_v(strides[0], strides[1]);
+        wp::vec_t<2, size_t> shape_v(shape[0], shape[1]);
+        wp::vec_t<2, size_t> strides_v(strides[0], strides[1]);
         wp::vec_t<2, const int*> indices_v(indices[0], indices[1]);
         wp_launch_device(WP_CURRENT_CONTEXT, array_fill_2d_kernel, n,
                          (data, shape_v, strides_v, indices_v, value_devptr, value_size));
@@ -1694,8 +1739,8 @@ WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, con
     }
     case 3:
     {
-        wp::vec_t<3, int> shape_v(shape[0], shape[1], shape[2]);
-        wp::vec_t<3, int> strides_v(strides[0], strides[1], strides[2]);
+        wp::vec_t<3, size_t> shape_v(shape[0], shape[1], shape[2]);
+        wp::vec_t<3, size_t> strides_v(strides[0], strides[1], strides[2]);
         wp::vec_t<3, const int*> indices_v(indices[0], indices[1], indices[2]);
         wp_launch_device(WP_CURRENT_CONTEXT, array_fill_3d_kernel, n,
                          (data, shape_v, strides_v, indices_v, value_devptr, value_size));
@@ -1703,8 +1748,8 @@ WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, con
     }
     case 4:
     {
-        wp::vec_t<4, int> shape_v(shape[0], shape[1], shape[2], shape[3]);
-        wp::vec_t<4, int> strides_v(strides[0], strides[1], strides[2], strides[3]);
+        wp::vec_t<4, size_t> shape_v(shape[0], shape[1], shape[2], shape[3]);
+        wp::vec_t<4, size_t> strides_v(strides[0], strides[1], strides[2], strides[3]);
         wp::vec_t<4, const int*> indices_v(indices[0], indices[1], indices[2], indices[3]);
         wp_launch_device(WP_CURRENT_CONTEXT, array_fill_4d_kernel, n,
                          (data, shape_v, strides_v, indices_v, value_devptr, value_size));
@@ -2072,13 +2117,17 @@ void wp_cuda_context_synchronize(void* context)
     check_cu(cuCtxSynchronize_f());
-    if (free_deferred_allocs(context ? context : get_current_context()) > 0)
+    if (!context)
+        context = get_current_context();
+    if (free_deferred_allocs(context) > 0)
     {
         // ensure deferred asynchronous deallocations complete
         check_cu(cuCtxSynchronize_f());
     }
     unload_deferred_modules(context);
+    destroy_deferred_graphs(context);
     // check_cuda(cudaDeviceGraphMemTrim(wp_cuda_context_get_device_ordinal(context)));
 }
@@ -2514,15 +2563,36 @@ void wp_cuda_stream_synchronize(void* stream)
     check_cu(cuStreamSynchronize_f(static_cast<CUstream>(stream)));
 }
-void wp_cuda_stream_wait_event(void* stream, void* event)
+void wp_cuda_stream_wait_event(void* stream, void* event, bool external)
 {
-    check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
+    // the external flag can only be used during graph capture
+    if (external && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
+    {
+        // wait for an external event during graph capture
+        check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), CU_EVENT_WAIT_EXTERNAL));
+    }
+    else
+    {
+        check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), CU_EVENT_WAIT_DEFAULT));
+    }
 }
-void wp_cuda_stream_wait_stream(void* stream, void* other_stream, void* event)
+void wp_cuda_stream_wait_stream(void* stream, void* other_stream, void* event, bool external)
 {
-    check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(other_stream)));
-    check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
+    unsigned record_flags = CU_EVENT_RECORD_DEFAULT;
+    unsigned wait_flags = CU_EVENT_WAIT_DEFAULT;
+    // the external flag can only be used during graph capture
+    if (external && !g_captures.empty())
+    {
+        if (wp_cuda_stream_is_capturing(other_stream))
+            record_flags = CU_EVENT_RECORD_EXTERNAL;
+        if (wp_cuda_stream_is_capturing(stream))
+            wait_flags = CU_EVENT_WAIT_EXTERNAL;
+    }
+    check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(other_stream), record_flags));
+    check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), wait_flags));
 }
 int wp_cuda_stream_is_capturing(void* stream)
@@ -2575,11 +2645,12 @@ int wp_cuda_event_query(void* event)
     return res;
 }
-void wp_cuda_event_record(void* event, void* stream, bool timing)
+void wp_cuda_event_record(void* event, void* stream, bool external)
 {
-    if (timing && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
+    // the external flag can only be used during graph capture
+    if (external && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
     {
-        // record timing event during graph capture
+        // record external event during graph capture (e.g., for timing or when explicitly specified by the user)
         check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(stream), CU_EVENT_RECORD_EXTERNAL));
     }
     else
@@ -2629,7 +2700,7 @@ bool wp_cuda_graph_begin_capture(void* context, void* stream, int external)
     else
     {
         // start the capture
-        if (!check_cuda(cudaStreamBeginCapture(cuda_stream, cudaStreamCaptureModeGlobal)))
+        if (!check_cuda(cudaStreamBeginCapture(cuda_stream, cudaStreamCaptureModeThreadLocal)))
             return false;
     }
@@ -2776,6 +2847,7 @@ bool wp_cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
     {
         free_deferred_allocs();
         unload_deferred_modules();
+        destroy_deferred_graphs();
     }
     if (graph_ret)
@@ -2996,7 +3068,7 @@ bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
                                   leaf_nodes.data(),
                                   nullptr,
                                   leaf_nodes.size(),
-                                  cudaStreamCaptureModeGlobal)))
+                                  cudaStreamCaptureModeThreadLocal)))
         return false;
     return true;
@@ -3455,16 +3527,38 @@ bool wp_cuda_graph_launch(void* graph_exec, void* stream)
 bool wp_cuda_graph_destroy(void* context, void* graph)
 {
-    ContextGuard guard(context);
-    return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
+    // ensure there are no graph captures in progress
+    if (g_captures.empty())
+    {
+        ContextGuard guard(context);
+        return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
+    }
+    else
+    {
+        GraphDestroyInfo info;
+        info.context = context ? context : get_current_context();
+        info.graph = graph;
+        g_deferred_graph_list.push_back(info);
+        return true;
+    }
 }
 bool wp_cuda_graph_exec_destroy(void* context, void* graph_exec)
 {
-    ContextGuard guard(context);
-    return check_cuda(cudaGraphExecDestroy((cudaGraphExec_t)graph_exec));
+    // ensure there are no graph captures in progress
+    if (g_captures.empty())
+    {
+        ContextGuard guard(context);
+        return check_cuda(cudaGraphExecDestroy((cudaGraphExec_t)graph_exec));
+    }
+    else
+    {
+        GraphDestroyInfo info;
+        info.context = context ? context : get_current_context();
+        info.graph_exec = graph_exec;
+        g_deferred_graph_list.push_back(info);
+        return true;
+    }
 }
 bool write_file(const char* data, size_t size, std::string filename, const char* mode)
@@ -4317,17 +4411,5 @@ void wp_cuda_timing_end(timing_result_t* results, int size)
     g_cuda_timing_state = parent_state;
 }
-// impl. files
-#include "bvh.cu"
-#include "mesh.cu"
-#include "sort.cu"
-#include "hashgrid.cu"
-#include "reduce.cu"
-#include "runlength_encode.cu"
-#include "scan.cu"
-#include "sparse.cu"
-#include "volume.cu"
-#include "volume_builder.cu"
 //#include "spline.inl"
 //#include "volume.inl"