warp-lang 1.9.0__py3-none-win_amd64.whl → 1.10.0rc2__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic; see the registry's advisory page for more details.

Files changed (350)
  1. warp/__init__.py +301 -287
  2. warp/__init__.pyi +2220 -313
  3. warp/_src/__init__.py +14 -0
  4. warp/_src/autograd.py +1075 -0
  5. warp/_src/build.py +618 -0
  6. warp/_src/build_dll.py +640 -0
  7. warp/{builtins.py → _src/builtins.py} +1497 -226
  8. warp/_src/codegen.py +4359 -0
  9. warp/{config.py → _src/config.py} +178 -169
  10. warp/_src/constants.py +57 -0
  11. warp/_src/context.py +8294 -0
  12. warp/_src/dlpack.py +462 -0
  13. warp/_src/fabric.py +355 -0
  14. warp/_src/fem/__init__.py +14 -0
  15. warp/_src/fem/adaptivity.py +508 -0
  16. warp/_src/fem/cache.py +687 -0
  17. warp/_src/fem/dirichlet.py +188 -0
  18. warp/{fem → _src/fem}/domain.py +40 -30
  19. warp/_src/fem/field/__init__.py +131 -0
  20. warp/_src/fem/field/field.py +701 -0
  21. warp/{fem → _src/fem}/field/nodal_field.py +30 -15
  22. warp/{fem → _src/fem}/field/restriction.py +1 -1
  23. warp/{fem → _src/fem}/field/virtual.py +53 -27
  24. warp/_src/fem/geometry/__init__.py +32 -0
  25. warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +77 -163
  26. warp/_src/fem/geometry/closest_point.py +97 -0
  27. warp/{fem → _src/fem}/geometry/deformed_geometry.py +14 -22
  28. warp/{fem → _src/fem}/geometry/element.py +32 -10
  29. warp/{fem → _src/fem}/geometry/geometry.py +48 -20
  30. warp/{fem → _src/fem}/geometry/grid_2d.py +12 -23
  31. warp/{fem → _src/fem}/geometry/grid_3d.py +12 -23
  32. warp/{fem → _src/fem}/geometry/hexmesh.py +40 -63
  33. warp/{fem → _src/fem}/geometry/nanogrid.py +255 -248
  34. warp/{fem → _src/fem}/geometry/partition.py +121 -63
  35. warp/{fem → _src/fem}/geometry/quadmesh.py +26 -45
  36. warp/{fem → _src/fem}/geometry/tetmesh.py +40 -63
  37. warp/{fem → _src/fem}/geometry/trimesh.py +26 -45
  38. warp/{fem → _src/fem}/integrate.py +164 -158
  39. warp/_src/fem/linalg.py +383 -0
  40. warp/_src/fem/operator.py +396 -0
  41. warp/_src/fem/polynomial.py +229 -0
  42. warp/{fem → _src/fem}/quadrature/pic_quadrature.py +15 -20
  43. warp/{fem → _src/fem}/quadrature/quadrature.py +95 -47
  44. warp/_src/fem/space/__init__.py +248 -0
  45. warp/{fem → _src/fem}/space/basis_function_space.py +20 -11
  46. warp/_src/fem/space/basis_space.py +679 -0
  47. warp/{fem → _src/fem}/space/dof_mapper.py +3 -3
  48. warp/{fem → _src/fem}/space/function_space.py +14 -13
  49. warp/{fem → _src/fem}/space/grid_2d_function_space.py +4 -7
  50. warp/{fem → _src/fem}/space/grid_3d_function_space.py +4 -4
  51. warp/{fem → _src/fem}/space/hexmesh_function_space.py +4 -10
  52. warp/{fem → _src/fem}/space/nanogrid_function_space.py +3 -9
  53. warp/{fem → _src/fem}/space/partition.py +117 -60
  54. warp/{fem → _src/fem}/space/quadmesh_function_space.py +4 -10
  55. warp/{fem → _src/fem}/space/restriction.py +66 -33
  56. warp/_src/fem/space/shape/__init__.py +152 -0
  57. warp/{fem → _src/fem}/space/shape/cube_shape_function.py +9 -9
  58. warp/{fem → _src/fem}/space/shape/shape_function.py +8 -9
  59. warp/{fem → _src/fem}/space/shape/square_shape_function.py +6 -6
  60. warp/{fem → _src/fem}/space/shape/tet_shape_function.py +3 -3
  61. warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +3 -3
  62. warp/{fem → _src/fem}/space/tetmesh_function_space.py +3 -9
  63. warp/_src/fem/space/topology.py +459 -0
  64. warp/{fem → _src/fem}/space/trimesh_function_space.py +3 -9
  65. warp/_src/fem/types.py +112 -0
  66. warp/_src/fem/utils.py +486 -0
  67. warp/_src/jax.py +186 -0
  68. warp/_src/jax_experimental/__init__.py +14 -0
  69. warp/_src/jax_experimental/custom_call.py +387 -0
  70. warp/_src/jax_experimental/ffi.py +1284 -0
  71. warp/_src/jax_experimental/xla_ffi.py +656 -0
  72. warp/_src/marching_cubes.py +708 -0
  73. warp/_src/math.py +414 -0
  74. warp/_src/optim/__init__.py +14 -0
  75. warp/_src/optim/adam.py +163 -0
  76. warp/_src/optim/linear.py +1606 -0
  77. warp/_src/optim/sgd.py +112 -0
  78. warp/_src/paddle.py +406 -0
  79. warp/_src/render/__init__.py +14 -0
  80. warp/_src/render/imgui_manager.py +289 -0
  81. warp/_src/render/render_opengl.py +3636 -0
  82. warp/_src/render/render_usd.py +937 -0
  83. warp/_src/render/utils.py +160 -0
  84. warp/_src/sparse.py +2716 -0
  85. warp/_src/tape.py +1206 -0
  86. warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
  87. warp/_src/torch.py +391 -0
  88. warp/_src/types.py +5870 -0
  89. warp/_src/utils.py +1693 -0
  90. warp/autograd.py +12 -1054
  91. warp/bin/warp-clang.dll +0 -0
  92. warp/bin/warp.dll +0 -0
  93. warp/build.py +8 -588
  94. warp/build_dll.py +6 -471
  95. warp/codegen.py +6 -4246
  96. warp/constants.py +6 -39
  97. warp/context.py +12 -7851
  98. warp/dlpack.py +6 -444
  99. warp/examples/distributed/example_jacobi_mpi.py +4 -5
  100. warp/examples/fem/example_adaptive_grid.py +1 -1
  101. warp/examples/fem/example_apic_fluid.py +1 -1
  102. warp/examples/fem/example_burgers.py +8 -8
  103. warp/examples/fem/example_diffusion.py +1 -1
  104. warp/examples/fem/example_distortion_energy.py +1 -1
  105. warp/examples/fem/example_mixed_elasticity.py +2 -2
  106. warp/examples/fem/example_navier_stokes.py +1 -1
  107. warp/examples/fem/example_nonconforming_contact.py +7 -7
  108. warp/examples/fem/example_stokes.py +1 -1
  109. warp/examples/fem/example_stokes_transfer.py +1 -1
  110. warp/examples/fem/utils.py +2 -2
  111. warp/examples/interop/example_jax_callable.py +1 -1
  112. warp/examples/interop/example_jax_ffi_callback.py +1 -1
  113. warp/examples/interop/example_jax_kernel.py +3 -2
  114. warp/examples/tile/example_tile_mcgp.py +191 -0
  115. warp/fabric.py +6 -337
  116. warp/fem/__init__.py +159 -97
  117. warp/fem/adaptivity.py +7 -489
  118. warp/fem/cache.py +9 -648
  119. warp/fem/dirichlet.py +6 -184
  120. warp/fem/field/__init__.py +8 -109
  121. warp/fem/field/field.py +7 -652
  122. warp/fem/geometry/__init__.py +7 -18
  123. warp/fem/geometry/closest_point.py +11 -77
  124. warp/fem/linalg.py +18 -366
  125. warp/fem/operator.py +11 -369
  126. warp/fem/polynomial.py +9 -209
  127. warp/fem/space/__init__.py +5 -211
  128. warp/fem/space/basis_space.py +6 -662
  129. warp/fem/space/shape/__init__.py +41 -118
  130. warp/fem/space/topology.py +6 -437
  131. warp/fem/types.py +6 -81
  132. warp/fem/utils.py +11 -444
  133. warp/jax.py +8 -165
  134. warp/jax_experimental/__init__.py +14 -1
  135. warp/jax_experimental/custom_call.py +8 -342
  136. warp/jax_experimental/ffi.py +17 -853
  137. warp/jax_experimental/xla_ffi.py +5 -596
  138. warp/marching_cubes.py +5 -689
  139. warp/math.py +16 -393
  140. warp/native/array.h +385 -37
  141. warp/native/builtin.h +316 -39
  142. warp/native/bvh.cpp +43 -9
  143. warp/native/bvh.cu +62 -27
  144. warp/native/bvh.h +310 -309
  145. warp/native/clang/clang.cpp +102 -97
  146. warp/native/coloring.cpp +0 -1
  147. warp/native/crt.h +208 -0
  148. warp/native/exports.h +156 -0
  149. warp/native/hashgrid.cu +2 -0
  150. warp/native/intersect.h +24 -1
  151. warp/native/intersect_tri.h +44 -35
  152. warp/native/mat.h +1456 -276
  153. warp/native/mesh.cpp +4 -4
  154. warp/native/mesh.cu +4 -2
  155. warp/native/mesh.h +176 -61
  156. warp/native/quat.h +0 -52
  157. warp/native/scan.cu +2 -0
  158. warp/native/sort.cu +22 -13
  159. warp/native/sort.h +2 -0
  160. warp/native/sparse.cu +7 -3
  161. warp/native/spatial.h +12 -0
  162. warp/native/tile.h +837 -70
  163. warp/native/tile_radix_sort.h +1 -1
  164. warp/native/tile_reduce.h +394 -46
  165. warp/native/tile_scan.h +4 -4
  166. warp/native/vec.h +469 -53
  167. warp/native/version.h +23 -0
  168. warp/native/volume.cpp +1 -1
  169. warp/native/volume.cu +1 -0
  170. warp/native/volume.h +1 -1
  171. warp/native/volume_builder.cu +2 -0
  172. warp/native/warp.cpp +60 -32
  173. warp/native/warp.cu +313 -201
  174. warp/native/warp.h +14 -11
  175. warp/optim/__init__.py +6 -3
  176. warp/optim/adam.py +6 -145
  177. warp/optim/linear.py +14 -1585
  178. warp/optim/sgd.py +6 -94
  179. warp/paddle.py +6 -388
  180. warp/render/__init__.py +8 -4
  181. warp/render/imgui_manager.py +7 -267
  182. warp/render/render_opengl.py +6 -3616
  183. warp/render/render_usd.py +6 -918
  184. warp/render/utils.py +6 -142
  185. warp/sparse.py +37 -2563
  186. warp/tape.py +6 -1188
  187. warp/tests/__main__.py +1 -1
  188. warp/tests/cuda/test_async.py +4 -4
  189. warp/tests/cuda/test_conditional_captures.py +1 -1
  190. warp/tests/cuda/test_multigpu.py +1 -1
  191. warp/tests/cuda/test_streams.py +58 -1
  192. warp/tests/geometry/test_bvh.py +157 -22
  193. warp/tests/geometry/test_hash_grid.py +38 -0
  194. warp/tests/geometry/test_marching_cubes.py +0 -1
  195. warp/tests/geometry/test_mesh.py +5 -3
  196. warp/tests/geometry/test_mesh_query_aabb.py +5 -12
  197. warp/tests/geometry/test_mesh_query_point.py +5 -2
  198. warp/tests/geometry/test_mesh_query_ray.py +15 -3
  199. warp/tests/geometry/test_volume_write.py +5 -5
  200. warp/tests/interop/test_dlpack.py +14 -14
  201. warp/tests/interop/test_jax.py +1382 -79
  202. warp/tests/interop/test_paddle.py +1 -1
  203. warp/tests/test_adam.py +0 -1
  204. warp/tests/test_arithmetic.py +9 -9
  205. warp/tests/test_array.py +529 -100
  206. warp/tests/test_array_reduce.py +3 -3
  207. warp/tests/test_atomic.py +12 -8
  208. warp/tests/test_atomic_bitwise.py +209 -0
  209. warp/tests/test_atomic_cas.py +4 -4
  210. warp/tests/test_bool.py +2 -2
  211. warp/tests/test_builtins_resolution.py +5 -571
  212. warp/tests/test_codegen.py +34 -15
  213. warp/tests/test_conditional.py +1 -1
  214. warp/tests/test_context.py +6 -6
  215. warp/tests/test_copy.py +242 -161
  216. warp/tests/test_ctypes.py +3 -3
  217. warp/tests/test_devices.py +24 -2
  218. warp/tests/test_examples.py +16 -84
  219. warp/tests/test_fabricarray.py +35 -35
  220. warp/tests/test_fast_math.py +0 -2
  221. warp/tests/test_fem.py +60 -14
  222. warp/tests/test_fixedarray.py +3 -3
  223. warp/tests/test_func.py +8 -5
  224. warp/tests/test_generics.py +1 -1
  225. warp/tests/test_indexedarray.py +24 -24
  226. warp/tests/test_intersect.py +39 -9
  227. warp/tests/test_large.py +1 -1
  228. warp/tests/test_lerp.py +3 -1
  229. warp/tests/test_linear_solvers.py +1 -1
  230. warp/tests/test_map.py +49 -4
  231. warp/tests/test_mat.py +52 -62
  232. warp/tests/test_mat_constructors.py +4 -5
  233. warp/tests/test_mat_lite.py +1 -1
  234. warp/tests/test_mat_scalar_ops.py +121 -121
  235. warp/tests/test_math.py +34 -0
  236. warp/tests/test_module_aot.py +4 -4
  237. warp/tests/test_modules_lite.py +28 -2
  238. warp/tests/test_print.py +11 -11
  239. warp/tests/test_quat.py +93 -58
  240. warp/tests/test_runlength_encode.py +1 -1
  241. warp/tests/test_scalar_ops.py +38 -10
  242. warp/tests/test_smoothstep.py +1 -1
  243. warp/tests/test_sparse.py +126 -15
  244. warp/tests/test_spatial.py +105 -87
  245. warp/tests/test_special_values.py +6 -6
  246. warp/tests/test_static.py +7 -7
  247. warp/tests/test_struct.py +13 -2
  248. warp/tests/test_triangle_closest_point.py +48 -1
  249. warp/tests/test_tuple.py +96 -0
  250. warp/tests/test_types.py +82 -9
  251. warp/tests/test_utils.py +52 -52
  252. warp/tests/test_vec.py +29 -29
  253. warp/tests/test_vec_constructors.py +5 -5
  254. warp/tests/test_vec_scalar_ops.py +97 -97
  255. warp/tests/test_version.py +75 -0
  256. warp/tests/tile/test_tile.py +239 -0
  257. warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
  258. warp/tests/tile/test_tile_cholesky.py +7 -4
  259. warp/tests/tile/test_tile_load.py +26 -2
  260. warp/tests/tile/test_tile_mathdx.py +3 -3
  261. warp/tests/tile/test_tile_matmul.py +1 -1
  262. warp/tests/tile/test_tile_mlp.py +2 -4
  263. warp/tests/tile/test_tile_reduce.py +214 -13
  264. warp/tests/unittest_suites.py +6 -14
  265. warp/tests/unittest_utils.py +10 -9
  266. warp/tests/walkthrough_debug.py +3 -1
  267. warp/torch.py +6 -373
  268. warp/types.py +29 -5750
  269. warp/utils.py +10 -1659
  270. {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/METADATA +47 -103
  271. warp_lang-1.10.0rc2.dist-info/RECORD +468 -0
  272. warp_lang-1.10.0rc2.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
  273. warp_lang-1.10.0rc2.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
  274. warp_lang-1.10.0rc2.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
  275. warp_lang-1.10.0rc2.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
  276. warp_lang-1.10.0rc2.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
  277. warp_lang-1.10.0rc2.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
  278. warp_lang-1.10.0rc2.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
  279. warp_lang-1.10.0rc2.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
  280. warp_lang-1.10.0rc2.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
  281. warp_lang-1.10.0rc2.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
  282. warp_lang-1.10.0rc2.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
  283. warp_lang-1.10.0rc2.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
  284. warp_lang-1.10.0rc2.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
  285. warp_lang-1.10.0rc2.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
  286. warp_lang-1.10.0rc2.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
  287. warp/examples/assets/cartpole.urdf +0 -110
  288. warp/examples/assets/crazyflie.usd +0 -0
  289. warp/examples/assets/nv_ant.xml +0 -92
  290. warp/examples/assets/nv_humanoid.xml +0 -183
  291. warp/examples/assets/quadruped.urdf +0 -268
  292. warp/examples/optim/example_bounce.py +0 -266
  293. warp/examples/optim/example_cloth_throw.py +0 -228
  294. warp/examples/optim/example_drone.py +0 -870
  295. warp/examples/optim/example_inverse_kinematics.py +0 -182
  296. warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
  297. warp/examples/optim/example_softbody_properties.py +0 -400
  298. warp/examples/optim/example_spring_cage.py +0 -245
  299. warp/examples/optim/example_trajectory.py +0 -227
  300. warp/examples/sim/example_cartpole.py +0 -143
  301. warp/examples/sim/example_cloth.py +0 -225
  302. warp/examples/sim/example_cloth_self_contact.py +0 -316
  303. warp/examples/sim/example_granular.py +0 -130
  304. warp/examples/sim/example_granular_collision_sdf.py +0 -202
  305. warp/examples/sim/example_jacobian_ik.py +0 -244
  306. warp/examples/sim/example_particle_chain.py +0 -124
  307. warp/examples/sim/example_quadruped.py +0 -203
  308. warp/examples/sim/example_rigid_chain.py +0 -203
  309. warp/examples/sim/example_rigid_contact.py +0 -195
  310. warp/examples/sim/example_rigid_force.py +0 -133
  311. warp/examples/sim/example_rigid_gyroscopic.py +0 -115
  312. warp/examples/sim/example_rigid_soft_contact.py +0 -140
  313. warp/examples/sim/example_soft_body.py +0 -196
  314. warp/examples/tile/example_tile_walker.py +0 -327
  315. warp/sim/__init__.py +0 -74
  316. warp/sim/articulation.py +0 -793
  317. warp/sim/collide.py +0 -2570
  318. warp/sim/graph_coloring.py +0 -307
  319. warp/sim/import_mjcf.py +0 -791
  320. warp/sim/import_snu.py +0 -227
  321. warp/sim/import_urdf.py +0 -579
  322. warp/sim/import_usd.py +0 -898
  323. warp/sim/inertia.py +0 -357
  324. warp/sim/integrator.py +0 -245
  325. warp/sim/integrator_euler.py +0 -2000
  326. warp/sim/integrator_featherstone.py +0 -2101
  327. warp/sim/integrator_vbd.py +0 -2487
  328. warp/sim/integrator_xpbd.py +0 -3295
  329. warp/sim/model.py +0 -4821
  330. warp/sim/particles.py +0 -121
  331. warp/sim/render.py +0 -431
  332. warp/sim/utils.py +0 -431
  333. warp/tests/sim/disabled_kinematics.py +0 -244
  334. warp/tests/sim/test_cloth.py +0 -863
  335. warp/tests/sim/test_collision.py +0 -743
  336. warp/tests/sim/test_coloring.py +0 -347
  337. warp/tests/sim/test_inertia.py +0 -161
  338. warp/tests/sim/test_model.py +0 -226
  339. warp/tests/sim/test_sim_grad.py +0 -287
  340. warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
  341. warp/tests/sim/test_sim_kinematics.py +0 -98
  342. warp/thirdparty/__init__.py +0 -0
  343. warp_lang-1.9.0.dist-info/RECORD +0 -456
  344. /warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
  345. /warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
  346. /warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
  347. /warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
  348. {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/WHEEL +0 -0
  349. {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/licenses/LICENSE.md +0 -0
  350. {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/top_level.txt +0 -0
warp/native/warp.cu CHANGED
@@ -19,6 +19,7 @@
19
19
  #include "scan.h"
20
20
  #include "cuda_util.h"
21
21
  #include "error.h"
22
+ #include "sort.h"
22
23
 
23
24
  #include <cstdlib>
24
25
  #include <fstream>
@@ -221,6 +222,14 @@ struct ModuleInfo
221
222
  void* module = NULL;
222
223
  };
223
224
 
225
+ // Information used when deferring graph destruction.
226
+ struct GraphDestroyInfo
227
+ {
228
+ void* context = NULL;
229
+ void* graph = NULL;
230
+ void* graph_exec = NULL;
231
+ };
232
+
224
233
  static std::unordered_map<CUfunction, std::string> g_kernel_names;
225
234
 
226
235
  // cached info for all devices, indexed by ordinal
@@ -252,6 +261,11 @@ static std::vector<FreeInfo> g_deferred_free_list;
252
261
  // Call unload_deferred_modules() to release.
253
262
  static std::vector<ModuleInfo> g_deferred_module_list;
254
263
 
264
+ // Graphs that cannot be destroyed immediately get queued here.
265
+ // Call destroy_deferred_graphs() to release.
266
+ static std::vector<GraphDestroyInfo> g_deferred_graph_list;
267
+
268
+
255
269
  void wp_cuda_set_context_restore_policy(bool always_restore)
256
270
  {
257
271
  ContextGuard::always_restore = always_restore;
@@ -337,7 +351,7 @@ int cuda_init()
337
351
  }
338
352
 
339
353
 
340
- static inline CUcontext get_current_context()
354
+ CUcontext get_current_context()
341
355
  {
342
356
  CUcontext ctx;
343
357
  if (check_cu(cuCtxGetCurrent_f(&ctx)))
@@ -494,6 +508,38 @@ static int unload_deferred_modules(void* context = NULL)
494
508
  return num_unloaded_modules;
495
509
  }
496
510
 
511
+ static int destroy_deferred_graphs(void* context = NULL)
512
+ {
513
+ if (g_deferred_graph_list.empty() || !g_captures.empty())
514
+ return 0;
515
+
516
+ int num_destroyed_graphs = 0;
517
+ for (auto it = g_deferred_graph_list.begin(); it != g_deferred_graph_list.end(); /*noop*/)
518
+ {
519
+ // destroy the graph if it matches the given context or if the context is unspecified
520
+ const GraphDestroyInfo& graph_info = *it;
521
+ if (graph_info.context == context || !context)
522
+ {
523
+ if (graph_info.graph)
524
+ {
525
+ check_cuda(cudaGraphDestroy((cudaGraph_t)graph_info.graph));
526
+ }
527
+ if (graph_info.graph_exec)
528
+ {
529
+ check_cuda(cudaGraphExecDestroy((cudaGraphExec_t)graph_info.graph_exec));
530
+ }
531
+ ++num_destroyed_graphs;
532
+ it = g_deferred_graph_list.erase(it);
533
+ }
534
+ else
535
+ {
536
+ ++it;
537
+ }
538
+ }
539
+
540
+ return num_destroyed_graphs;
541
+ }
542
+
497
543
  static void CUDART_CB on_graph_destroy(void* user_data)
498
544
  {
499
545
  if (!user_data)
@@ -988,15 +1034,15 @@ void wp_memtile_device(void* context, void* dst, const void* src, size_t srcsize
988
1034
 
989
1035
 
990
1036
  static __global__ void array_copy_1d_kernel(void* dst, const void* src,
991
- int dst_stride, int src_stride,
1037
+ size_t dst_stride, size_t src_stride,
992
1038
  const int* dst_indices, const int* src_indices,
993
- int n, int elem_size)
1039
+ size_t n, size_t elem_size)
994
1040
  {
995
- int i = blockIdx.x * blockDim.x + threadIdx.x;
1041
+ size_t i = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
996
1042
  if (i < n)
997
1043
  {
998
- int src_idx = src_indices ? src_indices[i] : i;
999
- int dst_idx = dst_indices ? dst_indices[i] : i;
1044
+ size_t src_idx = src_indices ? src_indices[i] : i;
1045
+ size_t dst_idx = dst_indices ? dst_indices[i] : i;
1000
1046
  const char* p = (const char*)src + src_idx * src_stride;
1001
1047
  char* q = (char*)dst + dst_idx * dst_stride;
1002
1048
  memcpy(q, p, elem_size);
@@ -1004,20 +1050,20 @@ static __global__ void array_copy_1d_kernel(void* dst, const void* src,
1004
1050
  }
1005
1051
 
1006
1052
  static __global__ void array_copy_2d_kernel(void* dst, const void* src,
1007
- wp::vec_t<2, int> dst_strides, wp::vec_t<2, int> src_strides,
1053
+ wp::vec_t<2, size_t> dst_strides, wp::vec_t<2, size_t> src_strides,
1008
1054
  wp::vec_t<2, const int*> dst_indices, wp::vec_t<2, const int*> src_indices,
1009
- wp::vec_t<2, int> shape, int elem_size)
1055
+ wp::vec_t<2, size_t> shape, size_t elem_size)
1010
1056
  {
1011
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1012
- int n = shape[1];
1013
- int i = tid / n;
1014
- int j = tid % n;
1057
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1058
+ size_t n = shape[1];
1059
+ size_t i = tid / n;
1060
+ size_t j = tid % n;
1015
1061
  if (i < shape[0] /*&& j < shape[1]*/)
1016
1062
  {
1017
- int src_idx0 = src_indices[0] ? src_indices[0][i] : i;
1018
- int dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
1019
- int src_idx1 = src_indices[1] ? src_indices[1][j] : j;
1020
- int dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
1063
+ size_t src_idx0 = src_indices[0] ? src_indices[0][i] : i;
1064
+ size_t dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
1065
+ size_t src_idx1 = src_indices[1] ? src_indices[1][j] : j;
1066
+ size_t dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
1021
1067
  const char* p = (const char*)src + src_idx0 * src_strides[0] + src_idx1 * src_strides[1];
1022
1068
  char* q = (char*)dst + dst_idx0 * dst_strides[0] + dst_idx1 * dst_strides[1];
1023
1069
  memcpy(q, p, elem_size);
@@ -1025,24 +1071,24 @@ static __global__ void array_copy_2d_kernel(void* dst, const void* src,
1025
1071
  }
1026
1072
 
1027
1073
  static __global__ void array_copy_3d_kernel(void* dst, const void* src,
1028
- wp::vec_t<3, int> dst_strides, wp::vec_t<3, int> src_strides,
1074
+ wp::vec_t<3, size_t> dst_strides, wp::vec_t<3, size_t> src_strides,
1029
1075
  wp::vec_t<3, const int*> dst_indices, wp::vec_t<3, const int*> src_indices,
1030
- wp::vec_t<3, int> shape, int elem_size)
1031
- {
1032
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1033
- int n = shape[1];
1034
- int o = shape[2];
1035
- int i = tid / (n * o);
1036
- int j = tid % (n * o) / o;
1037
- int k = tid % o;
1076
+ wp::vec_t<3, size_t> shape, size_t elem_size)
1077
+ {
1078
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1079
+ size_t n = shape[1];
1080
+ size_t o = shape[2];
1081
+ size_t i = tid / (n * o);
1082
+ size_t j = tid % (n * o) / o;
1083
+ size_t k = tid % o;
1038
1084
  if (i < shape[0] && j < shape[1] /*&& k < shape[2]*/)
1039
1085
  {
1040
- int src_idx0 = src_indices[0] ? src_indices[0][i] : i;
1041
- int dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
1042
- int src_idx1 = src_indices[1] ? src_indices[1][j] : j;
1043
- int dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
1044
- int src_idx2 = src_indices[2] ? src_indices[2][k] : k;
1045
- int dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
1086
+ size_t src_idx0 = src_indices[0] ? src_indices[0][i] : i;
1087
+ size_t dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
1088
+ size_t src_idx1 = src_indices[1] ? src_indices[1][j] : j;
1089
+ size_t dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
1090
+ size_t src_idx2 = src_indices[2] ? src_indices[2][k] : k;
1091
+ size_t dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
1046
1092
  const char* p = (const char*)src + src_idx0 * src_strides[0]
1047
1093
  + src_idx1 * src_strides[1]
1048
1094
  + src_idx2 * src_strides[2];
@@ -1054,28 +1100,28 @@ static __global__ void array_copy_3d_kernel(void* dst, const void* src,
1054
1100
  }
1055
1101
 
1056
1102
  static __global__ void array_copy_4d_kernel(void* dst, const void* src,
1057
- wp::vec_t<4, int> dst_strides, wp::vec_t<4, int> src_strides,
1103
+ wp::vec_t<4, size_t> dst_strides, wp::vec_t<4, size_t> src_strides,
1058
1104
  wp::vec_t<4, const int*> dst_indices, wp::vec_t<4, const int*> src_indices,
1059
- wp::vec_t<4, int> shape, int elem_size)
1060
- {
1061
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1062
- int n = shape[1];
1063
- int o = shape[2];
1064
- int p = shape[3];
1065
- int i = tid / (n * o * p);
1066
- int j = tid % (n * o * p) / (o * p);
1067
- int k = tid % (o * p) / p;
1068
- int l = tid % p;
1105
+ wp::vec_t<4, size_t> shape, size_t elem_size)
1106
+ {
1107
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1108
+ size_t n = shape[1];
1109
+ size_t o = shape[2];
1110
+ size_t p = shape[3];
1111
+ size_t i = tid / (n * o * p);
1112
+ size_t j = tid % (n * o * p) / (o * p);
1113
+ size_t k = tid % (o * p) / p;
1114
+ size_t l = tid % p;
1069
1115
  if (i < shape[0] && j < shape[1] && k < shape[2] /*&& l < shape[3]*/)
1070
1116
  {
1071
- int src_idx0 = src_indices[0] ? src_indices[0][i] : i;
1072
- int dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
1073
- int src_idx1 = src_indices[1] ? src_indices[1][j] : j;
1074
- int dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
1075
- int src_idx2 = src_indices[2] ? src_indices[2][k] : k;
1076
- int dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
1077
- int src_idx3 = src_indices[3] ? src_indices[3][l] : l;
1078
- int dst_idx3 = dst_indices[3] ? dst_indices[3][l] : l;
1117
+ size_t src_idx0 = src_indices[0] ? src_indices[0][i] : i;
1118
+ size_t dst_idx0 = dst_indices[0] ? dst_indices[0][i] : i;
1119
+ size_t src_idx1 = src_indices[1] ? src_indices[1][j] : j;
1120
+ size_t dst_idx1 = dst_indices[1] ? dst_indices[1][j] : j;
1121
+ size_t src_idx2 = src_indices[2] ? src_indices[2][k] : k;
1122
+ size_t dst_idx2 = dst_indices[2] ? dst_indices[2][k] : k;
1123
+ size_t src_idx3 = src_indices[3] ? src_indices[3][l] : l;
1124
+ size_t dst_idx3 = dst_indices[3] ? dst_indices[3][l] : l;
1079
1125
  const char* p = (const char*)src + src_idx0 * src_strides[0]
1080
1126
  + src_idx1 * src_strides[1]
1081
1127
  + src_idx2 * src_strides[2]
@@ -1090,14 +1136,14 @@ static __global__ void array_copy_4d_kernel(void* dst, const void* src,
1090
1136
 
1091
1137
 
1092
1138
  static __global__ void array_copy_from_fabric_kernel(wp::fabricarray_t<void> src,
1093
- void* dst_data, int dst_stride, const int* dst_indices,
1094
- int elem_size)
1139
+ void* dst_data, size_t dst_stride, const int* dst_indices,
1140
+ size_t elem_size)
1095
1141
  {
1096
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1142
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1097
1143
 
1098
1144
  if (tid < src.size)
1099
1145
  {
1100
- int dst_idx = dst_indices ? dst_indices[tid] : tid;
1146
+ size_t dst_idx = dst_indices ? dst_indices[tid] : tid;
1101
1147
  void* dst_ptr = (char*)dst_data + dst_idx * dst_stride;
1102
1148
  const void* src_ptr = fabricarray_element_ptr(src, tid, elem_size);
1103
1149
  memcpy(dst_ptr, src_ptr, elem_size);
@@ -1105,15 +1151,15 @@ static __global__ void array_copy_from_fabric_kernel(wp::fabricarray_t<void> src
1105
1151
  }
1106
1152
 
1107
1153
  static __global__ void array_copy_from_fabric_indexed_kernel(wp::indexedfabricarray_t<void> src,
1108
- void* dst_data, int dst_stride, const int* dst_indices,
1109
- int elem_size)
1154
+ void* dst_data, size_t dst_stride, const int* dst_indices,
1155
+ size_t elem_size)
1110
1156
  {
1111
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1157
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1112
1158
 
1113
1159
  if (tid < src.size)
1114
1160
  {
1115
- int src_index = src.indices[tid];
1116
- int dst_idx = dst_indices ? dst_indices[tid] : tid;
1161
+ size_t src_index = src.indices[tid];
1162
+ size_t dst_idx = dst_indices ? dst_indices[tid] : tid;
1117
1163
  void* dst_ptr = (char*)dst_data + dst_idx * dst_stride;
1118
1164
  const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
1119
1165
  memcpy(dst_ptr, src_ptr, elem_size);
@@ -1121,14 +1167,14 @@ static __global__ void array_copy_from_fabric_indexed_kernel(wp::indexedfabricar
1121
1167
  }
1122
1168
 
1123
1169
  static __global__ void array_copy_to_fabric_kernel(wp::fabricarray_t<void> dst,
1124
- const void* src_data, int src_stride, const int* src_indices,
1125
- int elem_size)
1170
+ const void* src_data, size_t src_stride, const int* src_indices,
1171
+ size_t elem_size)
1126
1172
  {
1127
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1173
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1128
1174
 
1129
1175
  if (tid < dst.size)
1130
1176
  {
1131
- int src_idx = src_indices ? src_indices[tid] : tid;
1177
+ size_t src_idx = src_indices ? src_indices[tid] : tid;
1132
1178
  const void* src_ptr = (const char*)src_data + src_idx * src_stride;
1133
1179
  void* dst_ptr = fabricarray_element_ptr(dst, tid, elem_size);
1134
1180
  memcpy(dst_ptr, src_ptr, elem_size);
@@ -1136,25 +1182,25 @@ static __global__ void array_copy_to_fabric_kernel(wp::fabricarray_t<void> dst,
1136
1182
  }
1137
1183
 
1138
1184
  static __global__ void array_copy_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst,
1139
- const void* src_data, int src_stride, const int* src_indices,
1140
- int elem_size)
1185
+ const void* src_data, size_t src_stride, const int* src_indices,
1186
+ size_t elem_size)
1141
1187
  {
1142
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1188
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1143
1189
 
1144
1190
  if (tid < dst.size)
1145
1191
  {
1146
- int src_idx = src_indices ? src_indices[tid] : tid;
1192
+ size_t src_idx = src_indices ? src_indices[tid] : tid;
1147
1193
  const void* src_ptr = (const char*)src_data + src_idx * src_stride;
1148
- int dst_idx = dst.indices[tid];
1194
+ size_t dst_idx = dst.indices[tid];
1149
1195
  void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_idx, elem_size);
1150
1196
  memcpy(dst_ptr, src_ptr, elem_size);
1151
1197
  }
1152
1198
  }
1153
1199
 
1154
1200
 
1155
- static __global__ void array_copy_fabric_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::fabricarray_t<void> src, int elem_size)
1201
+ static __global__ void array_copy_fabric_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::fabricarray_t<void> src, size_t elem_size)
1156
1202
  {
1157
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1203
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1158
1204
 
1159
1205
  if (tid < dst.size)
1160
1206
  {
@@ -1165,27 +1211,27 @@ static __global__ void array_copy_fabric_to_fabric_kernel(wp::fabricarray_t<void
1165
1211
  }
1166
1212
 
1167
1213
 
1168
- static __global__ void array_copy_fabric_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::fabricarray_t<void> src, int elem_size)
1214
+ static __global__ void array_copy_fabric_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::fabricarray_t<void> src, size_t elem_size)
1169
1215
  {
1170
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1216
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1171
1217
 
1172
1218
  if (tid < dst.size)
1173
1219
  {
1174
1220
  const void* src_ptr = fabricarray_element_ptr(src, tid, elem_size);
1175
- int dst_index = dst.indices[tid];
1221
+ size_t dst_index = dst.indices[tid];
1176
1222
  void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_index, elem_size);
1177
1223
  memcpy(dst_ptr, src_ptr, elem_size);
1178
1224
  }
1179
1225
  }
1180
1226
 
1181
1227
 
1182
- static __global__ void array_copy_fabric_indexed_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, int elem_size)
1228
+ static __global__ void array_copy_fabric_indexed_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, size_t elem_size)
1183
1229
  {
1184
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1230
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1185
1231
 
1186
1232
  if (tid < dst.size)
1187
1233
  {
1188
- int src_index = src.indices[tid];
1234
+ size_t src_index = src.indices[tid];
1189
1235
  const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
1190
1236
  void* dst_ptr = fabricarray_element_ptr(dst, tid, elem_size);
1191
1237
  memcpy(dst_ptr, src_ptr, elem_size);
@@ -1193,14 +1239,14 @@ static __global__ void array_copy_fabric_indexed_to_fabric_kernel(wp::fabricarra
1193
1239
  }
1194
1240
 
1195
1241
 
1196
- static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, int elem_size)
1242
+ static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, size_t elem_size)
1197
1243
  {
1198
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1244
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1199
1245
 
1200
1246
  if (tid < dst.size)
1201
1247
  {
1202
- int src_index = src.indices[tid];
1203
- int dst_index = dst.indices[tid];
1248
+ size_t src_index = src.indices[tid];
1249
+ size_t dst_index = dst.indices[tid];
1204
1250
  const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
1205
1251
  void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_index, elem_size);
1206
1252
  memcpy(dst_ptr, src_ptr, elem_size);
@@ -1439,9 +1485,9 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
1439
1485
  }
1440
1486
  case 2:
1441
1487
  {
1442
- wp::vec_t<2, int> shape_v(src_shape[0], src_shape[1]);
1443
- wp::vec_t<2, int> src_strides_v(src_strides[0], src_strides[1]);
1444
- wp::vec_t<2, int> dst_strides_v(dst_strides[0], dst_strides[1]);
1488
+ wp::vec_t<2, size_t> shape_v(src_shape[0], src_shape[1]);
1489
+ wp::vec_t<2, size_t> src_strides_v(src_strides[0], src_strides[1]);
1490
+ wp::vec_t<2, size_t> dst_strides_v(dst_strides[0], dst_strides[1]);
1445
1491
  wp::vec_t<2, const int*> src_indices_v(src_indices[0], src_indices[1]);
1446
1492
  wp::vec_t<2, const int*> dst_indices_v(dst_indices[0], dst_indices[1]);
1447
1493
 
@@ -1453,9 +1499,9 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
1453
1499
  }
1454
1500
  case 3:
1455
1501
  {
1456
- wp::vec_t<3, int> shape_v(src_shape[0], src_shape[1], src_shape[2]);
1457
- wp::vec_t<3, int> src_strides_v(src_strides[0], src_strides[1], src_strides[2]);
1458
- wp::vec_t<3, int> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2]);
1502
+ wp::vec_t<3, size_t> shape_v(src_shape[0], src_shape[1], src_shape[2]);
1503
+ wp::vec_t<3, size_t> src_strides_v(src_strides[0], src_strides[1], src_strides[2]);
1504
+ wp::vec_t<3, size_t> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2]);
1459
1505
  wp::vec_t<3, const int*> src_indices_v(src_indices[0], src_indices[1], src_indices[2]);
1460
1506
  wp::vec_t<3, const int*> dst_indices_v(dst_indices[0], dst_indices[1], dst_indices[2]);
1461
1507
 
@@ -1467,9 +1513,9 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
1467
1513
  }
1468
1514
  case 4:
1469
1515
  {
1470
- wp::vec_t<4, int> shape_v(src_shape[0], src_shape[1], src_shape[2], src_shape[3]);
1471
- wp::vec_t<4, int> src_strides_v(src_strides[0], src_strides[1], src_strides[2], src_strides[3]);
1472
- wp::vec_t<4, int> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2], dst_strides[3]);
1516
+ wp::vec_t<4, size_t> shape_v(src_shape[0], src_shape[1], src_shape[2], src_shape[3]);
1517
+ wp::vec_t<4, size_t> src_strides_v(src_strides[0], src_strides[1], src_strides[2], src_strides[3]);
1518
+ wp::vec_t<4, size_t> dst_strides_v(dst_strides[0], dst_strides[1], dst_strides[2], dst_strides[3]);
1473
1519
  wp::vec_t<4, const int*> src_indices_v(src_indices[0], src_indices[1], src_indices[2], src_indices[3]);
1474
1520
  wp::vec_t<4, const int*> dst_indices_v(dst_indices[0], dst_indices[1], dst_indices[2], dst_indices[3]);
1475
1521
 
@@ -1489,94 +1535,94 @@ WP_API bool wp_array_copy_device(void* context, void* dst, void* src, int dst_ty
1489
1535
 
1490
1536
 
1491
1537
  static __global__ void array_fill_1d_kernel(void* data,
1492
- int n,
1493
- int stride,
1538
+ size_t n,
1539
+ size_t stride,
1494
1540
  const int* indices,
1495
1541
  const void* value,
1496
- int value_size)
1542
+ size_t value_size)
1497
1543
  {
1498
- int i = blockIdx.x * blockDim.x + threadIdx.x;
1544
+ size_t i = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1499
1545
  if (i < n)
1500
1546
  {
1501
- int idx = indices ? indices[i] : i;
1547
+ size_t idx = indices ? indices[i] : i;
1502
1548
  char* p = (char*)data + idx * stride;
1503
1549
  memcpy(p, value, value_size);
1504
1550
  }
1505
1551
  }
1506
1552
 
1507
1553
  static __global__ void array_fill_2d_kernel(void* data,
1508
- wp::vec_t<2, int> shape,
1509
- wp::vec_t<2, int> strides,
1554
+ wp::vec_t<2, size_t> shape,
1555
+ wp::vec_t<2, size_t> strides,
1510
1556
  wp::vec_t<2, const int*> indices,
1511
1557
  const void* value,
1512
- int value_size)
1558
+ size_t value_size)
1513
1559
  {
1514
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1515
- int n = shape[1];
1516
- int i = tid / n;
1517
- int j = tid % n;
1560
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1561
+ size_t n = shape[1];
1562
+ size_t i = tid / n;
1563
+ size_t j = tid % n;
1518
1564
  if (i < shape[0] /*&& j < shape[1]*/)
1519
1565
  {
1520
- int idx0 = indices[0] ? indices[0][i] : i;
1521
- int idx1 = indices[1] ? indices[1][j] : j;
1566
+ size_t idx0 = indices[0] ? indices[0][i] : i;
1567
+ size_t idx1 = indices[1] ? indices[1][j] : j;
1522
1568
  char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1];
1523
1569
  memcpy(p, value, value_size);
1524
1570
  }
1525
1571
  }
1526
1572
 
1527
1573
  static __global__ void array_fill_3d_kernel(void* data,
1528
- wp::vec_t<3, int> shape,
1529
- wp::vec_t<3, int> strides,
1574
+ wp::vec_t<3, size_t> shape,
1575
+ wp::vec_t<3, size_t> strides,
1530
1576
  wp::vec_t<3, const int*> indices,
1531
1577
  const void* value,
1532
- int value_size)
1533
- {
1534
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1535
- int n = shape[1];
1536
- int o = shape[2];
1537
- int i = tid / (n * o);
1538
- int j = tid % (n * o) / o;
1539
- int k = tid % o;
1578
+ size_t value_size)
1579
+ {
1580
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1581
+ size_t n = shape[1];
1582
+ size_t o = shape[2];
1583
+ size_t i = tid / (n * o);
1584
+ size_t j = tid % (n * o) / o;
1585
+ size_t k = tid % o;
1540
1586
  if (i < shape[0] && j < shape[1] /*&& k < shape[2]*/)
1541
1587
  {
1542
- int idx0 = indices[0] ? indices[0][i] : i;
1543
- int idx1 = indices[1] ? indices[1][j] : j;
1544
- int idx2 = indices[2] ? indices[2][k] : k;
1588
+ size_t idx0 = indices[0] ? indices[0][i] : i;
1589
+ size_t idx1 = indices[1] ? indices[1][j] : j;
1590
+ size_t idx2 = indices[2] ? indices[2][k] : k;
1545
1591
  char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1] + idx2 * strides[2];
1546
1592
  memcpy(p, value, value_size);
1547
1593
  }
1548
1594
  }
1549
1595
 
1550
1596
  static __global__ void array_fill_4d_kernel(void* data,
1551
- wp::vec_t<4, int> shape,
1552
- wp::vec_t<4, int> strides,
1597
+ wp::vec_t<4, size_t> shape,
1598
+ wp::vec_t<4, size_t> strides,
1553
1599
  wp::vec_t<4, const int*> indices,
1554
1600
  const void* value,
1555
- int value_size)
1556
- {
1557
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1558
- int n = shape[1];
1559
- int o = shape[2];
1560
- int p = shape[3];
1561
- int i = tid / (n * o * p);
1562
- int j = tid % (n * o * p) / (o * p);
1563
- int k = tid % (o * p) / p;
1564
- int l = tid % p;
1601
+ size_t value_size)
1602
+ {
1603
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1604
+ size_t n = shape[1];
1605
+ size_t o = shape[2];
1606
+ size_t p = shape[3];
1607
+ size_t i = tid / (n * o * p);
1608
+ size_t j = tid % (n * o * p) / (o * p);
1609
+ size_t k = tid % (o * p) / p;
1610
+ size_t l = tid % p;
1565
1611
  if (i < shape[0] && j < shape[1] && k < shape[2] /*&& l < shape[3]*/)
1566
1612
  {
1567
- int idx0 = indices[0] ? indices[0][i] : i;
1568
- int idx1 = indices[1] ? indices[1][j] : j;
1569
- int idx2 = indices[2] ? indices[2][k] : k;
1570
- int idx3 = indices[3] ? indices[3][l] : l;
1613
+ size_t idx0 = indices[0] ? indices[0][i] : i;
1614
+ size_t idx1 = indices[1] ? indices[1][j] : j;
1615
+ size_t idx2 = indices[2] ? indices[2][k] : k;
1616
+ size_t idx3 = indices[3] ? indices[3][l] : l;
1571
1617
  char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1] + idx2 * strides[2] + idx3 * strides[3];
1572
1618
  memcpy(p, value, value_size);
1573
1619
  }
1574
1620
  }
1575
1621
 
1576
1622
 
1577
- static __global__ void array_fill_fabric_kernel(wp::fabricarray_t<void> fa, const void* value, int value_size)
1623
+ static __global__ void array_fill_fabric_kernel(wp::fabricarray_t<void> fa, const void* value, size_t value_size)
1578
1624
  {
1579
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1625
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1580
1626
  if (tid < fa.size)
1581
1627
  {
1582
1628
  void* dst_ptr = fabricarray_element_ptr(fa, tid, value_size);
@@ -1585,9 +1631,9 @@ static __global__ void array_fill_fabric_kernel(wp::fabricarray_t<void> fa, cons
1585
1631
  }
1586
1632
 
1587
1633
 
1588
- static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t<void> ifa, const void* value, int value_size)
1634
+ static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t<void> ifa, const void* value, size_t value_size)
1589
1635
  {
1590
- int tid = blockIdx.x * blockDim.x + threadIdx.x;
1636
+ size_t tid = size_t(blockIdx.x) * size_t(blockDim.x) + size_t(threadIdx.x);
1591
1637
  if (tid < ifa.size)
1592
1638
  {
1593
1639
  size_t idx = size_t(ifa.indices[tid]);
@@ -1684,8 +1730,8 @@ WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, con
1684
1730
  }
1685
1731
  case 2:
1686
1732
  {
1687
- wp::vec_t<2, int> shape_v(shape[0], shape[1]);
1688
- wp::vec_t<2, int> strides_v(strides[0], strides[1]);
1733
+ wp::vec_t<2, size_t> shape_v(shape[0], shape[1]);
1734
+ wp::vec_t<2, size_t> strides_v(strides[0], strides[1]);
1689
1735
  wp::vec_t<2, const int*> indices_v(indices[0], indices[1]);
1690
1736
  wp_launch_device(WP_CURRENT_CONTEXT, array_fill_2d_kernel, n,
1691
1737
  (data, shape_v, strides_v, indices_v, value_devptr, value_size));
@@ -1693,8 +1739,8 @@ WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, con
1693
1739
  }
1694
1740
  case 3:
1695
1741
  {
1696
- wp::vec_t<3, int> shape_v(shape[0], shape[1], shape[2]);
1697
- wp::vec_t<3, int> strides_v(strides[0], strides[1], strides[2]);
1742
+ wp::vec_t<3, size_t> shape_v(shape[0], shape[1], shape[2]);
1743
+ wp::vec_t<3, size_t> strides_v(strides[0], strides[1], strides[2]);
1698
1744
  wp::vec_t<3, const int*> indices_v(indices[0], indices[1], indices[2]);
1699
1745
  wp_launch_device(WP_CURRENT_CONTEXT, array_fill_3d_kernel, n,
1700
1746
  (data, shape_v, strides_v, indices_v, value_devptr, value_size));
@@ -1702,8 +1748,8 @@ WP_API void wp_array_fill_device(void* context, void* arr_ptr, int arr_type, con
1702
1748
  }
1703
1749
  case 4:
1704
1750
  {
1705
- wp::vec_t<4, int> shape_v(shape[0], shape[1], shape[2], shape[3]);
1706
- wp::vec_t<4, int> strides_v(strides[0], strides[1], strides[2], strides[3]);
1751
+ wp::vec_t<4, size_t> shape_v(shape[0], shape[1], shape[2], shape[3]);
1752
+ wp::vec_t<4, size_t> strides_v(strides[0], strides[1], strides[2], strides[3]);
1707
1753
  wp::vec_t<4, const int*> indices_v(indices[0], indices[1], indices[2], indices[3]);
1708
1754
  wp_launch_device(WP_CURRENT_CONTEXT, array_fill_4d_kernel, n,
1709
1755
  (data, shape_v, strides_v, indices_v, value_devptr, value_size));
@@ -2071,13 +2117,17 @@ void wp_cuda_context_synchronize(void* context)
2071
2117
 
2072
2118
  check_cu(cuCtxSynchronize_f());
2073
2119
 
2074
- if (free_deferred_allocs(context ? context : get_current_context()) > 0)
2120
+ if (!context)
2121
+ context = get_current_context();
2122
+
2123
+ if (free_deferred_allocs(context) > 0)
2075
2124
  {
2076
2125
  // ensure deferred asynchronous deallocations complete
2077
2126
  check_cu(cuCtxSynchronize_f());
2078
2127
  }
2079
2128
 
2080
2129
  unload_deferred_modules(context);
2130
+ destroy_deferred_graphs(context);
2081
2131
 
2082
2132
  // check_cuda(cudaDeviceGraphMemTrim(wp_cuda_context_get_device_ordinal(context)));
2083
2133
  }
@@ -2448,6 +2498,9 @@ void wp_cuda_stream_destroy(void* context, void* stream)
2448
2498
 
2449
2499
  wp_cuda_stream_unregister(context, stream);
2450
2500
 
2501
+ // release temporary radix sort buffer associated with this stream
2502
+ radix_sort_release(context, stream);
2503
+
2451
2504
  check_cu(cuStreamDestroy_f(static_cast<CUstream>(stream)));
2452
2505
  }
2453
2506
 
@@ -2510,15 +2563,36 @@ void wp_cuda_stream_synchronize(void* stream)
2510
2563
  check_cu(cuStreamSynchronize_f(static_cast<CUstream>(stream)));
2511
2564
  }
2512
2565
 
2513
- void wp_cuda_stream_wait_event(void* stream, void* event)
2566
+ void wp_cuda_stream_wait_event(void* stream, void* event, bool external)
2514
2567
  {
2515
- check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
2568
+ // the external flag can only be used during graph capture
2569
+ if (external && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
2570
+ {
2571
+ // wait for an external event during graph capture
2572
+ check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), CU_EVENT_WAIT_EXTERNAL));
2573
+ }
2574
+ else
2575
+ {
2576
+ check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), CU_EVENT_WAIT_DEFAULT));
2577
+ }
2516
2578
  }
2517
2579
 
2518
- void wp_cuda_stream_wait_stream(void* stream, void* other_stream, void* event)
2580
+ void wp_cuda_stream_wait_stream(void* stream, void* other_stream, void* event, bool external)
2519
2581
  {
2520
- check_cu(cuEventRecord_f(static_cast<CUevent>(event), static_cast<CUstream>(other_stream)));
2521
- check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), 0));
2582
+ unsigned record_flags = CU_EVENT_RECORD_DEFAULT;
2583
+ unsigned wait_flags = CU_EVENT_WAIT_DEFAULT;
2584
+
2585
+ // the external flag can only be used during graph capture
2586
+ if (external && !g_captures.empty())
2587
+ {
2588
+ if (wp_cuda_stream_is_capturing(other_stream))
2589
+ record_flags = CU_EVENT_RECORD_EXTERNAL;
2590
+ if (wp_cuda_stream_is_capturing(stream))
2591
+ wait_flags = CU_EVENT_WAIT_EXTERNAL;
2592
+ }
2593
+
2594
+ check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(other_stream), record_flags));
2595
+ check_cu(cuStreamWaitEvent_f(static_cast<CUstream>(stream), static_cast<CUevent>(event), wait_flags));
2522
2596
  }
2523
2597
 
2524
2598
  int wp_cuda_stream_is_capturing(void* stream)
@@ -2571,11 +2645,12 @@ int wp_cuda_event_query(void* event)
2571
2645
  return res;
2572
2646
  }
2573
2647
 
2574
- void wp_cuda_event_record(void* event, void* stream, bool timing)
2648
+ void wp_cuda_event_record(void* event, void* stream, bool external)
2575
2649
  {
2576
- if (timing && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
2650
+ // the external flag can only be used during graph capture
2651
+ if (external && !g_captures.empty() && wp_cuda_stream_is_capturing(stream))
2577
2652
  {
2578
- // record timing event during graph capture
2653
+ // record external event during graph capture (e.g., for timing or when explicitly specified by the user)
2579
2654
  check_cu(cuEventRecordWithFlags_f(static_cast<CUevent>(event), static_cast<CUstream>(stream), CU_EVENT_RECORD_EXTERNAL));
2580
2655
  }
2581
2656
  else
@@ -2625,7 +2700,7 @@ bool wp_cuda_graph_begin_capture(void* context, void* stream, int external)
2625
2700
  else
2626
2701
  {
2627
2702
  // start the capture
2628
- if (!check_cuda(cudaStreamBeginCapture(cuda_stream, cudaStreamCaptureModeGlobal)))
2703
+ if (!check_cuda(cudaStreamBeginCapture(cuda_stream, cudaStreamCaptureModeThreadLocal)))
2629
2704
  return false;
2630
2705
  }
2631
2706
 
@@ -2772,6 +2847,7 @@ bool wp_cuda_graph_end_capture(void* context, void* stream, void** graph_ret)
2772
2847
  {
2773
2848
  free_deferred_allocs();
2774
2849
  unload_deferred_modules();
2850
+ destroy_deferred_graphs();
2775
2851
  }
2776
2852
 
2777
2853
  if (graph_ret)
@@ -2811,11 +2887,12 @@ bool wp_cuda_graph_create_exec(void* context, void* stream, void* graph, void**
2811
2887
  // Support for conditional graph nodes available with CUDA 12.4+.
2812
2888
  #if CUDA_VERSION >= 12040
2813
2889
 
2814
- // CUBIN data for compiled conditional modules, loaded on demand, keyed on device architecture
2815
- static std::map<int, void*> g_conditional_cubins;
2890
+ // CUBIN or PTX data for compiled conditional modules, loaded on demand, keyed on device architecture
2891
+ using ModuleKey = std::pair<int, bool>; // <arch, use_ptx>
2892
+ static std::map<ModuleKey, void*> g_conditional_modules;
2816
2893
 
2817
2894
  // Compile module with conditional helper kernels
2818
- static void* compile_conditional_module(int arch)
2895
+ static void* compile_conditional_module(int arch, bool use_ptx)
2819
2896
  {
2820
2897
  static const char* kernel_source = R"(
2821
2898
  typedef __device_builtin__ unsigned long long cudaGraphConditionalHandle;
@@ -2844,8 +2921,9 @@ static void* compile_conditional_module(int arch)
2844
2921
  )";
2845
2922
 
2846
2923
  // avoid recompilation
2847
- auto it = g_conditional_cubins.find(arch);
2848
- if (it != g_conditional_cubins.end())
2924
+ ModuleKey key = {arch, use_ptx};
2925
+ auto it = g_conditional_modules.find(key);
2926
+ if (it != g_conditional_modules.end())
2849
2927
  return it->second;
2850
2928
 
2851
2929
  nvrtcProgram prog;
@@ -2853,11 +2931,23 @@ static void* compile_conditional_module(int arch)
2853
2931
  return NULL;
2854
2932
 
2855
2933
  char arch_opt[128];
2856
- snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=sm_%d", arch);
2934
+ if (use_ptx)
2935
+ snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=compute_%d", arch);
2936
+ else
2937
+ snprintf(arch_opt, sizeof(arch_opt), "--gpu-architecture=sm_%d", arch);
2857
2938
 
2858
2939
  std::vector<const char*> opts;
2859
2940
  opts.push_back(arch_opt);
2860
2941
 
2942
+ const bool print_debug = (std::getenv("WARP_DEBUG") != nullptr);
2943
+ if (print_debug)
2944
+ {
2945
+ printf("NVRTC options (conditional module, arch=%d, use_ptx=%s):\n", arch, use_ptx ? "true" : "false");
2946
+ for(auto o: opts) {
2947
+ printf("%s\n", o);
2948
+ }
2949
+ }
2950
+
2861
2951
  if (!check_nvrtc(nvrtcCompileProgram(prog, int(opts.size()), opts.data())))
2862
2952
  {
2863
2953
  size_t log_size;
@@ -2874,23 +2964,37 @@ static void* compile_conditional_module(int arch)
2874
2964
  // get output
2875
2965
  char* output = NULL;
2876
2966
  size_t output_size = 0;
2877
- check_nvrtc(nvrtcGetCUBINSize(prog, &output_size));
2878
- if (output_size > 0)
2967
+
2968
+ if (use_ptx)
2969
+ {
2970
+ check_nvrtc(nvrtcGetPTXSize(prog, &output_size));
2971
+ if (output_size > 0)
2972
+ {
2973
+ output = new char[output_size];
2974
+ if (check_nvrtc(nvrtcGetPTX(prog, output)))
2975
+ g_conditional_modules[key] = output;
2976
+ }
2977
+ }
2978
+ else
2879
2979
  {
2880
- output = new char[output_size];
2881
- if (check_nvrtc(nvrtcGetCUBIN(prog, output)))
2882
- g_conditional_cubins[arch] = output;
2980
+ check_nvrtc(nvrtcGetCUBINSize(prog, &output_size));
2981
+ if (output_size > 0)
2982
+ {
2983
+ output = new char[output_size];
2984
+ if (check_nvrtc(nvrtcGetCUBIN(prog, output)))
2985
+ g_conditional_modules[key] = output;
2986
+ }
2883
2987
  }
2884
2988
 
2885
2989
  nvrtcDestroyProgram(&prog);
2886
2990
 
2887
- // return CUBIN data
2991
+ // return CUBIN or PTX data
2888
2992
  return output;
2889
2993
  }
2890
2994
 
2891
2995
 
2892
2996
  // Load module with conditional helper kernels
2893
- static CUmodule load_conditional_module(void* context)
2997
+ static CUmodule load_conditional_module(void* context, int arch, bool use_ptx)
2894
2998
  {
2895
2999
  ContextInfo* context_info = get_context_info(context);
2896
3000
  if (!context_info)
@@ -2900,17 +3004,15 @@ static CUmodule load_conditional_module(void* context)
2900
3004
  if (context_info->conditional_module)
2901
3005
  return context_info->conditional_module;
2902
3006
 
2903
- int arch = context_info->device_info->arch;
2904
-
2905
3007
  // compile if needed
2906
- void* compiled_module = compile_conditional_module(arch);
3008
+ void* compiled_module = compile_conditional_module(arch, use_ptx);
2907
3009
  if (!compiled_module)
2908
3010
  {
2909
3011
  fprintf(stderr, "Warp error: Failed to compile conditional kernels\n");
2910
3012
  return NULL;
2911
3013
  }
2912
3014
 
2913
- // load module
3015
+ // load module (handles both PTX and CUBIN data automatically)
2914
3016
  CUmodule module = NULL;
2915
3017
  if (!check_cu(cuModuleLoadDataEx_f(&module, compiled_module, 0, NULL, NULL)))
2916
3018
  {
@@ -2923,10 +3025,10 @@ static CUmodule load_conditional_module(void* context)
2923
3025
  return module;
2924
3026
  }
2925
3027
 
2926
- static CUfunction get_conditional_kernel(void* context, const char* name)
3028
+ static CUfunction get_conditional_kernel(void* context, int arch, bool use_ptx, const char* name)
2927
3029
  {
2928
3030
  // load module if needed
2929
- CUmodule module = load_conditional_module(context);
3031
+ CUmodule module = load_conditional_module(context, arch, use_ptx);
2930
3032
  if (!module)
2931
3033
  return NULL;
2932
3034
 
@@ -2966,7 +3068,7 @@ bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
2966
3068
  leaf_nodes.data(),
2967
3069
  nullptr,
2968
3070
  leaf_nodes.size(),
2969
- cudaStreamCaptureModeGlobal)))
3071
+ cudaStreamCaptureModeThreadLocal)))
2970
3072
  return false;
2971
3073
 
2972
3074
  return true;
@@ -2976,7 +3078,7 @@ bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
2976
3078
  // https://developer.nvidia.com/blog/dynamic-control-flow-in-cuda-graphs-with-conditional-nodes/
2977
3079
  // condition is a gpu pointer
2978
3080
  // if_graph_ret and else_graph_ret should be NULL if not needed
2979
- bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
3081
+ bool wp_cuda_graph_insert_if_else(void* context, void* stream, int arch, bool use_ptx, int* condition, void** if_graph_ret, void** else_graph_ret)
2980
3082
  {
2981
3083
  bool has_if = if_graph_ret != NULL;
2982
3084
  bool has_else = else_graph_ret != NULL;
@@ -3019,9 +3121,9 @@ bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, v
3019
3121
  // (need to negate the condition if only the else branch is used)
3020
3122
  CUfunction kernel;
3021
3123
  if (has_if)
3022
- kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
3124
+ kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
3023
3125
  else
3024
- kernel = get_conditional_kernel(context, "set_conditional_else_handle_kernel");
3126
+ kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_else_handle_kernel");
3025
3127
 
3026
3128
  if (!kernel)
3027
3129
  {
@@ -3072,7 +3174,7 @@ bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, v
3072
3174
  check_cuda(cudaGraphConditionalHandleCreate(&if_handle, cuda_graph));
3073
3175
  check_cuda(cudaGraphConditionalHandleCreate(&else_handle, cuda_graph));
3074
3176
 
3075
- CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_else_handles_kernel");
3177
+ CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_else_handles_kernel");
3076
3178
  if (!kernel)
3077
3179
  {
3078
3180
  wp::set_error_string("Failed to get built-in conditional kernel");
@@ -3273,7 +3375,7 @@ bool wp_cuda_graph_insert_child_graph(void* context, void* stream, void* child_g
3273
3375
  return true;
3274
3376
  }
3275
3377
 
3276
- bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
3378
+ bool wp_cuda_graph_insert_while(void* context, void* stream, int arch, bool use_ptx, int* condition, void** body_graph_ret, uint64_t* handle_ret)
3277
3379
  {
3278
3380
  // if there's no body, it's a no-op
3279
3381
  if (!body_graph_ret)
@@ -3303,7 +3405,7 @@ bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, voi
3303
3405
  return false;
3304
3406
 
3305
3407
  // launch a kernel to set the condition handle from condition pointer
3306
- CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
3408
+ CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
3307
3409
  if (!kernel)
3308
3410
  {
3309
3411
  wp::set_error_string("Failed to get built-in conditional kernel");
@@ -3339,14 +3441,14 @@ bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, voi
3339
3441
  return true;
3340
3442
  }
3341
3443
 
3342
- bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
3444
+ bool wp_cuda_graph_set_condition(void* context, void* stream, int arch, bool use_ptx, int* condition, uint64_t handle)
3343
3445
  {
3344
3446
  ContextGuard guard(context);
3345
3447
 
3346
3448
  CUstream cuda_stream = static_cast<CUstream>(stream);
3347
3449
 
3348
3450
  // launch a kernel to set the condition handle from condition pointer
3349
- CUfunction kernel = get_conditional_kernel(context, "set_conditional_if_handle_kernel");
3451
+ CUfunction kernel = get_conditional_kernel(context, arch, use_ptx, "set_conditional_if_handle_kernel");
3350
3452
  if (!kernel)
3351
3453
  {
3352
3454
  wp::set_error_string("Failed to get built-in conditional kernel");
@@ -3378,19 +3480,19 @@ bool wp_cuda_graph_resume_capture(void* context, void* stream, void* graph)
3378
3480
  return false;
3379
3481
  }
3380
3482
 
3381
- bool wp_cuda_graph_insert_if_else(void* context, void* stream, int* condition, void** if_graph_ret, void** else_graph_ret)
3483
+ bool wp_cuda_graph_insert_if_else(void* context, void* stream, int arch, bool use_ptx, int* condition, void** if_graph_ret, void** else_graph_ret)
3382
3484
  {
3383
3485
  wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
3384
3486
  return false;
3385
3487
  }
3386
3488
 
3387
- bool wp_cuda_graph_insert_while(void* context, void* stream, int* condition, void** body_graph_ret, uint64_t* handle_ret)
3489
+ bool wp_cuda_graph_insert_while(void* context, void* stream, int arch, bool use_ptx, int* condition, void** body_graph_ret, uint64_t* handle_ret)
3388
3490
  {
3389
3491
  wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
3390
3492
  return false;
3391
3493
  }
3392
3494
 
3393
- bool wp_cuda_graph_set_condition(void* context, void* stream, int* condition, uint64_t handle)
3495
+ bool wp_cuda_graph_set_condition(void* context, void* stream, int arch, bool use_ptx, int* condition, uint64_t handle)
3394
3496
  {
3395
3497
  wp::set_error_string("Warp error: Warp must be built with CUDA Toolkit 12.4+ to enable conditional graph nodes");
3396
3498
  return false;
@@ -3425,16 +3527,38 @@ bool wp_cuda_graph_launch(void* graph_exec, void* stream)
3425
3527
 
3426
3528
  bool wp_cuda_graph_destroy(void* context, void* graph)
3427
3529
  {
3428
- ContextGuard guard(context);
3429
-
3430
- return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
3530
+ // ensure there are no graph captures in progress
3531
+ if (g_captures.empty())
3532
+ {
3533
+ ContextGuard guard(context);
3534
+ return check_cuda(cudaGraphDestroy((cudaGraph_t)graph));
3535
+ }
3536
+ else
3537
+ {
3538
+ GraphDestroyInfo info;
3539
+ info.context = context ? context : get_current_context();
3540
+ info.graph = graph;
3541
+ g_deferred_graph_list.push_back(info);
3542
+ return true;
3543
+ }
3431
3544
  }
3432
3545
 
3433
3546
  bool wp_cuda_graph_exec_destroy(void* context, void* graph_exec)
3434
3547
  {
3435
- ContextGuard guard(context);
3436
-
3437
- return check_cuda(cudaGraphExecDestroy((cudaGraphExec_t)graph_exec));
3548
+ // ensure there are no graph captures in progress
3549
+ if (g_captures.empty())
3550
+ {
3551
+ ContextGuard guard(context);
3552
+ return check_cuda(cudaGraphExecDestroy((cudaGraphExec_t)graph_exec));
3553
+ }
3554
+ else
3555
+ {
3556
+ GraphDestroyInfo info;
3557
+ info.context = context ? context : get_current_context();
3558
+ info.graph_exec = graph_exec;
3559
+ g_deferred_graph_list.push_back(info);
3560
+ return true;
3561
+ }
3438
3562
  }
3439
3563
 
3440
3564
  bool write_file(const char* data, size_t size, std::string filename, const char* mode)
@@ -4287,17 +4411,5 @@ void wp_cuda_timing_end(timing_result_t* results, int size)
4287
4411
  g_cuda_timing_state = parent_state;
4288
4412
  }
4289
4413
 
4290
- // impl. files
4291
- #include "bvh.cu"
4292
- #include "mesh.cu"
4293
- #include "sort.cu"
4294
- #include "hashgrid.cu"
4295
- #include "reduce.cu"
4296
- #include "runlength_encode.cu"
4297
- #include "scan.cu"
4298
- #include "sparse.cu"
4299
- #include "volume.cu"
4300
- #include "volume_builder.cu"
4301
-
4302
4414
  //#include "spline.inl"
4303
4415
  //#include "volume.inl"