warp-lang 0.9.0__py3-none-win_amd64.whl → 0.11.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang might be problematic.

Files changed (315)
  1. warp/__init__.py +15 -7
  2. warp/__init__.pyi +1 -0
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +22 -443
  6. warp/build_dll.py +384 -0
  7. warp/builtins.py +998 -488
  8. warp/codegen.py +1307 -739
  9. warp/config.py +5 -3
  10. warp/constants.py +6 -0
  11. warp/context.py +1291 -548
  12. warp/dlpack.py +31 -31
  13. warp/fabric.py +326 -0
  14. warp/fem/__init__.py +27 -0
  15. warp/fem/cache.py +389 -0
  16. warp/fem/dirichlet.py +181 -0
  17. warp/fem/domain.py +263 -0
  18. warp/fem/field/__init__.py +101 -0
  19. warp/fem/field/field.py +149 -0
  20. warp/fem/field/nodal_field.py +299 -0
  21. warp/fem/field/restriction.py +21 -0
  22. warp/fem/field/test.py +181 -0
  23. warp/fem/field/trial.py +183 -0
  24. warp/fem/geometry/__init__.py +19 -0
  25. warp/fem/geometry/closest_point.py +70 -0
  26. warp/fem/geometry/deformed_geometry.py +271 -0
  27. warp/fem/geometry/element.py +744 -0
  28. warp/fem/geometry/geometry.py +186 -0
  29. warp/fem/geometry/grid_2d.py +373 -0
  30. warp/fem/geometry/grid_3d.py +435 -0
  31. warp/fem/geometry/hexmesh.py +953 -0
  32. warp/fem/geometry/partition.py +376 -0
  33. warp/fem/geometry/quadmesh_2d.py +532 -0
  34. warp/fem/geometry/tetmesh.py +840 -0
  35. warp/fem/geometry/trimesh_2d.py +577 -0
  36. warp/fem/integrate.py +1616 -0
  37. warp/fem/operator.py +191 -0
  38. warp/fem/polynomial.py +213 -0
  39. warp/fem/quadrature/__init__.py +2 -0
  40. warp/fem/quadrature/pic_quadrature.py +245 -0
  41. warp/fem/quadrature/quadrature.py +294 -0
  42. warp/fem/space/__init__.py +292 -0
  43. warp/fem/space/basis_space.py +489 -0
  44. warp/fem/space/collocated_function_space.py +105 -0
  45. warp/fem/space/dof_mapper.py +236 -0
  46. warp/fem/space/function_space.py +145 -0
  47. warp/fem/space/grid_2d_function_space.py +267 -0
  48. warp/fem/space/grid_3d_function_space.py +306 -0
  49. warp/fem/space/hexmesh_function_space.py +352 -0
  50. warp/fem/space/partition.py +350 -0
  51. warp/fem/space/quadmesh_2d_function_space.py +369 -0
  52. warp/fem/space/restriction.py +160 -0
  53. warp/fem/space/shape/__init__.py +15 -0
  54. warp/fem/space/shape/cube_shape_function.py +738 -0
  55. warp/fem/space/shape/shape_function.py +103 -0
  56. warp/fem/space/shape/square_shape_function.py +611 -0
  57. warp/fem/space/shape/tet_shape_function.py +567 -0
  58. warp/fem/space/shape/triangle_shape_function.py +429 -0
  59. warp/fem/space/tetmesh_function_space.py +292 -0
  60. warp/fem/space/topology.py +295 -0
  61. warp/fem/space/trimesh_2d_function_space.py +221 -0
  62. warp/fem/types.py +77 -0
  63. warp/fem/utils.py +495 -0
  64. warp/native/array.h +164 -55
  65. warp/native/builtin.h +150 -174
  66. warp/native/bvh.cpp +75 -328
  67. warp/native/bvh.cu +406 -23
  68. warp/native/bvh.h +37 -45
  69. warp/native/clang/clang.cpp +136 -24
  70. warp/native/crt.cpp +1 -76
  71. warp/native/crt.h +111 -104
  72. warp/native/cuda_crt.h +1049 -0
  73. warp/native/cuda_util.cpp +15 -3
  74. warp/native/cuda_util.h +3 -1
  75. warp/native/cutlass/tools/library/scripts/conv2d_operation.py +463 -0
  76. warp/native/cutlass/tools/library/scripts/conv3d_operation.py +321 -0
  77. warp/native/cutlass/tools/library/scripts/gemm_operation.py +988 -0
  78. warp/native/cutlass/tools/library/scripts/generator.py +4625 -0
  79. warp/native/cutlass/tools/library/scripts/library.py +799 -0
  80. warp/native/cutlass/tools/library/scripts/manifest.py +402 -0
  81. warp/native/cutlass/tools/library/scripts/pycutlass/docs/source/conf.py +96 -0
  82. warp/native/cutlass/tools/library/scripts/pycutlass/profile/conv/conv2d_f16_sm80.py +106 -0
  83. warp/native/cutlass/tools/library/scripts/pycutlass/profile/gemm/gemm_f32_sm80.py +91 -0
  84. warp/native/cutlass/tools/library/scripts/pycutlass/setup.py +80 -0
  85. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/__init__.py +48 -0
  86. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/arguments.py +118 -0
  87. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/c_types.py +241 -0
  88. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/compiler.py +432 -0
  89. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/conv2d_operation.py +631 -0
  90. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/epilogue.py +1026 -0
  91. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/frontend.py +104 -0
  92. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/gemm_operation.py +1276 -0
  93. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/library.py +744 -0
  94. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/memory_manager.py +74 -0
  95. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/operation.py +110 -0
  96. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/parser.py +619 -0
  97. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/reduction_operation.py +398 -0
  98. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/tensor_ref.py +70 -0
  99. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/__init__.py +4 -0
  100. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/conv2d_testbed.py +646 -0
  101. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_grouped_testbed.py +235 -0
  102. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/gemm_testbed.py +557 -0
  103. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/test/profiler.py +70 -0
  104. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/type_hint.py +39 -0
  105. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/__init__.py +1 -0
  106. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/device.py +76 -0
  107. warp/native/cutlass/tools/library/scripts/pycutlass/src/pycutlass/utils/reference_model.py +255 -0
  108. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/__init__.py +0 -0
  109. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +201 -0
  110. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +177 -0
  111. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +98 -0
  112. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_dgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +95 -0
  113. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_few_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +163 -0
  114. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_fixed_channels_f16nhwc_f16nhwc_f16nhwc_tensor_op_f32_sm80.py +187 -0
  115. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +309 -0
  116. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +54 -0
  117. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  118. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_fprop_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  119. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_strided_dgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +253 -0
  120. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f16nhwc_tensor_op_f16_sm80.py +97 -0
  121. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f16nhwc_f16nhwc_f32nhwc_tensor_op_f32_sm80.py +242 -0
  122. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_f32nhwc_f32nhwc_f32nhwc_simt_f32_sm80.py +96 -0
  123. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/conv2d_wgrad_implicit_gemm_tf32nhwc_tf32nhwc_f32nhwc_tensor_op_f32_sm80.py +107 -0
  124. warp/native/cutlass/tools/library/scripts/pycutlass/test/conv/run_all_tests.py +10 -0
  125. warp/native/cutlass/tools/library/scripts/pycutlass/test/frontend/test_frontend.py +146 -0
  126. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/__init__.py +0 -0
  127. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_bf16_sm80.py +96 -0
  128. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f16_sm80.py +447 -0
  129. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f32_sm80.py +146 -0
  130. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_f64_sm80.py +102 -0
  131. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_grouped_sm80.py +203 -0
  132. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/gemm_s8_sm80.py +229 -0
  133. warp/native/cutlass/tools/library/scripts/pycutlass/test/gemm/run_all_tests.py +9 -0
  134. warp/native/cutlass/tools/library/scripts/pycutlass/test/unit/test_sm80.py +453 -0
  135. warp/native/cutlass/tools/library/scripts/rank_2k_operation.py +398 -0
  136. warp/native/cutlass/tools/library/scripts/rank_k_operation.py +387 -0
  137. warp/native/cutlass/tools/library/scripts/rt.py +796 -0
  138. warp/native/cutlass/tools/library/scripts/symm_operation.py +400 -0
  139. warp/native/cutlass/tools/library/scripts/trmm_operation.py +407 -0
  140. warp/native/cutlass_gemm.cu +5 -3
  141. warp/native/exports.h +1240 -949
  142. warp/native/fabric.h +228 -0
  143. warp/native/hashgrid.cpp +4 -4
  144. warp/native/hashgrid.h +22 -2
  145. warp/native/initializer_array.h +2 -2
  146. warp/native/intersect.h +22 -7
  147. warp/native/intersect_adj.h +8 -8
  148. warp/native/intersect_tri.h +13 -16
  149. warp/native/marching.cu +157 -161
  150. warp/native/mat.h +119 -19
  151. warp/native/matnn.h +2 -2
  152. warp/native/mesh.cpp +108 -83
  153. warp/native/mesh.cu +243 -6
  154. warp/native/mesh.h +1547 -458
  155. warp/native/nanovdb/NanoVDB.h +1 -1
  156. warp/native/noise.h +272 -329
  157. warp/native/quat.h +51 -8
  158. warp/native/rand.h +45 -35
  159. warp/native/range.h +6 -2
  160. warp/native/reduce.cpp +157 -0
  161. warp/native/reduce.cu +348 -0
  162. warp/native/runlength_encode.cpp +62 -0
  163. warp/native/runlength_encode.cu +46 -0
  164. warp/native/scan.cu +11 -13
  165. warp/native/scan.h +1 -0
  166. warp/native/solid_angle.h +442 -0
  167. warp/native/sort.cpp +13 -0
  168. warp/native/sort.cu +9 -1
  169. warp/native/sparse.cpp +338 -0
  170. warp/native/sparse.cu +545 -0
  171. warp/native/spatial.h +2 -2
  172. warp/native/temp_buffer.h +30 -0
  173. warp/native/vec.h +126 -24
  174. warp/native/volume.h +120 -0
  175. warp/native/warp.cpp +658 -53
  176. warp/native/warp.cu +660 -68
  177. warp/native/warp.h +112 -12
  178. warp/optim/__init__.py +1 -0
  179. warp/optim/linear.py +922 -0
  180. warp/optim/sgd.py +92 -0
  181. warp/render/render_opengl.py +392 -152
  182. warp/render/render_usd.py +11 -11
  183. warp/sim/__init__.py +2 -2
  184. warp/sim/articulation.py +385 -185
  185. warp/sim/collide.py +21 -8
  186. warp/sim/import_mjcf.py +297 -106
  187. warp/sim/import_urdf.py +389 -210
  188. warp/sim/import_usd.py +198 -97
  189. warp/sim/inertia.py +17 -18
  190. warp/sim/integrator_euler.py +14 -8
  191. warp/sim/integrator_xpbd.py +161 -19
  192. warp/sim/model.py +795 -291
  193. warp/sim/optimizer.py +2 -6
  194. warp/sim/render.py +65 -3
  195. warp/sim/utils.py +3 -0
  196. warp/sparse.py +1227 -0
  197. warp/stubs.py +665 -223
  198. warp/tape.py +66 -15
  199. warp/tests/__main__.py +3 -6
  200. warp/tests/assets/curlnoise_golden.npy +0 -0
  201. warp/tests/assets/pnoise_golden.npy +0 -0
  202. warp/tests/assets/torus.usda +105 -105
  203. warp/tests/{test_class_kernel.py → aux_test_class_kernel.py} +9 -1
  204. warp/tests/aux_test_conditional_unequal_types_kernels.py +21 -0
  205. warp/tests/{test_dependent.py → aux_test_dependent.py} +2 -2
  206. warp/tests/{test_reference.py → aux_test_reference.py} +1 -1
  207. warp/tests/aux_test_unresolved_func.py +14 -0
  208. warp/tests/aux_test_unresolved_symbol.py +14 -0
  209. warp/tests/disabled_kinematics.py +239 -0
  210. warp/tests/run_coverage_serial.py +31 -0
  211. warp/tests/test_adam.py +103 -106
  212. warp/tests/test_arithmetic.py +128 -74
  213. warp/tests/test_array.py +1497 -211
  214. warp/tests/test_array_reduce.py +150 -0
  215. warp/tests/test_atomic.py +64 -28
  216. warp/tests/test_bool.py +99 -0
  217. warp/tests/test_builtins_resolution.py +1292 -0
  218. warp/tests/test_bvh.py +75 -43
  219. warp/tests/test_closest_point_edge_edge.py +54 -57
  220. warp/tests/test_codegen.py +233 -128
  221. warp/tests/test_compile_consts.py +28 -20
  222. warp/tests/test_conditional.py +108 -24
  223. warp/tests/test_copy.py +10 -12
  224. warp/tests/test_ctypes.py +112 -88
  225. warp/tests/test_dense.py +21 -14
  226. warp/tests/test_devices.py +98 -0
  227. warp/tests/test_dlpack.py +136 -108
  228. warp/tests/test_examples.py +277 -0
  229. warp/tests/test_fabricarray.py +955 -0
  230. warp/tests/test_fast_math.py +15 -11
  231. warp/tests/test_fem.py +1271 -0
  232. warp/tests/test_fp16.py +53 -19
  233. warp/tests/test_func.py +187 -74
  234. warp/tests/test_generics.py +194 -49
  235. warp/tests/test_grad.py +180 -116
  236. warp/tests/test_grad_customs.py +176 -0
  237. warp/tests/test_hash_grid.py +52 -37
  238. warp/tests/test_import.py +10 -23
  239. warp/tests/test_indexedarray.py +577 -24
  240. warp/tests/test_intersect.py +18 -9
  241. warp/tests/test_large.py +141 -0
  242. warp/tests/test_launch.py +251 -15
  243. warp/tests/test_lerp.py +64 -65
  244. warp/tests/test_linear_solvers.py +154 -0
  245. warp/tests/test_lvalue.py +493 -0
  246. warp/tests/test_marching_cubes.py +12 -13
  247. warp/tests/test_mat.py +508 -2778
  248. warp/tests/test_mat_lite.py +115 -0
  249. warp/tests/test_mat_scalar_ops.py +2889 -0
  250. warp/tests/test_math.py +103 -9
  251. warp/tests/test_matmul.py +305 -69
  252. warp/tests/test_matmul_lite.py +410 -0
  253. warp/tests/test_mesh.py +71 -14
  254. warp/tests/test_mesh_query_aabb.py +41 -25
  255. warp/tests/test_mesh_query_point.py +325 -34
  256. warp/tests/test_mesh_query_ray.py +39 -22
  257. warp/tests/test_mlp.py +30 -22
  258. warp/tests/test_model.py +92 -89
  259. warp/tests/test_modules_lite.py +39 -0
  260. warp/tests/test_multigpu.py +88 -114
  261. warp/tests/test_noise.py +12 -11
  262. warp/tests/test_operators.py +16 -20
  263. warp/tests/test_options.py +11 -11
  264. warp/tests/test_pinned.py +17 -18
  265. warp/tests/test_print.py +32 -11
  266. warp/tests/test_quat.py +275 -129
  267. warp/tests/test_rand.py +18 -16
  268. warp/tests/test_reload.py +38 -34
  269. warp/tests/test_rounding.py +50 -43
  270. warp/tests/test_runlength_encode.py +190 -0
  271. warp/tests/test_smoothstep.py +9 -11
  272. warp/tests/test_snippet.py +143 -0
  273. warp/tests/test_sparse.py +460 -0
  274. warp/tests/test_spatial.py +276 -243
  275. warp/tests/test_streams.py +110 -85
  276. warp/tests/test_struct.py +331 -85
  277. warp/tests/test_tape.py +39 -21
  278. warp/tests/test_torch.py +118 -89
  279. warp/tests/test_transient_module.py +12 -13
  280. warp/tests/test_types.py +614 -0
  281. warp/tests/test_utils.py +494 -0
  282. warp/tests/test_vec.py +354 -1987
  283. warp/tests/test_vec_lite.py +73 -0
  284. warp/tests/test_vec_scalar_ops.py +2099 -0
  285. warp/tests/test_volume.py +457 -293
  286. warp/tests/test_volume_write.py +124 -134
  287. warp/tests/unittest_serial.py +35 -0
  288. warp/tests/unittest_suites.py +341 -0
  289. warp/tests/unittest_utils.py +568 -0
  290. warp/tests/unused_test_misc.py +71 -0
  291. warp/tests/{test_debug.py → walkthough_debug.py} +3 -17
  292. warp/thirdparty/appdirs.py +36 -45
  293. warp/thirdparty/unittest_parallel.py +549 -0
  294. warp/torch.py +72 -30
  295. warp/types.py +1744 -713
  296. warp/utils.py +360 -350
  297. warp_lang-0.11.0.dist-info/LICENSE.md +36 -0
  298. warp_lang-0.11.0.dist-info/METADATA +238 -0
  299. warp_lang-0.11.0.dist-info/RECORD +332 -0
  300. {warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/WHEEL +1 -1
  301. warp/bin/warp-clang.exp +0 -0
  302. warp/bin/warp-clang.lib +0 -0
  303. warp/bin/warp.exp +0 -0
  304. warp/bin/warp.lib +0 -0
  305. warp/tests/test_all.py +0 -215
  306. warp/tests/test_array_scan.py +0 -60
  307. warp/tests/test_base.py +0 -208
  308. warp/tests/test_unresolved_func.py +0 -7
  309. warp/tests/test_unresolved_symbol.py +0 -7
  310. warp_lang-0.9.0.dist-info/METADATA +0 -20
  311. warp_lang-0.9.0.dist-info/RECORD +0 -177
  312. /warp/tests/{test_compile_consts_dummy.py → aux_test_compile_consts_dummy.py} +0 -0
  313. /warp/tests/{test_reference_reference.py → aux_test_reference_reference.py} +0 -0
  314. /warp/tests/{test_square.py → aux_test_square.py} +0 -0
  315. {warp_lang-0.9.0.dist-info → warp_lang-0.11.0.dist-info}/top_level.txt +0 -0
warp/native/warp.cu CHANGED
@@ -73,10 +73,15 @@ struct DeviceInfo
     static constexpr int kNameLen = 128;
 
     CUdevice device = -1;
+    CUuuid uuid = {0};
     int ordinal = -1;
+    int pci_domain_id = -1;
+    int pci_bus_id = -1;
+    int pci_device_id = -1;
     char name[kNameLen] = "";
     int arch = 0;
     int is_uva = 0;
+    int is_memory_pool_supported = 0;
 };
 
 struct ContextInfo
@@ -125,7 +130,12 @@ int cuda_init()
         g_devices[i].device = device;
         g_devices[i].ordinal = i;
         check_cu(cuDeviceGetName_f(g_devices[i].name, DeviceInfo::kNameLen, device));
+        check_cu(cuDeviceGetUuid_f(&g_devices[i].uuid, device));
+        check_cu(cuDeviceGetAttribute_f(&g_devices[i].pci_domain_id, CU_DEVICE_ATTRIBUTE_PCI_DOMAIN_ID, device));
+        check_cu(cuDeviceGetAttribute_f(&g_devices[i].pci_bus_id, CU_DEVICE_ATTRIBUTE_PCI_BUS_ID, device));
+        check_cu(cuDeviceGetAttribute_f(&g_devices[i].pci_device_id, CU_DEVICE_ATTRIBUTE_PCI_DEVICE_ID, device));
         check_cu(cuDeviceGetAttribute_f(&g_devices[i].is_uva, CU_DEVICE_ATTRIBUTE_UNIFIED_ADDRESSING, device));
+        check_cu(cuDeviceGetAttribute_f(&g_devices[i].is_memory_pool_supported, CU_DEVICE_ATTRIBUTE_MEMORY_POOLS_SUPPORTED, device));
         int major = 0;
         int minor = 0;
         check_cu(cuDeviceGetAttribute_f(&major, CU_DEVICE_ATTRIBUTE_COMPUTE_CAPABILITY_MAJOR, device));
@@ -216,6 +226,26 @@ void* alloc_device(void* context, size_t s)
     return ptr;
 }
 
+void* alloc_temp_device(void* context, size_t s)
+{
+    // "cudaMallocAsync ignores the current device/context when determining where the allocation will reside. Instead,
+    // cudaMallocAsync determines the resident device based on the specified memory pool or the supplied stream."
+    ContextGuard guard(context);
+
+    void* ptr;
+
+    if (cuda_context_is_memory_pool_supported(context))
+    {
+        check_cuda(cudaMallocAsync(&ptr, s, get_current_stream()));
+    }
+    else
+    {
+        check_cuda(cudaMalloc(&ptr, s));
+    }
+
+    return ptr;
+}
+
 void free_device(void* context, void* ptr)
 {
     ContextGuard guard(context);
@@ -223,6 +253,20 @@ void free_device(void* context, void* ptr)
     check_cuda(cudaFree(ptr));
 }
 
+void free_temp_device(void* context, void* ptr)
+{
+    ContextGuard guard(context);
+
+    if (cuda_context_is_memory_pool_supported(context))
+    {
+        check_cuda(cudaFreeAsync(ptr, get_current_stream()));
+    }
+    else
+    {
+        check_cuda(cudaFree(ptr));
+    }
+}
+
 void memcpy_h2d(void* context, void* dest, void* src, size_t n)
 {
     ContextGuard guard(context);
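Note: the new alloc_temp_device/free_temp_device pair above prefers CUDA's stream-ordered allocator when the device reports memory-pool support and falls back to plain cudaMalloc/cudaFree otherwise. A minimal standalone sketch of that pattern, outside of Warp's check_cuda/get_current_stream helpers (the names device_supports_memory_pools, alloc_temp, free_temp and use_pools are illustrative, not Warp API; error handling omitted):

    #include <cuda_runtime.h>

    // Query once whether a device supports stream-ordered (pooled) allocations.
    static bool device_supports_memory_pools(int device)
    {
        int supported = 0;
        cudaDeviceGetAttribute(&supported, cudaDevAttrMemoryPoolsSupported, device);
        return supported != 0;
    }

    // Allocate a temporary buffer in stream order when pools are available.
    void* alloc_temp(size_t size, cudaStream_t stream, bool use_pools)
    {
        void* ptr = nullptr;
        if (use_pools)
            cudaMallocAsync(&ptr, size, stream);  // stream-ordered, served from the device's default pool
        else
            cudaMalloc(&ptr, size);               // synchronizing fallback
        return ptr;
    }

    void free_temp(void* ptr, cudaStream_t stream, bool use_pools)
    {
        if (use_pools)
            cudaFreeAsync(ptr, stream);           // freed in stream order, memory returns to the pool
        else
            cudaFree(ptr);
    }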
@@ -266,7 +310,7 @@ void memset_device(void* context, void* dest, int value, size_t n)
 {
     ContextGuard guard(context);
 
-    if ((n%4) > 0)
+    if (true)// ((n%4) > 0)
     {
         // for unaligned lengths fallback to CUDA memset
         check_cuda(cudaMemsetAsync(dest, value, n, get_current_stream()));
@@ -279,6 +323,72 @@ void memset_device(void* context, void* dest, int value, size_t n)
     }
 }
 
+// fill memory buffer with a value: generic memtile kernel using memcpy for each element
+__global__ void memtile_kernel(void* dst, const void* src, size_t srcsize, size_t n)
+{
+    size_t tid = wp::grid_index();
+    if (tid < n)
+    {
+        memcpy((int8_t*)dst + srcsize * tid, src, srcsize);
+    }
+}
+
+// this should be faster than memtile_kernel, but requires proper alignment of dst
+template <typename T>
+__global__ void memtile_value_kernel(T* dst, T value, size_t n)
+{
+    size_t tid = wp::grid_index();
+    if (tid < n)
+    {
+        dst[tid] = value;
+    }
+}
+
+void memtile_device(void* context, void* dst, const void* src, size_t srcsize, size_t n)
+{
+    ContextGuard guard(context);
+
+    size_t dst_addr = reinterpret_cast<size_t>(dst);
+    size_t src_addr = reinterpret_cast<size_t>(src);
+
+    // try memtile_value first because it should be faster, but we need to ensure proper alignment
+    if (srcsize == 8 && (dst_addr & 7) == 0 && (src_addr & 7) == 0)
+    {
+        int64_t* p = reinterpret_cast<int64_t*>(dst);
+        int64_t value = *reinterpret_cast<const int64_t*>(src);
+        wp_launch_device(WP_CURRENT_CONTEXT, memtile_value_kernel, n, (p, value, n));
+    }
+    else if (srcsize == 4 && (dst_addr & 3) == 0 && (src_addr & 3) == 0)
+    {
+        int32_t* p = reinterpret_cast<int32_t*>(dst);
+        int32_t value = *reinterpret_cast<const int32_t*>(src);
+        wp_launch_device(WP_CURRENT_CONTEXT, memtile_value_kernel, n, (p, value, n));
+    }
+    else if (srcsize == 2 && (dst_addr & 1) == 0 && (src_addr & 1) == 0)
+    {
+        int16_t* p = reinterpret_cast<int16_t*>(dst);
+        int16_t value = *reinterpret_cast<const int16_t*>(src);
+        wp_launch_device(WP_CURRENT_CONTEXT, memtile_value_kernel, n, (p, value, n));
+    }
+    else if (srcsize == 1)
+    {
+        check_cuda(cudaMemset(dst, *reinterpret_cast<const int8_t*>(src), n));
+    }
+    else
+    {
+        // generic version
+
+        // TODO: use a persistent stream-local staging buffer to avoid allocs?
+        void* src_device;
+        check_cuda(cudaMalloc(&src_device, srcsize));
+        check_cuda(cudaMemcpyAsync(src_device, src, srcsize, cudaMemcpyHostToDevice, get_current_stream()));
+
+        wp_launch_device(WP_CURRENT_CONTEXT, memtile_kernel, n, (dst, src_device, srcsize, n));
+
+        check_cuda(cudaFree(src_device));
+    }
+}
+
 
 static __global__ void array_copy_1d_kernel(void* dst, const void* src,
                                             int dst_stride, int src_stride,
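Note: memtile_device above picks the fastest fill path by element size and pointer alignment, and only falls back to the generic byte-wise memcpy kernel when no aligned integer width matches. A small host-side sketch of that dispatch rule (the name pick_fill_width is illustrative, not part of the Warp API):

    #include <cstddef>
    #include <cstdint>

    // Returns the element width (in bytes) usable for a typed fill of srcsize-byte
    // elements, or 0 if only the generic memcpy path applies. Mirrors the checks in
    // memtile_device: the size must match exactly and both pointers must be aligned.
    size_t pick_fill_width(size_t srcsize, const void* dst, const void* src)
    {
        const uintptr_t d = reinterpret_cast<uintptr_t>(dst);
        const uintptr_t s = reinterpret_cast<uintptr_t>(src);

        if (srcsize == 8 && (d & 7) == 0 && (s & 7) == 0) return 8;  // int64 path
        if (srcsize == 4 && (d & 3) == 0 && (s & 3) == 0) return 4;  // int32 path
        if (srcsize == 2 && (d & 1) == 0 && (s & 1) == 0) return 2;  // int16 path
        if (srcsize == 1)                                 return 1;  // cudaMemset path
        return 0;  // e.g. a 12-byte vec3 value: use the generic memtile_kernel
    }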
@@ -382,6 +492,125 @@ static __global__ void array_copy_4d_kernel(void* dst, const void* src,
 }
 
 
+static __global__ void array_copy_from_fabric_kernel(wp::fabricarray_t<void> src,
+                                                     void* dst_data, int dst_stride, const int* dst_indices,
+                                                     int elem_size)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid < src.size)
+    {
+        int dst_idx = dst_indices ? dst_indices[tid] : tid;
+        void* dst_ptr = (char*)dst_data + dst_idx * dst_stride;
+        const void* src_ptr = fabricarray_element_ptr(src, tid, elem_size);
+        memcpy(dst_ptr, src_ptr, elem_size);
+    }
+}
+
+static __global__ void array_copy_from_fabric_indexed_kernel(wp::indexedfabricarray_t<void> src,
+                                                             void* dst_data, int dst_stride, const int* dst_indices,
+                                                             int elem_size)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid < src.size)
+    {
+        int src_index = src.indices[tid];
+        int dst_idx = dst_indices ? dst_indices[tid] : tid;
+        void* dst_ptr = (char*)dst_data + dst_idx * dst_stride;
+        const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
+        memcpy(dst_ptr, src_ptr, elem_size);
+    }
+}
+
+static __global__ void array_copy_to_fabric_kernel(wp::fabricarray_t<void> dst,
+                                                   const void* src_data, int src_stride, const int* src_indices,
+                                                   int elem_size)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid < dst.size)
+    {
+        int src_idx = src_indices ? src_indices[tid] : tid;
+        const void* src_ptr = (const char*)src_data + src_idx * src_stride;
+        void* dst_ptr = fabricarray_element_ptr(dst, tid, elem_size);
+        memcpy(dst_ptr, src_ptr, elem_size);
+    }
+}
+
+static __global__ void array_copy_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst,
+                                                           const void* src_data, int src_stride, const int* src_indices,
+                                                           int elem_size)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid < dst.size)
+    {
+        int src_idx = src_indices ? src_indices[tid] : tid;
+        const void* src_ptr = (const char*)src_data + src_idx * src_stride;
+        int dst_idx = dst.indices[tid];
+        void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_idx, elem_size);
+        memcpy(dst_ptr, src_ptr, elem_size);
+    }
+}
+
+
+static __global__ void array_copy_fabric_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::fabricarray_t<void> src, int elem_size)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid < dst.size)
+    {
+        const void* src_ptr = fabricarray_element_ptr(src, tid, elem_size);
+        void* dst_ptr = fabricarray_element_ptr(dst, tid, elem_size);
+        memcpy(dst_ptr, src_ptr, elem_size);
+    }
+}
+
+
+static __global__ void array_copy_fabric_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::fabricarray_t<void> src, int elem_size)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid < dst.size)
+    {
+        const void* src_ptr = fabricarray_element_ptr(src, tid, elem_size);
+        int dst_index = dst.indices[tid];
+        void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_index, elem_size);
+        memcpy(dst_ptr, src_ptr, elem_size);
+    }
+}
+
+
+static __global__ void array_copy_fabric_indexed_to_fabric_kernel(wp::fabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, int elem_size)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid < dst.size)
+    {
+        int src_index = src.indices[tid];
+        const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
+        void* dst_ptr = fabricarray_element_ptr(dst, tid, elem_size);
+        memcpy(dst_ptr, src_ptr, elem_size);
+    }
+}
+
+
+static __global__ void array_copy_fabric_indexed_to_fabric_indexed_kernel(wp::indexedfabricarray_t<void> dst, wp::indexedfabricarray_t<void> src, int elem_size)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+
+    if (tid < dst.size)
+    {
+        int src_index = src.indices[tid];
+        int dst_index = dst.indices[tid];
+        const void* src_ptr = fabricarray_element_ptr(src.fa, src_index, elem_size);
+        void* dst_ptr = fabricarray_element_ptr(dst.fa, dst_index, elem_size);
+        memcpy(dst_ptr, src_ptr, elem_size);
+    }
+}
+
+
 WP_API size_t array_copy_device(void* context, void* dst, void* src, int dst_type, int src_type, int elem_size)
 {
     if (!src || !dst)
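Note: all of the fabric copy kernels above share one shape: one thread per element, an optional index map on either side, a byte stride on the non-fabric side, and a memcpy of elem_size bytes. A stripped-down sketch of that pattern for two plain strided buffers (copy_elements_kernel is illustrative only, not one of the Warp kernels):

    #include <cstring>

    // One thread per element: gather from an optionally indexed, strided source
    // and scatter to an optionally indexed, strided destination.
    __global__ void copy_elements_kernel(void* dst, const void* src,
                                         int dst_stride, int src_stride,
                                         const int* dst_indices, const int* src_indices,
                                         int n, int elem_size)
    {
        int tid = blockIdx.x * blockDim.x + threadIdx.x;
        if (tid < n)
        {
            int si = src_indices ? src_indices[tid] : tid;  // identity map when no index array
            int di = dst_indices ? dst_indices[tid] : tid;
            const char* s = (const char*)src + (size_t)si * src_stride;
            char* d = (char*)dst + (size_t)di * dst_stride;
            memcpy(d, s, elem_size);
        }
    }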
@@ -400,6 +629,12 @@ WP_API size_t array_copy_device(void* context, void* dst, void* src, int dst_typ
     const int*const* src_indices = NULL;
     const int*const* dst_indices = NULL;
 
+    const wp::fabricarray_t<void>* src_fabricarray = NULL;
+    wp::fabricarray_t<void>* dst_fabricarray = NULL;
+
+    const wp::indexedfabricarray_t<void>* src_indexedfabricarray = NULL;
+    wp::indexedfabricarray_t<void>* dst_indexedfabricarray = NULL;
+
     const int* null_indices[wp::ARRAY_MAX_DIMS] = { NULL };
 
     if (src_type == wp::ARRAY_TYPE_REGULAR)
@@ -421,9 +656,19 @@
         src_strides = src_arr.arr.strides;
         src_indices = src_arr.indices;
     }
+    else if (src_type == wp::ARRAY_TYPE_FABRIC)
+    {
+        src_fabricarray = static_cast<const wp::fabricarray_t<void>*>(src);
+        src_ndim = 1;
+    }
+    else if (src_type == wp::ARRAY_TYPE_FABRIC_INDEXED)
+    {
+        src_indexedfabricarray = static_cast<const wp::indexedfabricarray_t<void>*>(src);
+        src_ndim = 1;
+    }
     else
     {
-        fprintf(stderr, "Warp error: Invalid array type (%d)\n", src_type);
+        fprintf(stderr, "Warp copy error: Invalid array type (%d)\n", src_type);
         return 0;
     }
 
@@ -446,33 +691,149 @@ WP_API size_t array_copy_device(void* context, void* dst, void* src, int dst_typ
         dst_strides = dst_arr.arr.strides;
         dst_indices = dst_arr.indices;
     }
+    else if (dst_type == wp::ARRAY_TYPE_FABRIC)
+    {
+        dst_fabricarray = static_cast<wp::fabricarray_t<void>*>(dst);
+        dst_ndim = 1;
+    }
+    else if (dst_type == wp::ARRAY_TYPE_FABRIC_INDEXED)
+    {
+        dst_indexedfabricarray = static_cast<wp::indexedfabricarray_t<void>*>(dst);
+        dst_ndim = 1;
+    }
     else
     {
-        fprintf(stderr, "Warp error: Invalid array type (%d)\n", dst_type);
+        fprintf(stderr, "Warp copy error: Invalid array type (%d)\n", dst_type);
         return 0;
     }
 
     if (src_ndim != dst_ndim)
     {
-        fprintf(stderr, "Warp error: Incompatible array dimensionalities (%d and %d)\n", src_ndim, dst_ndim);
+        fprintf(stderr, "Warp copy error: Incompatible array dimensionalities (%d and %d)\n", src_ndim, dst_ndim);
         return 0;
     }
 
-    bool has_grad = (src_grad && dst_grad);
-    size_t n = 1;
+    ContextGuard guard(context);
 
+    // handle fabric arrays
+    if (dst_fabricarray)
+    {
+        size_t n = dst_fabricarray->size;
+        if (src_fabricarray)
+        {
+            // copy from fabric to fabric
+            if (src_fabricarray->size != n)
+            {
+                fprintf(stderr, "Warp copy error: Incompatible array sizes\n");
+                return 0;
+            }
+            wp_launch_device(WP_CURRENT_CONTEXT, array_copy_fabric_to_fabric_kernel, n,
+                             (*dst_fabricarray, *src_fabricarray, elem_size));
+            return n;
+        }
+        else if (src_indexedfabricarray)
+        {
+            // copy from fabric indexed to fabric
+            if (src_indexedfabricarray->size != n)
+            {
+                fprintf(stderr, "Warp copy error: Incompatible array sizes\n");
+                return 0;
+            }
+            wp_launch_device(WP_CURRENT_CONTEXT, array_copy_fabric_indexed_to_fabric_kernel, n,
+                             (*dst_fabricarray, *src_indexedfabricarray, elem_size));
+            return n;
+        }
+        else
+        {
+            // copy to fabric
+            if (size_t(src_shape[0]) != n)
+            {
+                fprintf(stderr, "Warp copy error: Incompatible array sizes\n");
+                return 0;
+            }
+            wp_launch_device(WP_CURRENT_CONTEXT, array_copy_to_fabric_kernel, n,
+                             (*dst_fabricarray, src_data, src_strides[0], src_indices[0], elem_size));
+            return n;
+        }
+    }
+    if (dst_indexedfabricarray)
+    {
+        size_t n = dst_indexedfabricarray->size;
+        if (src_fabricarray)
+        {
+            // copy from fabric to fabric indexed
+            if (src_fabricarray->size != n)
+            {
+                fprintf(stderr, "Warp copy error: Incompatible array sizes\n");
+                return 0;
+            }
+            wp_launch_device(WP_CURRENT_CONTEXT, array_copy_fabric_to_fabric_indexed_kernel, n,
+                             (*dst_indexedfabricarray, *src_fabricarray, elem_size));
+            return n;
+        }
+        else if (src_indexedfabricarray)
+        {
+            // copy from fabric indexed to fabric indexed
+            if (src_indexedfabricarray->size != n)
+            {
+                fprintf(stderr, "Warp copy error: Incompatible array sizes\n");
+                return 0;
+            }
+            wp_launch_device(WP_CURRENT_CONTEXT, array_copy_fabric_indexed_to_fabric_indexed_kernel, n,
+                             (*dst_indexedfabricarray, *src_indexedfabricarray, elem_size));
+            return n;
+        }
+        else
+        {
+            // copy to fabric indexed
+            if (size_t(src_shape[0]) != n)
+            {
+                fprintf(stderr, "Warp copy error: Incompatible array sizes\n");
+                return 0;
+            }
+            wp_launch_device(WP_CURRENT_CONTEXT, array_copy_to_fabric_indexed_kernel, n,
+                             (*dst_indexedfabricarray, src_data, src_strides[0], src_indices[0], elem_size));
+            return n;
+        }
+    }
+    else if (src_fabricarray)
+    {
+        // copy from fabric
+        size_t n = src_fabricarray->size;
+        if (size_t(dst_shape[0]) != n)
+        {
+            fprintf(stderr, "Warp copy error: Incompatible array sizes\n");
+            return 0;
+        }
+        wp_launch_device(WP_CURRENT_CONTEXT, array_copy_from_fabric_kernel, n,
+                         (*src_fabricarray, dst_data, dst_strides[0], dst_indices[0], elem_size));
+        return n;
+    }
+    else if (src_indexedfabricarray)
+    {
+        // copy from fabric indexed
+        size_t n = src_indexedfabricarray->size;
+        if (size_t(dst_shape[0]) != n)
+        {
+            fprintf(stderr, "Warp copy error: Incompatible array sizes\n");
+            return 0;
+        }
+        wp_launch_device(WP_CURRENT_CONTEXT, array_copy_from_fabric_indexed_kernel, n,
+                         (*src_indexedfabricarray, dst_data, dst_strides[0], dst_indices[0], elem_size));
+        return n;
+    }
+
+    size_t n = 1;
     for (int i = 0; i < src_ndim; i++)
     {
         if (src_shape[i] != dst_shape[i])
         {
-            fprintf(stderr, "Warp error: Incompatible array shapes\n");
+            fprintf(stderr, "Warp copy error: Incompatible array shapes\n");
             return 0;
         }
         n *= src_shape[i];
     }
 
-    ContextGuard guard(context);
-
     switch (src_ndim)
     {
     case 1:
@@ -481,13 +842,6 @@ WP_API size_t array_copy_device(void* context, void* dst, void* src, int dst_typ
                          dst_strides[0], src_strides[0],
                          dst_indices[0], src_indices[0],
                          src_shape[0], elem_size));
-        if (has_grad)
-        {
-            wp_launch_device(WP_CURRENT_CONTEXT, array_copy_1d_kernel, n, (dst_grad, src_grad,
-                             dst_strides[0], src_strides[0],
-                             dst_indices[0], src_indices[0],
-                             src_shape[0], elem_size));
-        }
         break;
     }
     case 2:
@@ -502,13 +856,6 @@ WP_API size_t array_copy_device(void* context, void* dst, void* src, int dst_typ
                          dst_strides_v, src_strides_v,
                          dst_indices_v, src_indices_v,
                          shape_v, elem_size));
-        if (has_grad)
-        {
-            wp_launch_device(WP_CURRENT_CONTEXT, array_copy_2d_kernel, n, (dst_grad, src_grad,
-                             dst_strides_v, src_strides_v,
-                             dst_indices_v, src_indices_v,
-                             shape_v, elem_size));
-        }
         break;
     }
     case 3:
@@ -523,13 +870,6 @@ WP_API size_t array_copy_device(void* context, void* dst, void* src, int dst_typ
                          dst_strides_v, src_strides_v,
                          dst_indices_v, src_indices_v,
                          shape_v, elem_size));
-        if (has_grad)
-        {
-            wp_launch_device(WP_CURRENT_CONTEXT, array_copy_3d_kernel, n, (dst_grad, src_grad,
-                             dst_strides_v, src_strides_v,
-                             dst_indices_v, src_indices_v,
-                             shape_v, elem_size));
-        }
         break;
     }
     case 4:
@@ -544,17 +884,10 @@ WP_API size_t array_copy_device(void* context, void* dst, void* src, int dst_typ
                          dst_strides_v, src_strides_v,
                          dst_indices_v, src_indices_v,
                          shape_v, elem_size));
-        if (has_grad)
-        {
-            wp_launch_device(WP_CURRENT_CONTEXT, array_copy_4d_kernel, n, (dst_grad, src_grad,
-                             dst_strides_v, src_strides_v,
-                             dst_indices_v, src_indices_v,
-                             shape_v, elem_size));
-        }
         break;
     }
     default:
-        fprintf(stderr, "Warp error: invalid array dimensionality (%d)\n", src_ndim);
+        fprintf(stderr, "Warp copy error: invalid array dimensionality (%d)\n", src_ndim);
         return 0;
     }
 
@@ -565,43 +898,231 @@ WP_API size_t array_copy_device(void* context, void* dst, void* src, int dst_typ
 }
 
 
-__global__ void memtile_kernel(char* dest, char* src, size_t srcsize, size_t n)
+static __global__ void array_fill_1d_kernel(void* data,
+                                            int n,
+                                            int stride,
+                                            const int* indices,
+                                            const void* value,
+                                            int value_size)
 {
-    const size_t tid = wp::grid_index();
-
-    if (tid < n)
+    int i = blockIdx.x * blockDim.x + threadIdx.x;
+    if (i < n)
     {
-        char *d = dest + srcsize * tid;
-        char *s = src;
-        for( size_t i=0; i < srcsize; ++i,++d,++s )
-        {
-            *d = *s;
-        }
+        int idx = indices ? indices[i] : i;
+        char* p = (char*)data + idx * stride;
+        memcpy(p, value, value_size);
     }
 }
 
-void memtile_device(void* context, void* dest, void *src, size_t srcsize, size_t n)
+static __global__ void array_fill_2d_kernel(void* data,
+                                            wp::vec_t<2, int> shape,
+                                            wp::vec_t<2, int> strides,
+                                            wp::vec_t<2, const int*> indices,
+                                            const void* value,
+                                            int value_size)
 {
-    ContextGuard guard(context);
-
-    void* src_device;
-    check_cuda(cudaMalloc(&src_device, srcsize));
-    check_cuda(cudaMemcpyAsync(src_device, src, srcsize, cudaMemcpyHostToDevice, get_current_stream()));
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int n = shape[1];
+    int i = tid / n;
+    int j = tid % n;
+    if (i < shape[0] /*&& j < shape[1]*/)
+    {
+        int idx0 = indices[0] ? indices[0][i] : i;
+        int idx1 = indices[1] ? indices[1][j] : j;
+        char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1];
+        memcpy(p, value, value_size);
+    }
+}
 
-    wp_launch_device(WP_CURRENT_CONTEXT, memtile_kernel, n, ((char *)dest,(char *)src_device,srcsize,n));
+static __global__ void array_fill_3d_kernel(void* data,
+                                            wp::vec_t<3, int> shape,
+                                            wp::vec_t<3, int> strides,
+                                            wp::vec_t<3, const int*> indices,
+                                            const void* value,
+                                            int value_size)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int n = shape[1];
+    int o = shape[2];
+    int i = tid / (n * o);
+    int j = tid % (n * o) / o;
+    int k = tid % o;
+    if (i < shape[0] && j < shape[1] /*&& k < shape[2]*/)
+    {
+        int idx0 = indices[0] ? indices[0][i] : i;
+        int idx1 = indices[1] ? indices[1][j] : j;
+        int idx2 = indices[2] ? indices[2][k] : k;
+        char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1] + idx2 * strides[2];
+        memcpy(p, value, value_size);
+    }
+}
 
-    check_cuda(cudaFree(src_device));
+static __global__ void array_fill_4d_kernel(void* data,
+                                            wp::vec_t<4, int> shape,
+                                            wp::vec_t<4, int> strides,
+                                            wp::vec_t<4, const int*> indices,
+                                            const void* value,
+                                            int value_size)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    int n = shape[1];
+    int o = shape[2];
+    int p = shape[3];
+    int i = tid / (n * o * p);
+    int j = tid % (n * o * p) / (o * p);
+    int k = tid % (o * p) / p;
+    int l = tid % p;
+    if (i < shape[0] && j < shape[1] && k < shape[2] /*&& l < shape[3]*/)
+    {
+        int idx0 = indices[0] ? indices[0][i] : i;
+        int idx1 = indices[1] ? indices[1][j] : j;
+        int idx2 = indices[2] ? indices[2][k] : k;
+        int idx3 = indices[3] ? indices[3][l] : l;
+        char* p = (char*)data + idx0 * strides[0] + idx1 * strides[1] + idx2 * strides[2] + idx3 * strides[3];
+        memcpy(p, value, value_size);
+    }
 }
 
 
-void array_inner_device(uint64_t a, uint64_t b, uint64_t out, int len)
+static __global__ void array_fill_fabric_kernel(wp::fabricarray_t<void> fa, const void* value, int value_size)
 {
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < fa.size)
+    {
+        void* dst_ptr = fabricarray_element_ptr(fa, tid, value_size);
+        memcpy(dst_ptr, value, value_size);
+    }
+}
 
+
+static __global__ void array_fill_fabric_indexed_kernel(wp::indexedfabricarray_t<void> ifa, const void* value, int value_size)
+{
+    int tid = blockIdx.x * blockDim.x + threadIdx.x;
+    if (tid < ifa.size)
+    {
+        size_t idx = size_t(ifa.indices[tid]);
+        if (idx < ifa.fa.size)
+        {
+            void* dst_ptr = fabricarray_element_ptr(ifa.fa, idx, value_size);
+            memcpy(dst_ptr, value, value_size);
+        }
+    }
 }
 
-void array_sum_device(uint64_t a, uint64_t out, int len)
+
+WP_API void array_fill_device(void* context, void* arr_ptr, int arr_type, const void* value_ptr, int value_size)
 {
-
+    if (!arr_ptr || !value_ptr)
+        return;
+
+    void* data = NULL;
+    int ndim = 0;
+    const int* shape = NULL;
+    const int* strides = NULL;
+    const int*const* indices = NULL;
+
+    wp::fabricarray_t<void>* fa = NULL;
+    wp::indexedfabricarray_t<void>* ifa = NULL;
+
+    const int* null_indices[wp::ARRAY_MAX_DIMS] = { NULL };
+
+    if (arr_type == wp::ARRAY_TYPE_REGULAR)
+    {
+        wp::array_t<void>& arr = *static_cast<wp::array_t<void>*>(arr_ptr);
+        data = arr.data;
+        ndim = arr.ndim;
+        shape = arr.shape.dims;
+        strides = arr.strides;
+        indices = null_indices;
+    }
+    else if (arr_type == wp::ARRAY_TYPE_INDEXED)
+    {
+        wp::indexedarray_t<void>& ia = *static_cast<wp::indexedarray_t<void>*>(arr_ptr);
+        data = ia.arr.data;
+        ndim = ia.arr.ndim;
+        shape = ia.shape.dims;
+        strides = ia.arr.strides;
+        indices = ia.indices;
+    }
+    else if (arr_type == wp::ARRAY_TYPE_FABRIC)
+    {
+        fa = static_cast<wp::fabricarray_t<void>*>(arr_ptr);
+    }
+    else if (arr_type == wp::ARRAY_TYPE_FABRIC_INDEXED)
+    {
+        ifa = static_cast<wp::indexedfabricarray_t<void>*>(arr_ptr);
+    }
+    else
+    {
+        fprintf(stderr, "Warp fill error: Invalid array type id %d\n", arr_type);
+        return;
+    }
+
+    size_t n = 1;
+    for (int i = 0; i < ndim; i++)
+        n *= shape[i];
+
+    ContextGuard guard(context);
+
+    // copy value to device memory
+    void* value_devptr;
+    check_cuda(cudaMalloc(&value_devptr, value_size));
+    check_cuda(cudaMemcpyAsync(value_devptr, value_ptr, value_size, cudaMemcpyHostToDevice, get_current_stream()));
+
+    // handle fabric arrays
+    if (fa)
+    {
+        wp_launch_device(WP_CURRENT_CONTEXT, array_fill_fabric_kernel, n,
+                         (*fa, value_devptr, value_size));
+        return;
+    }
+    else if (ifa)
+    {
+        wp_launch_device(WP_CURRENT_CONTEXT, array_fill_fabric_indexed_kernel, n,
+                         (*ifa, value_devptr, value_size));
+        return;
+    }
+
+    // handle regular or indexed arrays
+    switch (ndim)
+    {
+    case 1:
+    {
+        wp_launch_device(WP_CURRENT_CONTEXT, array_fill_1d_kernel, n,
+                         (data, shape[0], strides[0], indices[0], value_devptr, value_size));
+        break;
+    }
+    case 2:
+    {
+        wp::vec_t<2, int> shape_v(shape[0], shape[1]);
+        wp::vec_t<2, int> strides_v(strides[0], strides[1]);
+        wp::vec_t<2, const int*> indices_v(indices[0], indices[1]);
+        wp_launch_device(WP_CURRENT_CONTEXT, array_fill_2d_kernel, n,
+                         (data, shape_v, strides_v, indices_v, value_devptr, value_size));
+        break;
+    }
+    case 3:
+    {
+        wp::vec_t<3, int> shape_v(shape[0], shape[1], shape[2]);
+        wp::vec_t<3, int> strides_v(strides[0], strides[1], strides[2]);
+        wp::vec_t<3, const int*> indices_v(indices[0], indices[1], indices[2]);
+        wp_launch_device(WP_CURRENT_CONTEXT, array_fill_3d_kernel, n,
+                         (data, shape_v, strides_v, indices_v, value_devptr, value_size));
+        break;
+    }
+    case 4:
+    {
+        wp::vec_t<4, int> shape_v(shape[0], shape[1], shape[2], shape[3]);
+        wp::vec_t<4, int> strides_v(strides[0], strides[1], strides[2], strides[3]);
+        wp::vec_t<4, const int*> indices_v(indices[0], indices[1], indices[2], indices[3]);
+        wp_launch_device(WP_CURRENT_CONTEXT, array_fill_4d_kernel, n,
+                         (data, shape_v, strides_v, indices_v, value_devptr, value_size));
+        break;
+    }
+    default:
+        fprintf(stderr, "Warp fill error: invalid array dimensionality (%d)\n", ndim);
+        return;
+    }
 }
 
 void array_scan_int_device(uint64_t in, uint64_t out, int len, bool inclusive)
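Note: array_fill_device stages the fill value on the device once (a small cudaMalloc plus cudaMemcpyAsync) and then launches one of the kernels above; the 2D/3D/4D kernels recover multi-dimensional indices from the flat thread id by division and modulo. A short host-side check of that decomposition for a 3D shape (illustrative only):

    #include <cassert>

    // For shape (S0, S1, S2) the fill kernels map a flat id back to (i, j, k) exactly as
    // array_fill_3d_kernel does: i = tid / (S1*S2), j = tid % (S1*S2) / S2, k = tid % S2.
    int main()
    {
        const int S0 = 2, S1 = 3, S2 = 4;
        int tid = 0;
        for (int i = 0; i < S0; ++i)
            for (int j = 0; j < S1; ++j)
                for (int k = 0; k < S2; ++k, ++tid)
                {
                    assert(tid / (S1 * S2) == i);
                    assert(tid % (S1 * S2) / S2 == j);
                    assert(tid % S2 == k);
                }
        return 0;
    }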
@@ -628,6 +1149,11 @@ int cuda_toolkit_version()
     return CUDA_VERSION;
 }
 
+bool cuda_driver_is_initialized()
+{
+    return is_cuda_driver_initialized();
+}
+
 int nvrtc_supported_arch_count()
 {
     int count;
@@ -682,6 +1208,32 @@ int cuda_device_get_arch(int ordinal)
     return 0;
 }
 
+void cuda_device_get_uuid(int ordinal, char uuid[16])
+{
+    memcpy(uuid, g_devices[ordinal].uuid.bytes, sizeof(char)*16);
+}
+
+int cuda_device_get_pci_domain_id(int ordinal)
+{
+    if (ordinal >= 0 && ordinal < int(g_devices.size()))
+        return g_devices[ordinal].pci_domain_id;
+    return -1;
+}
+
+int cuda_device_get_pci_bus_id(int ordinal)
+{
+    if (ordinal >= 0 && ordinal < int(g_devices.size()))
+        return g_devices[ordinal].pci_bus_id;
+    return -1;
+}
+
+int cuda_device_get_pci_device_id(int ordinal)
+{
+    if (ordinal >= 0 && ordinal < int(g_devices.size()))
+        return g_devices[ordinal].pci_device_id;
+    return -1;
+}
+
 int cuda_device_is_uva(int ordinal)
 {
     if (ordinal >= 0 && ordinal < int(g_devices.size()))
@@ -689,6 +1241,13 @@ int cuda_device_is_uva(int ordinal)
     return 0;
 }
 
+int cuda_device_is_memory_pool_supported(int ordinal)
+{
+    if (ordinal >= 0 && ordinal < int(g_devices.size()))
+        return g_devices[ordinal].is_memory_pool_supported;
+    return false;
+}
+
 void* cuda_context_get_current()
 {
     return get_current_context();
@@ -797,6 +1356,16 @@ int cuda_context_is_primary(void* context)
     return 0;
 }
 
+int cuda_context_is_memory_pool_supported(void* context)
+{
+    int ordinal = cuda_context_get_device_ordinal(context);
+    if (ordinal != -1)
+    {
+        return cuda_device_is_memory_pool_supported(ordinal);
+    }
+    return 0;
+}
+
 void* cuda_context_get_stream(void* context)
 {
     ContextInfo* info = get_context_info(static_cast<CUcontext>(context));
@@ -1006,10 +1575,10 @@ void* cuda_graph_end_capture(void* context)
     //cudaGraphDebugDotPrint(graph, "graph.dot", cudaGraphDebugDotFlagsVerbose);
 
     cudaGraphExec_t graph_exec = NULL;
-    check_cuda(cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0));
+    //check_cuda(cudaGraphInstantiate(&graph_exec, graph, NULL, NULL, 0));
 
     // can use after CUDA 11.4 to permit graphs to capture cudaMallocAsync() operations
-    //check_cuda(cudaGraphInstantiateWithFlags(&graph_exec, graph, cudaGraphInstantiateFlagAutoFreeOnLaunch));
+    check_cuda(cudaGraphInstantiateWithFlags(&graph_exec, graph, cudaGraphInstantiateFlagAutoFreeOnLaunch));
 
     // free source graph
     check_cuda(cudaGraphDestroy(graph));
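Note: graph capture now instantiates with cudaGraphInstantiateWithFlags and cudaGraphInstantiateFlagAutoFreeOnLaunch (CUDA 11.4+), which permits graphs that captured cudaMallocAsync/cudaFreeAsync calls. A minimal standalone sketch of that capture-and-instantiate flow (capture_and_run is illustrative; error handling omitted):

    #include <cuda_runtime.h>

    void capture_and_run(cudaStream_t stream)
    {
        // capture stream-ordered work, including pooled allocations, into a graph
        cudaStreamBeginCapture(stream, cudaStreamCaptureModeGlobal);

        void* tmp = nullptr;
        cudaMallocAsync(&tmp, 1 << 20, stream);    // captured allocation
        cudaMemsetAsync(tmp, 0, 1 << 20, stream);  // captured work
        cudaFreeAsync(tmp, stream);                // captured free

        cudaGraph_t graph = nullptr;
        cudaStreamEndCapture(stream, &graph);

        // auto-free-on-launch releases graph-owned allocations before each relaunch
        cudaGraphExec_t exec = nullptr;
        cudaGraphInstantiateWithFlags(&exec, graph, cudaGraphInstantiateFlagAutoFreeOnLaunch);
        cudaGraphDestroy(graph);

        cudaGraphLaunch(exec, stream);
        cudaStreamSynchronize(stream);
        cudaGraphExecDestroy(exec);
    }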
@@ -1064,10 +1633,8 @@ size_t cuda_compile_program(const char* cuda_src, int arch, const char* include_
 
     std::vector<const char*> opts;
     opts.push_back(arch_opt);
-    opts.push_back(include_opt);
-    opts.push_back("--device-as-default-execution-space");
+    opts.push_back(include_opt);
     opts.push_back("--std=c++11");
-    opts.push_back("--define-macro=WP_CUDA");
 
     if (debug)
     {
@@ -1193,7 +1760,7 @@ void* cuda_load_module(void* context, const char* path)
         size_t length = ftell(file);
         fseek(file, 0, SEEK_SET);
 
-        input.resize(length);
+        input.resize(length + 1);
         if (fread(input.data(), 1, length, file) != length)
        {
             fprintf(stderr, "Warp error: Failed to read input file '%s'\n", path);
@@ -1201,6 +1768,8 @@
             return NULL;
         }
         fclose(file);
+
+        input[length] = '\0';
     }
     else
     {
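Note: the loader change above resizes the buffer to length + 1 and writes a trailing '\0' because a PTX image handed to the driver is parsed as a text string. A hedged sketch of reading a PTX file that way (load_ptx_module is illustrative; Warp routes its driver calls through its own function-pointer table, this sketch uses the plain cuModuleLoadData call and assumes a current CUDA context; error handling omitted):

    #include <cuda.h>
    #include <cstdio>
    #include <vector>

    CUmodule load_ptx_module(const char* path)
    {
        FILE* file = fopen(path, "rb");
        fseek(file, 0, SEEK_END);
        size_t length = ftell(file);
        fseek(file, 0, SEEK_SET);

        std::vector<char> image(length + 1);  // one extra byte for the terminator
        fread(image.data(), 1, length, file);
        fclose(file);
        image[length] = '\0';                 // PTX is treated as a null-terminated string

        CUmodule module = nullptr;
        cuModuleLoadData(&module, image.data());
        return module;
    }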
@@ -1306,19 +1875,39 @@ void* cuda_get_kernel(void* context, void* module, const char* name)
 
     CUfunction kernel = NULL;
     if (!check_cu(cuModuleGetFunction_f(&kernel, (CUmodule)module, name)))
-        printf("Warp: Failed to lookup kernel function %s in module\n", name);
+        fprintf(stderr, "Warp CUDA error: Failed to lookup kernel function %s in module\n", name);
 
     return kernel;
 }
 
-size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, void** args)
+size_t cuda_launch_kernel(void* context, void* kernel, size_t dim, int max_blocks, void** args)
 {
     ContextGuard guard(context);
 
     const int block_dim = 256;
     // CUDA specs up to compute capability 9.0 says the max x-dim grid is 2**31-1, so
     // grid_dim is fine as an int for the near future
-    const int grid_dim = (dim + block_dim - 1)/block_dim;
+    int grid_dim = (dim + block_dim - 1)/block_dim;
+
+    if (max_blocks <= 0) {
+        max_blocks = 2147483647;
+    }
+
+    if (grid_dim < 0)
+    {
+    #if defined(_DEBUG)
+        fprintf(stderr, "Warp warning: Overflow in grid dimensions detected for %zu total elements and 256 threads "
+                        "per block.\n Setting block count to %d.\n", dim, max_blocks);
+    #endif
+        grid_dim = max_blocks;
+    }
+    else
+    {
+        if (grid_dim > max_blocks)
+        {
+            grid_dim = max_blocks;
+        }
+    }
 
     CUresult res = cuLaunchKernel_f(
         (CUfunction)kernel,
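Note: cuda_launch_kernel now takes a max_blocks limit. The block count is the usual ceiling division of the element count by 256 threads per block, clamped to max_blocks, and reset to max_blocks if the int computation wraps negative for very large dims. A small host-side sketch of that arithmetic (compute_grid_dim is illustrative only):

    #include <cstddef>

    // Mirror of the launch-dimension logic: ceil(dim / block_dim), clamped to max_blocks.
    // If the computed count does not fit in an int (and wraps negative), fall back to max_blocks.
    int compute_grid_dim(size_t dim, int max_blocks)
    {
        const int block_dim = 256;
        int grid_dim = (int)((dim + block_dim - 1) / block_dim);

        if (max_blocks <= 0)
            max_blocks = 2147483647;  // no explicit limit: cap at INT_MAX

        if (grid_dim < 0 || grid_dim > max_blocks)
            grid_dim = max_blocks;

        return grid_dim;
    }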
@@ -1384,8 +1973,11 @@ void cuda_graphics_unregister_resource(void* context, void* resource)
 #include "mesh.cu"
 #include "sort.cu"
 #include "hashgrid.cu"
+#include "reduce.cu"
+#include "runlength_encode.cu"
 #include "scan.cu"
 #include "marching.cu"
+#include "sparse.cu"
 #include "volume.cu"
 #include "volume_builder.cu"
 #if WP_ENABLE_CUTLASS