warp-lang 1.7.0 (py3-none-manylinux_2_34_aarch64.whl)

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic.
Files changed (429)
  1. warp/__init__.py +139 -0
  2. warp/__init__.pyi +1 -0
  3. warp/autograd.py +1142 -0
  4. warp/bin/warp-clang.so +0 -0
  5. warp/bin/warp.so +0 -0
  6. warp/build.py +557 -0
  7. warp/build_dll.py +405 -0
  8. warp/builtins.py +6855 -0
  9. warp/codegen.py +3969 -0
  10. warp/config.py +158 -0
  11. warp/constants.py +57 -0
  12. warp/context.py +6812 -0
  13. warp/dlpack.py +462 -0
  14. warp/examples/__init__.py +24 -0
  15. warp/examples/assets/bear.usd +0 -0
  16. warp/examples/assets/bunny.usd +0 -0
  17. warp/examples/assets/cartpole.urdf +110 -0
  18. warp/examples/assets/crazyflie.usd +0 -0
  19. warp/examples/assets/cube.usd +0 -0
  20. warp/examples/assets/nonuniform.usd +0 -0
  21. warp/examples/assets/nv_ant.xml +92 -0
  22. warp/examples/assets/nv_humanoid.xml +183 -0
  23. warp/examples/assets/nvidia_logo.png +0 -0
  24. warp/examples/assets/pixel.jpg +0 -0
  25. warp/examples/assets/quadruped.urdf +268 -0
  26. warp/examples/assets/rocks.nvdb +0 -0
  27. warp/examples/assets/rocks.usd +0 -0
  28. warp/examples/assets/sphere.usd +0 -0
  29. warp/examples/assets/square_cloth.usd +0 -0
  30. warp/examples/benchmarks/benchmark_api.py +389 -0
  31. warp/examples/benchmarks/benchmark_cloth.py +296 -0
  32. warp/examples/benchmarks/benchmark_cloth_cupy.py +96 -0
  33. warp/examples/benchmarks/benchmark_cloth_jax.py +105 -0
  34. warp/examples/benchmarks/benchmark_cloth_numba.py +161 -0
  35. warp/examples/benchmarks/benchmark_cloth_numpy.py +85 -0
  36. warp/examples/benchmarks/benchmark_cloth_paddle.py +94 -0
  37. warp/examples/benchmarks/benchmark_cloth_pytorch.py +94 -0
  38. warp/examples/benchmarks/benchmark_cloth_taichi.py +120 -0
  39. warp/examples/benchmarks/benchmark_cloth_warp.py +153 -0
  40. warp/examples/benchmarks/benchmark_gemm.py +164 -0
  41. warp/examples/benchmarks/benchmark_interop_paddle.py +166 -0
  42. warp/examples/benchmarks/benchmark_interop_torch.py +166 -0
  43. warp/examples/benchmarks/benchmark_launches.py +301 -0
  44. warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
  45. warp/examples/browse.py +37 -0
  46. warp/examples/core/example_cupy.py +86 -0
  47. warp/examples/core/example_dem.py +241 -0
  48. warp/examples/core/example_fluid.py +299 -0
  49. warp/examples/core/example_graph_capture.py +150 -0
  50. warp/examples/core/example_marching_cubes.py +194 -0
  51. warp/examples/core/example_mesh.py +180 -0
  52. warp/examples/core/example_mesh_intersect.py +211 -0
  53. warp/examples/core/example_nvdb.py +182 -0
  54. warp/examples/core/example_raycast.py +111 -0
  55. warp/examples/core/example_raymarch.py +205 -0
  56. warp/examples/core/example_render_opengl.py +193 -0
  57. warp/examples/core/example_sample_mesh.py +300 -0
  58. warp/examples/core/example_sph.py +411 -0
  59. warp/examples/core/example_torch.py +211 -0
  60. warp/examples/core/example_wave.py +269 -0
  61. warp/examples/fem/example_adaptive_grid.py +286 -0
  62. warp/examples/fem/example_apic_fluid.py +423 -0
  63. warp/examples/fem/example_burgers.py +261 -0
  64. warp/examples/fem/example_convection_diffusion.py +178 -0
  65. warp/examples/fem/example_convection_diffusion_dg.py +204 -0
  66. warp/examples/fem/example_deformed_geometry.py +172 -0
  67. warp/examples/fem/example_diffusion.py +196 -0
  68. warp/examples/fem/example_diffusion_3d.py +225 -0
  69. warp/examples/fem/example_diffusion_mgpu.py +220 -0
  70. warp/examples/fem/example_distortion_energy.py +228 -0
  71. warp/examples/fem/example_magnetostatics.py +240 -0
  72. warp/examples/fem/example_mixed_elasticity.py +291 -0
  73. warp/examples/fem/example_navier_stokes.py +261 -0
  74. warp/examples/fem/example_nonconforming_contact.py +298 -0
  75. warp/examples/fem/example_stokes.py +213 -0
  76. warp/examples/fem/example_stokes_transfer.py +262 -0
  77. warp/examples/fem/example_streamlines.py +352 -0
  78. warp/examples/fem/utils.py +1000 -0
  79. warp/examples/interop/example_jax_callable.py +116 -0
  80. warp/examples/interop/example_jax_ffi_callback.py +132 -0
  81. warp/examples/interop/example_jax_kernel.py +205 -0
  82. warp/examples/optim/example_bounce.py +266 -0
  83. warp/examples/optim/example_cloth_throw.py +228 -0
  84. warp/examples/optim/example_diffray.py +561 -0
  85. warp/examples/optim/example_drone.py +870 -0
  86. warp/examples/optim/example_fluid_checkpoint.py +497 -0
  87. warp/examples/optim/example_inverse_kinematics.py +182 -0
  88. warp/examples/optim/example_inverse_kinematics_torch.py +191 -0
  89. warp/examples/optim/example_softbody_properties.py +400 -0
  90. warp/examples/optim/example_spring_cage.py +245 -0
  91. warp/examples/optim/example_trajectory.py +227 -0
  92. warp/examples/sim/example_cartpole.py +143 -0
  93. warp/examples/sim/example_cloth.py +225 -0
  94. warp/examples/sim/example_cloth_self_contact.py +322 -0
  95. warp/examples/sim/example_granular.py +130 -0
  96. warp/examples/sim/example_granular_collision_sdf.py +202 -0
  97. warp/examples/sim/example_jacobian_ik.py +244 -0
  98. warp/examples/sim/example_particle_chain.py +124 -0
  99. warp/examples/sim/example_quadruped.py +203 -0
  100. warp/examples/sim/example_rigid_chain.py +203 -0
  101. warp/examples/sim/example_rigid_contact.py +195 -0
  102. warp/examples/sim/example_rigid_force.py +133 -0
  103. warp/examples/sim/example_rigid_gyroscopic.py +115 -0
  104. warp/examples/sim/example_rigid_soft_contact.py +140 -0
  105. warp/examples/sim/example_soft_body.py +196 -0
  106. warp/examples/tile/example_tile_cholesky.py +87 -0
  107. warp/examples/tile/example_tile_convolution.py +66 -0
  108. warp/examples/tile/example_tile_fft.py +55 -0
  109. warp/examples/tile/example_tile_filtering.py +113 -0
  110. warp/examples/tile/example_tile_matmul.py +85 -0
  111. warp/examples/tile/example_tile_mlp.py +383 -0
  112. warp/examples/tile/example_tile_nbody.py +199 -0
  113. warp/examples/tile/example_tile_walker.py +327 -0
  114. warp/fabric.py +355 -0
  115. warp/fem/__init__.py +106 -0
  116. warp/fem/adaptivity.py +508 -0
  117. warp/fem/cache.py +572 -0
  118. warp/fem/dirichlet.py +202 -0
  119. warp/fem/domain.py +411 -0
  120. warp/fem/field/__init__.py +125 -0
  121. warp/fem/field/field.py +619 -0
  122. warp/fem/field/nodal_field.py +326 -0
  123. warp/fem/field/restriction.py +37 -0
  124. warp/fem/field/virtual.py +848 -0
  125. warp/fem/geometry/__init__.py +32 -0
  126. warp/fem/geometry/adaptive_nanogrid.py +857 -0
  127. warp/fem/geometry/closest_point.py +84 -0
  128. warp/fem/geometry/deformed_geometry.py +221 -0
  129. warp/fem/geometry/element.py +776 -0
  130. warp/fem/geometry/geometry.py +362 -0
  131. warp/fem/geometry/grid_2d.py +392 -0
  132. warp/fem/geometry/grid_3d.py +452 -0
  133. warp/fem/geometry/hexmesh.py +911 -0
  134. warp/fem/geometry/nanogrid.py +571 -0
  135. warp/fem/geometry/partition.py +389 -0
  136. warp/fem/geometry/quadmesh.py +663 -0
  137. warp/fem/geometry/tetmesh.py +855 -0
  138. warp/fem/geometry/trimesh.py +806 -0
  139. warp/fem/integrate.py +2335 -0
  140. warp/fem/linalg.py +419 -0
  141. warp/fem/operator.py +293 -0
  142. warp/fem/polynomial.py +229 -0
  143. warp/fem/quadrature/__init__.py +17 -0
  144. warp/fem/quadrature/pic_quadrature.py +299 -0
  145. warp/fem/quadrature/quadrature.py +591 -0
  146. warp/fem/space/__init__.py +228 -0
  147. warp/fem/space/basis_function_space.py +468 -0
  148. warp/fem/space/basis_space.py +667 -0
  149. warp/fem/space/dof_mapper.py +251 -0
  150. warp/fem/space/function_space.py +309 -0
  151. warp/fem/space/grid_2d_function_space.py +177 -0
  152. warp/fem/space/grid_3d_function_space.py +227 -0
  153. warp/fem/space/hexmesh_function_space.py +257 -0
  154. warp/fem/space/nanogrid_function_space.py +201 -0
  155. warp/fem/space/partition.py +367 -0
  156. warp/fem/space/quadmesh_function_space.py +223 -0
  157. warp/fem/space/restriction.py +179 -0
  158. warp/fem/space/shape/__init__.py +143 -0
  159. warp/fem/space/shape/cube_shape_function.py +1105 -0
  160. warp/fem/space/shape/shape_function.py +133 -0
  161. warp/fem/space/shape/square_shape_function.py +926 -0
  162. warp/fem/space/shape/tet_shape_function.py +834 -0
  163. warp/fem/space/shape/triangle_shape_function.py +672 -0
  164. warp/fem/space/tetmesh_function_space.py +271 -0
  165. warp/fem/space/topology.py +424 -0
  166. warp/fem/space/trimesh_function_space.py +194 -0
  167. warp/fem/types.py +99 -0
  168. warp/fem/utils.py +420 -0
  169. warp/jax.py +187 -0
  170. warp/jax_experimental/__init__.py +16 -0
  171. warp/jax_experimental/custom_call.py +351 -0
  172. warp/jax_experimental/ffi.py +698 -0
  173. warp/jax_experimental/xla_ffi.py +602 -0
  174. warp/math.py +244 -0
  175. warp/native/array.h +1145 -0
  176. warp/native/builtin.h +1800 -0
  177. warp/native/bvh.cpp +492 -0
  178. warp/native/bvh.cu +791 -0
  179. warp/native/bvh.h +554 -0
  180. warp/native/clang/clang.cpp +536 -0
  181. warp/native/coloring.cpp +613 -0
  182. warp/native/crt.cpp +51 -0
  183. warp/native/crt.h +362 -0
  184. warp/native/cuda_crt.h +1058 -0
  185. warp/native/cuda_util.cpp +646 -0
  186. warp/native/cuda_util.h +307 -0
  187. warp/native/error.cpp +77 -0
  188. warp/native/error.h +36 -0
  189. warp/native/exports.h +1878 -0
  190. warp/native/fabric.h +245 -0
  191. warp/native/hashgrid.cpp +311 -0
  192. warp/native/hashgrid.cu +87 -0
  193. warp/native/hashgrid.h +240 -0
  194. warp/native/initializer_array.h +41 -0
  195. warp/native/intersect.h +1230 -0
  196. warp/native/intersect_adj.h +375 -0
  197. warp/native/intersect_tri.h +339 -0
  198. warp/native/marching.cpp +19 -0
  199. warp/native/marching.cu +514 -0
  200. warp/native/marching.h +19 -0
  201. warp/native/mat.h +2220 -0
  202. warp/native/mathdx.cpp +87 -0
  203. warp/native/matnn.h +343 -0
  204. warp/native/mesh.cpp +266 -0
  205. warp/native/mesh.cu +404 -0
  206. warp/native/mesh.h +1980 -0
  207. warp/native/nanovdb/GridHandle.h +366 -0
  208. warp/native/nanovdb/HostBuffer.h +590 -0
  209. warp/native/nanovdb/NanoVDB.h +6624 -0
  210. warp/native/nanovdb/PNanoVDB.h +3390 -0
  211. warp/native/noise.h +859 -0
  212. warp/native/quat.h +1371 -0
  213. warp/native/rand.h +342 -0
  214. warp/native/range.h +139 -0
  215. warp/native/reduce.cpp +174 -0
  216. warp/native/reduce.cu +364 -0
  217. warp/native/runlength_encode.cpp +79 -0
  218. warp/native/runlength_encode.cu +61 -0
  219. warp/native/scan.cpp +47 -0
  220. warp/native/scan.cu +53 -0
  221. warp/native/scan.h +23 -0
  222. warp/native/solid_angle.h +466 -0
  223. warp/native/sort.cpp +251 -0
  224. warp/native/sort.cu +277 -0
  225. warp/native/sort.h +33 -0
  226. warp/native/sparse.cpp +378 -0
  227. warp/native/sparse.cu +524 -0
  228. warp/native/spatial.h +657 -0
  229. warp/native/svd.h +702 -0
  230. warp/native/temp_buffer.h +46 -0
  231. warp/native/tile.h +2584 -0
  232. warp/native/tile_reduce.h +264 -0
  233. warp/native/vec.h +1426 -0
  234. warp/native/volume.cpp +501 -0
  235. warp/native/volume.cu +67 -0
  236. warp/native/volume.h +969 -0
  237. warp/native/volume_builder.cu +477 -0
  238. warp/native/volume_builder.h +52 -0
  239. warp/native/volume_impl.h +70 -0
  240. warp/native/warp.cpp +1082 -0
  241. warp/native/warp.cu +3636 -0
  242. warp/native/warp.h +381 -0
  243. warp/optim/__init__.py +17 -0
  244. warp/optim/adam.py +163 -0
  245. warp/optim/linear.py +1137 -0
  246. warp/optim/sgd.py +112 -0
  247. warp/paddle.py +407 -0
  248. warp/render/__init__.py +18 -0
  249. warp/render/render_opengl.py +3518 -0
  250. warp/render/render_usd.py +784 -0
  251. warp/render/utils.py +160 -0
  252. warp/sim/__init__.py +65 -0
  253. warp/sim/articulation.py +793 -0
  254. warp/sim/collide.py +2395 -0
  255. warp/sim/graph_coloring.py +300 -0
  256. warp/sim/import_mjcf.py +790 -0
  257. warp/sim/import_snu.py +227 -0
  258. warp/sim/import_urdf.py +579 -0
  259. warp/sim/import_usd.py +894 -0
  260. warp/sim/inertia.py +324 -0
  261. warp/sim/integrator.py +242 -0
  262. warp/sim/integrator_euler.py +1997 -0
  263. warp/sim/integrator_featherstone.py +2101 -0
  264. warp/sim/integrator_vbd.py +2048 -0
  265. warp/sim/integrator_xpbd.py +3292 -0
  266. warp/sim/model.py +4791 -0
  267. warp/sim/particles.py +121 -0
  268. warp/sim/render.py +427 -0
  269. warp/sim/utils.py +428 -0
  270. warp/sparse.py +2057 -0
  271. warp/stubs.py +3333 -0
  272. warp/tape.py +1203 -0
  273. warp/tests/__init__.py +1 -0
  274. warp/tests/__main__.py +4 -0
  275. warp/tests/assets/curlnoise_golden.npy +0 -0
  276. warp/tests/assets/mlp_golden.npy +0 -0
  277. warp/tests/assets/pixel.npy +0 -0
  278. warp/tests/assets/pnoise_golden.npy +0 -0
  279. warp/tests/assets/spiky.usd +0 -0
  280. warp/tests/assets/test_grid.nvdb +0 -0
  281. warp/tests/assets/test_index_grid.nvdb +0 -0
  282. warp/tests/assets/test_int32_grid.nvdb +0 -0
  283. warp/tests/assets/test_vec_grid.nvdb +0 -0
  284. warp/tests/assets/torus.nvdb +0 -0
  285. warp/tests/assets/torus.usda +105 -0
  286. warp/tests/aux_test_class_kernel.py +34 -0
  287. warp/tests/aux_test_compile_consts_dummy.py +18 -0
  288. warp/tests/aux_test_conditional_unequal_types_kernels.py +29 -0
  289. warp/tests/aux_test_dependent.py +29 -0
  290. warp/tests/aux_test_grad_customs.py +29 -0
  291. warp/tests/aux_test_instancing_gc.py +26 -0
  292. warp/tests/aux_test_module_unload.py +23 -0
  293. warp/tests/aux_test_name_clash1.py +40 -0
  294. warp/tests/aux_test_name_clash2.py +40 -0
  295. warp/tests/aux_test_reference.py +9 -0
  296. warp/tests/aux_test_reference_reference.py +8 -0
  297. warp/tests/aux_test_square.py +16 -0
  298. warp/tests/aux_test_unresolved_func.py +22 -0
  299. warp/tests/aux_test_unresolved_symbol.py +22 -0
  300. warp/tests/cuda/__init__.py +0 -0
  301. warp/tests/cuda/test_async.py +676 -0
  302. warp/tests/cuda/test_ipc.py +124 -0
  303. warp/tests/cuda/test_mempool.py +233 -0
  304. warp/tests/cuda/test_multigpu.py +169 -0
  305. warp/tests/cuda/test_peer.py +139 -0
  306. warp/tests/cuda/test_pinned.py +84 -0
  307. warp/tests/cuda/test_streams.py +634 -0
  308. warp/tests/geometry/__init__.py +0 -0
  309. warp/tests/geometry/test_bvh.py +200 -0
  310. warp/tests/geometry/test_hash_grid.py +221 -0
  311. warp/tests/geometry/test_marching_cubes.py +74 -0
  312. warp/tests/geometry/test_mesh.py +316 -0
  313. warp/tests/geometry/test_mesh_query_aabb.py +399 -0
  314. warp/tests/geometry/test_mesh_query_point.py +932 -0
  315. warp/tests/geometry/test_mesh_query_ray.py +311 -0
  316. warp/tests/geometry/test_volume.py +1103 -0
  317. warp/tests/geometry/test_volume_write.py +346 -0
  318. warp/tests/interop/__init__.py +0 -0
  319. warp/tests/interop/test_dlpack.py +729 -0
  320. warp/tests/interop/test_jax.py +371 -0
  321. warp/tests/interop/test_paddle.py +800 -0
  322. warp/tests/interop/test_torch.py +1001 -0
  323. warp/tests/run_coverage_serial.py +39 -0
  324. warp/tests/sim/__init__.py +0 -0
  325. warp/tests/sim/disabled_kinematics.py +244 -0
  326. warp/tests/sim/flaky_test_sim_grad.py +290 -0
  327. warp/tests/sim/test_collision.py +604 -0
  328. warp/tests/sim/test_coloring.py +258 -0
  329. warp/tests/sim/test_model.py +224 -0
  330. warp/tests/sim/test_sim_grad_bounce_linear.py +212 -0
  331. warp/tests/sim/test_sim_kinematics.py +98 -0
  332. warp/tests/sim/test_vbd.py +597 -0
  333. warp/tests/test_adam.py +163 -0
  334. warp/tests/test_arithmetic.py +1096 -0
  335. warp/tests/test_array.py +2972 -0
  336. warp/tests/test_array_reduce.py +156 -0
  337. warp/tests/test_assert.py +250 -0
  338. warp/tests/test_atomic.py +153 -0
  339. warp/tests/test_bool.py +220 -0
  340. warp/tests/test_builtins_resolution.py +1298 -0
  341. warp/tests/test_closest_point_edge_edge.py +327 -0
  342. warp/tests/test_codegen.py +810 -0
  343. warp/tests/test_codegen_instancing.py +1495 -0
  344. warp/tests/test_compile_consts.py +215 -0
  345. warp/tests/test_conditional.py +252 -0
  346. warp/tests/test_context.py +42 -0
  347. warp/tests/test_copy.py +238 -0
  348. warp/tests/test_ctypes.py +638 -0
  349. warp/tests/test_dense.py +73 -0
  350. warp/tests/test_devices.py +97 -0
  351. warp/tests/test_examples.py +482 -0
  352. warp/tests/test_fabricarray.py +996 -0
  353. warp/tests/test_fast_math.py +74 -0
  354. warp/tests/test_fem.py +2003 -0
  355. warp/tests/test_fp16.py +136 -0
  356. warp/tests/test_func.py +454 -0
  357. warp/tests/test_future_annotations.py +98 -0
  358. warp/tests/test_generics.py +656 -0
  359. warp/tests/test_grad.py +893 -0
  360. warp/tests/test_grad_customs.py +339 -0
  361. warp/tests/test_grad_debug.py +341 -0
  362. warp/tests/test_implicit_init.py +411 -0
  363. warp/tests/test_import.py +45 -0
  364. warp/tests/test_indexedarray.py +1140 -0
  365. warp/tests/test_intersect.py +73 -0
  366. warp/tests/test_iter.py +76 -0
  367. warp/tests/test_large.py +177 -0
  368. warp/tests/test_launch.py +411 -0
  369. warp/tests/test_lerp.py +151 -0
  370. warp/tests/test_linear_solvers.py +193 -0
  371. warp/tests/test_lvalue.py +427 -0
  372. warp/tests/test_mat.py +2089 -0
  373. warp/tests/test_mat_lite.py +122 -0
  374. warp/tests/test_mat_scalar_ops.py +2913 -0
  375. warp/tests/test_math.py +178 -0
  376. warp/tests/test_mlp.py +282 -0
  377. warp/tests/test_module_hashing.py +258 -0
  378. warp/tests/test_modules_lite.py +44 -0
  379. warp/tests/test_noise.py +252 -0
  380. warp/tests/test_operators.py +299 -0
  381. warp/tests/test_options.py +129 -0
  382. warp/tests/test_overwrite.py +551 -0
  383. warp/tests/test_print.py +339 -0
  384. warp/tests/test_quat.py +2315 -0
  385. warp/tests/test_rand.py +339 -0
  386. warp/tests/test_reload.py +302 -0
  387. warp/tests/test_rounding.py +185 -0
  388. warp/tests/test_runlength_encode.py +196 -0
  389. warp/tests/test_scalar_ops.py +105 -0
  390. warp/tests/test_smoothstep.py +108 -0
  391. warp/tests/test_snippet.py +318 -0
  392. warp/tests/test_sparse.py +582 -0
  393. warp/tests/test_spatial.py +2229 -0
  394. warp/tests/test_special_values.py +361 -0
  395. warp/tests/test_static.py +592 -0
  396. warp/tests/test_struct.py +734 -0
  397. warp/tests/test_tape.py +204 -0
  398. warp/tests/test_transient_module.py +93 -0
  399. warp/tests/test_triangle_closest_point.py +145 -0
  400. warp/tests/test_types.py +562 -0
  401. warp/tests/test_utils.py +588 -0
  402. warp/tests/test_vec.py +1487 -0
  403. warp/tests/test_vec_lite.py +80 -0
  404. warp/tests/test_vec_scalar_ops.py +2327 -0
  405. warp/tests/test_verify_fp.py +100 -0
  406. warp/tests/tile/__init__.py +0 -0
  407. warp/tests/tile/test_tile.py +780 -0
  408. warp/tests/tile/test_tile_load.py +407 -0
  409. warp/tests/tile/test_tile_mathdx.py +208 -0
  410. warp/tests/tile/test_tile_mlp.py +402 -0
  411. warp/tests/tile/test_tile_reduce.py +447 -0
  412. warp/tests/tile/test_tile_shared_memory.py +247 -0
  413. warp/tests/tile/test_tile_view.py +173 -0
  414. warp/tests/unittest_serial.py +47 -0
  415. warp/tests/unittest_suites.py +427 -0
  416. warp/tests/unittest_utils.py +468 -0
  417. warp/tests/walkthrough_debug.py +93 -0
  418. warp/thirdparty/__init__.py +0 -0
  419. warp/thirdparty/appdirs.py +598 -0
  420. warp/thirdparty/dlpack.py +145 -0
  421. warp/thirdparty/unittest_parallel.py +570 -0
  422. warp/torch.py +391 -0
  423. warp/types.py +5230 -0
  424. warp/utils.py +1137 -0
  425. warp_lang-1.7.0.dist-info/METADATA +516 -0
  426. warp_lang-1.7.0.dist-info/RECORD +429 -0
  427. warp_lang-1.7.0.dist-info/WHEEL +5 -0
  428. warp_lang-1.7.0.dist-info/licenses/LICENSE.md +202 -0
  429. warp_lang-1.7.0.dist-info/top_level.txt +1 -0
warp/tests/interop/test_torch.py
@@ -0,0 +1,1001 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import unittest
+
+ import numpy as np
+
+ import warp as wp
+ from warp.tests.unittest_utils import *
+
+
+ @wp.kernel
+ def op_kernel(x: wp.array(dtype=float), y: wp.array(dtype=float)):
+     tid = wp.tid()
+     y[tid] = 0.5 - x[tid] * 2.0
+
+
+ @wp.kernel
+ def inc(a: wp.array(dtype=float)):
+     tid = wp.tid()
+     a[tid] = a[tid] + 1.0
+
+
+ @wp.kernel
+ def inc_vector(a: wp.array(dtype=wp.vec3f)):
+     tid = wp.tid()
+     a[tid] = a[tid] + wp.vec3f(1.0)
+
+
+ @wp.kernel
+ def inc_matrix(a: wp.array(dtype=wp.mat22f)):
+     tid = wp.tid()
+     a[tid] = a[tid] + wp.mat22f(1.0)
+
+
+ @wp.kernel
+ def arange(start: int, step: int, a: wp.array(dtype=int)):
+     tid = wp.tid()
+     a[tid] = start + step * tid
+
+
+ # copy elements between non-contiguous 1d arrays of float
+ @wp.kernel
+ def copy1d_float_kernel(dst: wp.array(dtype=float), src: wp.array(dtype=float)):
+     i = wp.tid()
+     dst[i] = src[i]
+
+
+ # copy elements between non-contiguous 2d arrays of float
+ @wp.kernel
+ def copy2d_float_kernel(dst: wp.array2d(dtype=float), src: wp.array2d(dtype=float)):
+     i, j = wp.tid()
+     dst[i, j] = src[i, j]
+
+
+ # copy elements between non-contiguous 3d arrays of float
+ @wp.kernel
+ def copy3d_float_kernel(dst: wp.array3d(dtype=float), src: wp.array3d(dtype=float)):
+     i, j, k = wp.tid()
+     dst[i, j, k] = src[i, j, k]
+
+
+ # copy elements between non-contiguous 2d arrays of vec3
+ @wp.kernel
+ def copy2d_vec3_kernel(dst: wp.array2d(dtype=wp.vec3), src: wp.array2d(dtype=wp.vec3)):
+     i, j = wp.tid()
+     dst[i, j] = src[i, j]
+
+
+ # copy elements between non-contiguous 2d arrays of mat22
+ @wp.kernel
+ def copy2d_mat22_kernel(dst: wp.array2d(dtype=wp.mat22), src: wp.array2d(dtype=wp.mat22)):
+     i, j = wp.tid()
+     dst[i, j] = src[i, j]
+
+
+ def test_dtype_from_torch(test, device):
+     import torch
+
+     def test_conversions(torch_type, warp_type):
+         test.assertEqual(wp.dtype_from_torch(torch_type), warp_type)
+
+     test_conversions(torch.float16, wp.float16)
+     test_conversions(torch.float32, wp.float32)
+     test_conversions(torch.float64, wp.float64)
+     test_conversions(torch.int8, wp.int8)
+     test_conversions(torch.int16, wp.int16)
+     test_conversions(torch.int32, wp.int32)
+     test_conversions(torch.int64, wp.int64)
+     test_conversions(torch.uint8, wp.uint8)
+     test_conversions(torch.bool, wp.bool)
+
+
+ def test_dtype_to_torch(test, device):
+     import torch
+
+     def test_conversions(warp_type, torch_type):
+         test.assertEqual(wp.dtype_to_torch(warp_type), torch_type)
+
+     test_conversions(wp.float16, torch.float16)
+     test_conversions(wp.float32, torch.float32)
+     test_conversions(wp.float64, torch.float64)
+     test_conversions(wp.int8, torch.int8)
+     test_conversions(wp.int16, torch.int16)
+     test_conversions(wp.int32, torch.int32)
+     test_conversions(wp.int64, torch.int64)
+     test_conversions(wp.uint8, torch.uint8)
+     test_conversions(wp.uint16, torch.int16)
+     test_conversions(wp.uint32, torch.int32)
+     test_conversions(wp.uint64, torch.int64)
+     test_conversions(wp.bool, torch.bool)
+
+
+ def test_device_conversion(test, device):
+     torch_device = wp.device_to_torch(device)
+     warp_device = wp.device_from_torch(torch_device)
+     test.assertEqual(warp_device, device)
+
+
+ def test_torch_zerocopy(test, device):
+     import torch
+
+     a = wp.zeros(10, dtype=wp.float32, device=device)
+     t = wp.to_torch(a)
+     assert a.ptr == t.data_ptr()
+
+     torch_device = wp.device_to_torch(device)
+
+     t = torch.zeros(10, dtype=torch.float32, device=torch_device)
+     a = wp.from_torch(t)
+     assert a.ptr == t.data_ptr()
+
+
+ def test_from_torch(test, device):
+     import torch
+
+     torch_device = wp.device_to_torch(device)
+
+     # automatically determine warp dtype
+     def wrap_scalar_tensor_implicit(torch_dtype, expected_warp_dtype):
+         t = torch.zeros(10, dtype=torch_dtype, device=torch_device)
+         a = wp.from_torch(t)
+         assert a.dtype == expected_warp_dtype
+         assert a.shape == tuple(t.shape)
+
+     wrap_scalar_tensor_implicit(torch.float64, wp.float64)
+     wrap_scalar_tensor_implicit(torch.float32, wp.float32)
+     wrap_scalar_tensor_implicit(torch.float16, wp.float16)
+     wrap_scalar_tensor_implicit(torch.int64, wp.int64)
+     wrap_scalar_tensor_implicit(torch.int32, wp.int32)
+     wrap_scalar_tensor_implicit(torch.int16, wp.int16)
+     wrap_scalar_tensor_implicit(torch.int8, wp.int8)
+     wrap_scalar_tensor_implicit(torch.uint8, wp.uint8)
+     wrap_scalar_tensor_implicit(torch.bool, wp.bool)
+
+     # explicitly specify warp dtype
+     def wrap_scalar_tensor_explicit(torch_dtype, expected_warp_dtype):
+         t = torch.zeros(10, dtype=torch_dtype, device=torch_device)
+         a = wp.from_torch(t, expected_warp_dtype)
+         assert a.dtype == expected_warp_dtype
+         assert a.shape == tuple(t.shape)
+
+     wrap_scalar_tensor_explicit(torch.float64, wp.float64)
+     wrap_scalar_tensor_explicit(torch.float32, wp.float32)
+     wrap_scalar_tensor_explicit(torch.float16, wp.float16)
+     wrap_scalar_tensor_explicit(torch.int64, wp.int64)
+     wrap_scalar_tensor_explicit(torch.int64, wp.uint64)
+     wrap_scalar_tensor_explicit(torch.int32, wp.int32)
+     wrap_scalar_tensor_explicit(torch.int32, wp.uint32)
+     wrap_scalar_tensor_explicit(torch.int16, wp.int16)
+     wrap_scalar_tensor_explicit(torch.int16, wp.uint16)
+     wrap_scalar_tensor_explicit(torch.int8, wp.int8)
+     wrap_scalar_tensor_explicit(torch.int8, wp.uint8)
+     wrap_scalar_tensor_explicit(torch.uint8, wp.uint8)
+     wrap_scalar_tensor_explicit(torch.uint8, wp.int8)
+     wrap_scalar_tensor_explicit(torch.bool, wp.uint8)
+     wrap_scalar_tensor_explicit(torch.bool, wp.int8)
+     wrap_scalar_tensor_explicit(torch.bool, wp.bool)
+
+     def wrap_vec_tensor(n, desired_warp_dtype):
+         t = torch.zeros((10, n), dtype=torch.float32, device=torch_device)
+         a = wp.from_torch(t, desired_warp_dtype)
+         assert a.dtype == desired_warp_dtype
+         assert a.shape == (10,)
+
+     wrap_vec_tensor(2, wp.vec2)
+     wrap_vec_tensor(3, wp.vec3)
+     wrap_vec_tensor(4, wp.vec4)
+     wrap_vec_tensor(6, wp.spatial_vector)
+     wrap_vec_tensor(7, wp.transform)
+
+     def wrap_mat_tensor(n, m, desired_warp_dtype):
+         t = torch.zeros((10, n, m), dtype=torch.float32, device=torch_device)
+         a = wp.from_torch(t, desired_warp_dtype)
+         assert a.dtype == desired_warp_dtype
+         assert a.shape == (10,)
+
+     wrap_mat_tensor(2, 2, wp.mat22)
+     wrap_mat_tensor(3, 3, wp.mat33)
+     wrap_mat_tensor(4, 4, wp.mat44)
+     wrap_mat_tensor(6, 6, wp.spatial_matrix)
+
+     def wrap_vec_tensor_with_grad(n, desired_warp_dtype):
+         t = torch.zeros((10, n), dtype=torch.float32, device=torch_device)
+         a = wp.from_torch(t, desired_warp_dtype, requires_grad=True)
+         assert a.dtype == desired_warp_dtype
+         assert a.shape == (10,)
+
+     wrap_vec_tensor_with_grad(2, wp.vec2)
+     wrap_vec_tensor_with_grad(3, wp.vec3)
+     wrap_vec_tensor_with_grad(4, wp.vec4)
+     wrap_vec_tensor_with_grad(6, wp.spatial_vector)
+     wrap_vec_tensor_with_grad(7, wp.transform)
+
+     def wrap_mat_tensor_with_grad(n, m, desired_warp_dtype):
+         t = torch.zeros((10, n, m), dtype=torch.float32, device=torch_device)
+         a = wp.from_torch(t, desired_warp_dtype, requires_grad=True)
+         assert a.dtype == desired_warp_dtype
+         assert a.shape == (10,)
+
+     wrap_mat_tensor_with_grad(2, 2, wp.mat22)
+     wrap_mat_tensor_with_grad(3, 3, wp.mat33)
+     wrap_mat_tensor_with_grad(4, 4, wp.mat44)
+     wrap_mat_tensor_with_grad(6, 6, wp.spatial_matrix)
+
+
+ def test_array_ctype_from_torch(test, device):
+     import torch
+
+     torch_device = wp.device_to_torch(device)
+
+     # automatically determine warp dtype
+     def wrap_scalar_tensor_implicit(torch_dtype):
+         t = torch.zeros(10, dtype=torch_dtype, device=torch_device)
+         a = wp.from_torch(t, return_ctype=True)
+         warp_dtype = wp.dtype_from_torch(torch_dtype)
+         ctype_size = ctypes.sizeof(warp_dtype._type_)
+         assert a.data == t.data_ptr()
+         assert a.grad == 0
+         assert a.ndim == 1
+         assert a.shape[0] == t.shape[0]
+         assert a.strides[0] == t.stride()[0] * ctype_size
+
+     wrap_scalar_tensor_implicit(torch.float64)
+     wrap_scalar_tensor_implicit(torch.float32)
+     wrap_scalar_tensor_implicit(torch.float16)
+     wrap_scalar_tensor_implicit(torch.int64)
+     wrap_scalar_tensor_implicit(torch.int32)
+     wrap_scalar_tensor_implicit(torch.int16)
+     wrap_scalar_tensor_implicit(torch.int8)
+     wrap_scalar_tensor_implicit(torch.uint8)
+     wrap_scalar_tensor_implicit(torch.bool)
+
+     # explicitly specify warp dtype
+     def wrap_scalar_tensor_explicit(torch_dtype, warp_dtype):
+         t = torch.zeros(10, dtype=torch_dtype, device=torch_device)
+         a = wp.from_torch(t, dtype=warp_dtype, return_ctype=True)
+         ctype_size = ctypes.sizeof(warp_dtype._type_)
+         assert a.data == t.data_ptr()
+         assert a.grad == 0
+         assert a.ndim == 1
+         assert a.shape[0] == t.shape[0]
+         assert a.strides[0] == t.stride()[0] * ctype_size
+
+     wrap_scalar_tensor_explicit(torch.float64, wp.float64)
+     wrap_scalar_tensor_explicit(torch.float32, wp.float32)
+     wrap_scalar_tensor_explicit(torch.float16, wp.float16)
+     wrap_scalar_tensor_explicit(torch.int64, wp.int64)
+     wrap_scalar_tensor_explicit(torch.int64, wp.uint64)
+     wrap_scalar_tensor_explicit(torch.int32, wp.int32)
+     wrap_scalar_tensor_explicit(torch.int32, wp.uint32)
+     wrap_scalar_tensor_explicit(torch.int16, wp.int16)
+     wrap_scalar_tensor_explicit(torch.int16, wp.uint16)
+     wrap_scalar_tensor_explicit(torch.int8, wp.int8)
+     wrap_scalar_tensor_explicit(torch.int8, wp.uint8)
+     wrap_scalar_tensor_explicit(torch.uint8, wp.uint8)
+     wrap_scalar_tensor_explicit(torch.uint8, wp.int8)
+     wrap_scalar_tensor_explicit(torch.bool, wp.uint8)
+     wrap_scalar_tensor_explicit(torch.bool, wp.int8)
+     wrap_scalar_tensor_explicit(torch.bool, wp.bool)
+
+     def wrap_vec_tensor(vec_dtype):
+         t = torch.zeros((10, vec_dtype._length_), dtype=torch.float32, device=torch_device)
+         a = wp.from_torch(t, dtype=vec_dtype, return_ctype=True)
+         ctype_size = ctypes.sizeof(vec_dtype._type_)
+         assert a.data == t.data_ptr()
+         assert a.grad == 0
+         assert a.ndim == 1
+         assert a.shape[0] == t.shape[0]
+         assert a.strides[0] == t.stride()[0] * ctype_size
+
+     wrap_vec_tensor(wp.vec2)
+     wrap_vec_tensor(wp.vec3)
+     wrap_vec_tensor(wp.vec4)
+     wrap_vec_tensor(wp.spatial_vector)
+     wrap_vec_tensor(wp.transform)
+
+     def wrap_mat_tensor(mat_dtype):
+         t = torch.zeros((10, *mat_dtype._shape_), dtype=torch.float32, device=torch_device)
+         a = wp.from_torch(t, dtype=mat_dtype, return_ctype=True)
+         ctype_size = ctypes.sizeof(mat_dtype._type_)
+         assert a.data == t.data_ptr()
+         assert a.grad == 0
+         assert a.ndim == 1
+         assert a.shape[0] == t.shape[0]
+         assert a.strides[0] == t.stride()[0] * ctype_size
+
+     wrap_mat_tensor(wp.mat22)
+     wrap_mat_tensor(wp.mat33)
+     wrap_mat_tensor(wp.mat44)
+     wrap_mat_tensor(wp.spatial_matrix)
+
+     def wrap_vec_tensor_with_existing_grad(vec_dtype):
+         t = torch.zeros((10, vec_dtype._length_), dtype=torch.float32, device=torch_device, requires_grad=True)
+         t.grad = torch.zeros((10, vec_dtype._length_), dtype=torch.float32, device=torch_device)
+         a = wp.from_torch(t, dtype=vec_dtype, return_ctype=True)
+         ctype_size = ctypes.sizeof(vec_dtype._type_)
+         assert a.data == t.data_ptr()
+         assert a.grad == t.grad.data_ptr()
+         assert a.ndim == 1
+         assert a.shape[0] == t.shape[0]
+         assert a.strides[0] == t.stride()[0] * ctype_size
+
+     wrap_vec_tensor_with_existing_grad(wp.vec2)
+     wrap_vec_tensor_with_existing_grad(wp.vec3)
+     wrap_vec_tensor_with_existing_grad(wp.vec4)
+     wrap_vec_tensor_with_existing_grad(wp.spatial_vector)
+     wrap_vec_tensor_with_existing_grad(wp.transform)
+
+     def wrap_vec_tensor_with_new_grad(vec_dtype):
+         t = torch.zeros((10, vec_dtype._length_), dtype=torch.float32, device=torch_device)
+         a = wp.from_torch(t, dtype=vec_dtype, requires_grad=True, return_ctype=True)
+         ctype_size = ctypes.sizeof(vec_dtype._type_)
+         assert a.data == t.data_ptr()
+         assert a.grad == t.grad.data_ptr()
+         assert a.ndim == 1
+         assert a.shape[0] == t.shape[0]
+         assert a.strides[0] == t.stride()[0] * ctype_size
+
+     wrap_vec_tensor_with_new_grad(wp.vec2)
+     wrap_vec_tensor_with_new_grad(wp.vec3)
+     wrap_vec_tensor_with_new_grad(wp.vec4)
+     wrap_vec_tensor_with_new_grad(wp.spatial_vector)
+     wrap_vec_tensor_with_new_grad(wp.transform)
+
+     def wrap_vec_tensor_with_torch_grad(vec_dtype):
+         t = torch.zeros((10, vec_dtype._length_), dtype=torch.float32, device=torch_device)
+         grad = torch.zeros((10, vec_dtype._length_), dtype=torch.float32, device=torch_device)
+         a = wp.from_torch(t, dtype=vec_dtype, grad=grad, return_ctype=True)
+         ctype_size = ctypes.sizeof(vec_dtype._type_)
+         assert a.data == t.data_ptr()
+         assert a.grad == grad.data_ptr()
+         assert a.ndim == 1
+         assert a.shape[0] == t.shape[0]
+         assert a.strides[0] == t.stride()[0] * ctype_size
+
+     wrap_vec_tensor_with_torch_grad(wp.vec2)
+     wrap_vec_tensor_with_torch_grad(wp.vec3)
+     wrap_vec_tensor_with_torch_grad(wp.vec4)
+     wrap_vec_tensor_with_torch_grad(wp.spatial_vector)
+     wrap_vec_tensor_with_torch_grad(wp.transform)
+
+     def wrap_vec_tensor_with_warp_grad(vec_dtype):
+         t = torch.zeros((10, vec_dtype._length_), dtype=torch.float32, device=torch_device)
+         grad = wp.zeros(10, dtype=vec_dtype, device=device)
+         a = wp.from_torch(t, dtype=vec_dtype, grad=grad, return_ctype=True)
+         ctype_size = ctypes.sizeof(vec_dtype._type_)
+         assert a.data == t.data_ptr()
+         assert a.grad == grad.ptr
+         assert a.ndim == 1
+         assert a.shape[0] == t.shape[0]
+         assert a.strides[0] == t.stride()[0] * ctype_size
+
+     wrap_vec_tensor_with_warp_grad(wp.vec2)
+     wrap_vec_tensor_with_warp_grad(wp.vec3)
+     wrap_vec_tensor_with_warp_grad(wp.vec4)
+     wrap_vec_tensor_with_warp_grad(wp.spatial_vector)
+     wrap_vec_tensor_with_warp_grad(wp.transform)
+
+
+ def test_cuda_array_interface(test, device):
+     # We should be able to construct Torch tensors from Warp arrays via __cuda_array_interface__ on GPU.
+     # Note that Torch does not support __array_interface__ on CPU.
+
+     torch_device = wp.device_to_torch(device)
+     n = 10
+
+     # test the types supported by both Warp and Torch
+     scalar_types = [wp.float16, wp.float32, wp.float64, wp.int8, wp.int16, wp.int32, wp.int64, wp.uint8]
+
+     for dtype in scalar_types:
+         # test round trip
+         a1 = wp.zeros(n, dtype=dtype, device=device)
+         t = torch.tensor(a1, device=torch_device)
+         a2 = wp.array(t, device=device)
+
+         assert a1.dtype == a2.dtype
+         assert a1.shape == a2.shape
+         assert a1.strides == a2.strides
+
+
+ @wp.kernel
+ def vec_sum_kernel(x: wp.array(dtype=wp.vec3), y: wp.array(dtype=wp.vec3), z: wp.array(dtype=wp.vec3)):
+     tid = wp.tid()
+     z[tid] = x[tid] + y[tid]
+
+
+ # ensure torch arrays passed to Warp kernels are unchanged by Tape.backward()
+ def test_tensor_in_warp_kernel(test, device):
+     torch_device = wp.device_to_torch(device)
+
+     x = torch.ones((10, 3), dtype=torch.float32, device=torch_device)
+     y = torch.ones((10, 3), dtype=torch.float32, device=torch_device)
+     wp_y = wp.from_torch(y, dtype=wp.vec3, requires_grad=True)
+     z = torch.zeros((10, 3), dtype=torch.float32, device=torch_device)
+     wp_z = wp.from_torch(z, dtype=wp.vec3, requires_grad=True)
+
+     tape = wp.Tape()
+
+     with tape:
+         wp.launch(vec_sum_kernel, dim=10, inputs=[x, wp_y], outputs=[wp_z], device=device)
+
+     assert_np_equal(x.cpu().numpy(), np.ones((10, 3), dtype=float))
+
+     tape.backward(grads={wp_z: wp.ones_like(wp_z)})
+
+     # x is unchanged by Tape.backward()
+     assert_np_equal(x.cpu().numpy(), np.ones((10, 3), dtype=float))
+
+     # we can still compute the gradient of y because Warp created an array for it
+     assert_np_equal(y.grad.cpu().numpy(), np.ones((10, 3), dtype=float))
+
+
+ def test_to_torch(test, device):
+     import torch
+
+     def wrap_scalar_array(warp_dtype, expected_torch_dtype):
+         a = wp.zeros(10, dtype=warp_dtype, device=device)
+         t = wp.to_torch(a)
+         assert t.dtype == expected_torch_dtype
+         assert tuple(t.shape) == a.shape
+
+     wrap_scalar_array(wp.float64, torch.float64)
+     wrap_scalar_array(wp.float32, torch.float32)
+     wrap_scalar_array(wp.float16, torch.float16)
+     wrap_scalar_array(wp.int64, torch.int64)
+     wrap_scalar_array(wp.int32, torch.int32)
+     wrap_scalar_array(wp.int16, torch.int16)
+     wrap_scalar_array(wp.int8, torch.int8)
+     wrap_scalar_array(wp.uint8, torch.uint8)
+     wrap_scalar_array(wp.bool, torch.bool)
+
+     # not supported by torch
+     # wrap_scalar_array(wp.uint64, torch.int64)
+     # wrap_scalar_array(wp.uint32, torch.int32)
+     # wrap_scalar_array(wp.uint16, torch.int16)
+
+     def wrap_vec_array(n, warp_dtype):
+         a = wp.zeros(10, dtype=warp_dtype, device=device)
+         t = wp.to_torch(a)
+         assert t.dtype == torch.float32
+         assert tuple(t.shape) == (10, n)
+
+     wrap_vec_array(2, wp.vec2)
+     wrap_vec_array(3, wp.vec3)
+     wrap_vec_array(4, wp.vec4)
+     wrap_vec_array(6, wp.spatial_vector)
+     wrap_vec_array(7, wp.transform)
+
+     def wrap_mat_array(n, m, warp_dtype):
+         a = wp.zeros(10, dtype=warp_dtype, device=device)
+         t = wp.to_torch(a)
+         assert t.dtype == torch.float32
+         assert tuple(t.shape) == (10, n, m)
+
+     wrap_mat_array(2, 2, wp.mat22)
+     wrap_mat_array(3, 3, wp.mat33)
+     wrap_mat_array(4, 4, wp.mat44)
+     wrap_mat_array(6, 6, wp.spatial_matrix)
+
+
+ def test_from_torch_slices(test, device):
+     import torch
+
+     torch_device = wp.device_to_torch(device)
+
+     # 1D slice, contiguous
+     t_base = torch.arange(10, dtype=torch.float32, device=torch_device)
+     t = t_base[2:9]
+     a = wp.from_torch(t)
+     assert a.ptr == t.data_ptr()
+     assert a.is_contiguous
+     assert a.shape == tuple(t.shape)
+     assert_np_equal(a.numpy(), t.cpu().numpy())
+
+     # 1D slice with non-contiguous stride
+     t_base = torch.arange(10, dtype=torch.float32, device=torch_device)
+     t = t_base[2:9:2]
+     a = wp.from_torch(t)
+     assert a.ptr == t.data_ptr()
+     assert not a.is_contiguous
+     assert a.shape == tuple(t.shape)
+     # copy contents to contiguous array
+     a_contiguous = wp.empty_like(a)
+     wp.launch(copy1d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+     assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+     # 2D slices (non-contiguous)
+     t_base = torch.arange(24, dtype=torch.float32, device=torch_device).reshape((4, 6))
+     t = t_base[1:3, 2:5]
+     a = wp.from_torch(t)
+     assert a.ptr == t.data_ptr()
+     assert not a.is_contiguous
+     assert a.shape == tuple(t.shape)
+     # copy contents to contiguous array
+     a_contiguous = wp.empty_like(a)
+     wp.launch(copy2d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+     assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+     # 3D slices (non-contiguous)
+     t_base = torch.arange(36, dtype=torch.float32, device=torch_device).reshape((4, 3, 3))
+     t = t_base[::2, 0:1, 1:2]
+     a = wp.from_torch(t)
+     assert a.ptr == t.data_ptr()
+     assert not a.is_contiguous
+     assert a.shape == tuple(t.shape)
+     # copy contents to contiguous array
+     a_contiguous = wp.empty_like(a)
+     wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+     assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+     # 2D slices of vec3 (inner contiguous, outer non-contiguous)
+     t_base = torch.arange(150, dtype=torch.float32, device=torch_device).reshape((10, 5, 3))
+     t = t_base[1:7:2, 2:5]
+     a = wp.from_torch(t, dtype=wp.vec3)
+     assert a.ptr == t.data_ptr()
+     assert not a.is_contiguous
+     assert a.shape == tuple(t.shape[:-1])
+     # copy contents to contiguous array
+     a_contiguous = wp.empty_like(a)
+     wp.launch(copy2d_vec3_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+     assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+     # 2D slices of mat22 (inner contiguous, outer non-contiguous)
+     t_base = torch.arange(200, dtype=torch.float32, device=torch_device).reshape((10, 5, 2, 2))
+     t = t_base[1:7:2, 2:5]
+     a = wp.from_torch(t, dtype=wp.mat22)
+     assert a.ptr == t.data_ptr()
+     assert not a.is_contiguous
+     assert a.shape == tuple(t.shape[:-2])
+     # copy contents to contiguous array
+     a_contiguous = wp.empty_like(a)
+     wp.launch(copy2d_mat22_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+     assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+
+ def test_from_torch_zero_strides(test, device):
+     import torch
+
+     torch_device = wp.device_to_torch(device)
+
+     t_base = torch.arange(9, dtype=torch.float32, device=torch_device).reshape((3, 3))
+
+     # expand outermost dimension
+     t = t_base.unsqueeze(0).expand(3, -1, -1)
+     a = wp.from_torch(t)
+     assert a.ptr == t.data_ptr()
+     assert not a.is_contiguous
+     assert a.shape == tuple(t.shape)
+     a_contiguous = wp.empty_like(a)
+     wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+     assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+     # expand middle dimension
+     t = t_base.unsqueeze(1).expand(-1, 3, -1)
+     a = wp.from_torch(t)
+     assert a.ptr == t.data_ptr()
+     assert not a.is_contiguous
+     assert a.shape == tuple(t.shape)
+     a_contiguous = wp.empty_like(a)
+     wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+     assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+     # expand innermost dimension
+     t = t_base.unsqueeze(2).expand(-1, -1, 3)
+     a = wp.from_torch(t)
+     assert a.ptr == t.data_ptr()
+     assert not a.is_contiguous
+     assert a.shape == tuple(t.shape)
+     a_contiguous = wp.empty_like(a)
+     wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+     assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+
+ def test_torch_mgpu_from_torch(test, device):
+     import torch
+
+     n = 32
+
+     t0 = torch.arange(0, n, 1, dtype=torch.int32, device="cuda:0")
+     t1 = torch.arange(0, n * 2, 2, dtype=torch.int32, device="cuda:1")
+
+     a0 = wp.from_torch(t0, dtype=wp.int32)
+     a1 = wp.from_torch(t1, dtype=wp.int32)
+
+     assert a0.device == "cuda:0"
+     assert a1.device == "cuda:1"
+
+     expected0 = np.arange(0, n, 1)
+     expected1 = np.arange(0, n * 2, 2)
+
+     assert_np_equal(a0.numpy(), expected0)
+     assert_np_equal(a1.numpy(), expected1)
+
+
+ def test_torch_mgpu_to_torch(test, device):
+     n = 32
+
+     with wp.ScopedDevice("cuda:0"):
+         a0 = wp.empty(n, dtype=wp.int32)
+         wp.launch(arange, dim=a0.size, inputs=[0, 1, a0])
+
+     with wp.ScopedDevice("cuda:1"):
+         a1 = wp.empty(n, dtype=wp.int32)
+         wp.launch(arange, dim=a1.size, inputs=[0, 2, a1])
+
+     t0 = wp.to_torch(a0)
+     t1 = wp.to_torch(a1)
+
+     assert str(t0.device) == "cuda:0"
+     assert str(t1.device) == "cuda:1"
+
+     expected0 = np.arange(0, n, 1, dtype=np.int32)
+     expected1 = np.arange(0, n * 2, 2, dtype=np.int32)
+
+     assert_np_equal(t0.cpu().numpy(), expected0)
+     assert_np_equal(t1.cpu().numpy(), expected1)
+
+
+ def test_torch_mgpu_interop(test, device):
+     import torch
+
+     n = 1024 * 1024
+
+     with torch.cuda.device(0):
+         t0 = torch.arange(n, dtype=torch.float32, device="cuda")
+         a0 = wp.from_torch(t0)
+         wp.launch(inc, dim=a0.size, inputs=[a0], stream=wp.stream_from_torch())
+
+     with torch.cuda.device(1):
+         t1 = torch.arange(n, dtype=torch.float32, device="cuda")
+         a1 = wp.from_torch(t1)
+         wp.launch(inc, dim=a1.size, inputs=[a1], stream=wp.stream_from_torch())
+
+     assert a0.device == "cuda:0"
+     assert a1.device == "cuda:1"
+
+     expected = np.arange(n, dtype=int) + 1
+
+     # ensure the torch tensors were modified by warp
+     assert_np_equal(t0.cpu().numpy(), expected)
+     assert_np_equal(t1.cpu().numpy(), expected)
+
+
+ def test_torch_autograd(test, device):
+     """Test torch autograd with a custom Warp op"""
+
+     import torch
+
+     # custom autograd op
+     class TestFunc(torch.autograd.Function):
+         @staticmethod
+         def forward(ctx, x):
+             # allocate output array
+             y = torch.empty_like(x)
+
+             ctx.x = x
+             ctx.y = y
+
+             wp.launch(kernel=op_kernel, dim=len(x), inputs=[wp.from_torch(x)], outputs=[wp.from_torch(y)])
+
+             return y
+
+         @staticmethod
+         def backward(ctx, adj_y):
+             # adjoints should be allocated as zero initialized
+             adj_x = torch.zeros_like(ctx.x).contiguous()
+             adj_y = adj_y.contiguous()
+
+             wp_x = wp.from_torch(ctx.x, grad=adj_x)
+             wp_y = wp.from_torch(ctx.y, grad=adj_y)
+
+             wp.launch(
+                 kernel=op_kernel,
+                 dim=len(ctx.x),
+                 # fwd inputs
+                 inputs=[wp_x],
+                 outputs=[wp_y],
+                 # adj inputs (already stored in input/output arrays, passing null pointers)
+                 adj_inputs=[None],
+                 adj_outputs=[None],
+                 adjoint=True,
+             )
+
+             return adj_x
+
+     # run autograd on given device
+     with wp.ScopedDevice(device):
+         torch_device = wp.device_to_torch(device)
+
+         # input data
+         x = torch.ones(16, dtype=torch.float32, device=torch_device, requires_grad=True)
+
+         # execute op
+         y = TestFunc.apply(x)
+
+         # compute grads
+         l = y.sum()
+         l.backward()
+
+         passed = (x.grad == -2.0).all()
+         assert passed.item()
+
+
+ def test_torch_graph_torch_stream(test, device):
+     """Capture Torch graph on Torch stream"""
+
+     wp.load_module(device=device)
+
+     import torch
+
+     torch_device = wp.device_to_torch(device)
+
+     n = 1024 * 1024
+     t = torch.zeros(n, dtype=torch.float32, device=torch_device)
+     a = wp.from_torch(t)
+
+     g = torch.cuda.CUDAGraph()
+
+     # create a device-specific torch stream to use for capture
+     # (otherwise torch.cuda.graph reuses its capture stream, which can be problematic if it's from a different device)
+     torch_stream = torch.cuda.Stream(device=torch_device)
+
+     # make warp use the same stream
+     warp_stream = wp.stream_from_torch(torch_stream)
+
+     # capture graph
+     with wp.ScopedStream(warp_stream), torch.cuda.graph(g, stream=torch_stream):
+         wp.capture_begin(force_module_load=False, external=True)
+         try:
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+         finally:
+             wp.capture_end()
+
+     # replay graph
+     num_iters = 10
+     for _i in range(num_iters):
+         g.replay()
+
+     passed = (t == num_iters * 4.0).all()
+     assert passed.item()
+
+
+ def test_torch_graph_warp_stream(test, device):
+     """Capture Torch graph on Warp stream"""
+
+     import torch
+
+     torch_device = wp.device_to_torch(device)
+
+     n = 1024 * 1024
+     t = torch.zeros(n, dtype=torch.float32, device=torch_device)
+     a = wp.from_torch(t)
+
+     g = torch.cuda.CUDAGraph()
+
+     # make torch use the warp stream from the given device
+     torch_stream = wp.stream_to_torch(device)
+
+     # capture graph
+     with wp.ScopedDevice(device), torch.cuda.graph(g, stream=torch_stream):
+         wp.capture_begin(force_module_load=False, external=True)
+         try:
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+         finally:
+             wp.capture_end()
+
+     # replay graph
+     num_iters = 10
+     for _i in range(num_iters):
+         g.replay()
+
+     passed = (t == num_iters * 4.0).all()
+     assert passed.item()
+
+
+ def test_warp_graph_warp_stream(test, device):
+     """Capture Warp graph on Warp stream"""
+
+     import torch
+
+     torch_device = wp.device_to_torch(device)
+
+     n = 1024 * 1024
+     t = torch.zeros(n, dtype=torch.float32, device=torch_device)
+     a = wp.from_torch(t)
+
+     # make torch use the warp stream from the given device
+     torch_stream = wp.stream_to_torch(device)
+
+     # capture graph
+     with wp.ScopedDevice(device), torch.cuda.stream(torch_stream):
+         wp.capture_begin(force_module_load=False)
+         try:
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+         finally:
+             g = wp.capture_end()
+
+     # replay graph
+     num_iters = 10
+     for _i in range(num_iters):
+         wp.capture_launch(g)
+
+     passed = (t == num_iters * 4.0).all()
+     assert passed.item()
+
+
+ def test_warp_graph_torch_stream(test, device):
+     """Capture Warp graph on Torch stream"""
+
+     wp.load_module(device=device)
+
+     import torch
+
+     torch_device = wp.device_to_torch(device)
+
+     n = 1024 * 1024
+     t = torch.zeros(n, dtype=torch.float32, device=torch_device)
+     a = wp.from_torch(t)
+
+     # create a device-specific torch stream to use for capture
+     # (the default torch stream is not suitable for graph capture)
+     torch_stream = torch.cuda.Stream(device=torch_device)
+
+     # make warp use the same stream
+     warp_stream = wp.stream_from_torch(torch_stream)
+
+     # capture graph
+     with wp.ScopedStream(warp_stream), torch.cuda.stream(torch_stream):
+         wp.capture_begin(force_module_load=False)
+         try:
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+         finally:
+             g = wp.capture_end()
+
+     # replay graph
+     num_iters = 10
+     for _i in range(num_iters):
+         wp.capture_launch(g)
+
+     passed = (t == num_iters * 4.0).all()
+     assert passed.item()
+
+
+ def test_direct(test, device):
+     """Pass Torch tensors to Warp kernels directly"""
+
+     import torch
+
+     torch_device = wp.device_to_torch(device)
+     n = 12
+
+     s = torch.arange(n, dtype=torch.float32, device=torch_device)
+     v = torch.arange(n, dtype=torch.float32, device=torch_device).reshape((n // 3, 3))
+     m = torch.arange(n, dtype=torch.float32, device=torch_device).reshape((n // 4, 2, 2))
+
+     wp.launch(inc, dim=n, inputs=[s], device=device)
+     wp.launch(inc_vector, dim=n // 3, inputs=[v], device=device)
+     wp.launch(inc_matrix, dim=n // 4, inputs=[m], device=device)
+
+     expected = torch.arange(1, n + 1, dtype=torch.float32, device=torch_device)
+
+     assert torch.equal(s, expected)
+     assert torch.equal(v.reshape(n), expected)
+     assert torch.equal(m.reshape(n), expected)
+
+
+ class TestTorch(unittest.TestCase):
+     pass
+
+
+ test_devices = get_test_devices()
+
+ try:
+     import torch
+
+     # check which Warp devices work with Torch
+     # CUDA devices may fail if Torch was not compiled with CUDA support
+     torch_compatible_devices = []
+     torch_compatible_cuda_devices = []
+
+     for d in test_devices:
+         try:
+             t = torch.arange(10, device=wp.device_to_torch(d))
+             t += 1
+             torch_compatible_devices.append(d)
+             if d.is_cuda:
+                 torch_compatible_cuda_devices.append(d)
+         except Exception as e:
+             print(f"Skipping Torch tests on device '{d}' due to exception: {e}")
+
+     add_function_test(TestTorch, "test_dtype_from_torch", test_dtype_from_torch, devices=None)
+     add_function_test(TestTorch, "test_dtype_to_torch", test_dtype_to_torch, devices=None)
+
+     if torch_compatible_devices:
+         add_function_test(TestTorch, "test_device_conversion", test_device_conversion, devices=torch_compatible_devices)
+         add_function_test(TestTorch, "test_from_torch", test_from_torch, devices=torch_compatible_devices)
+         add_function_test(TestTorch, "test_from_torch_slices", test_from_torch_slices, devices=torch_compatible_devices)
+         add_function_test(
+             TestTorch, "test_array_ctype_from_torch", test_array_ctype_from_torch, devices=torch_compatible_devices
+         )
+         add_function_test(
+             TestTorch,
+             "test_from_torch_zero_strides",
+             test_from_torch_zero_strides,
+             devices=torch_compatible_devices,
+         )
+         add_function_test(TestTorch, "test_to_torch", test_to_torch, devices=torch_compatible_devices)
+         add_function_test(TestTorch, "test_torch_zerocopy", test_torch_zerocopy, devices=torch_compatible_devices)
+         add_function_test(TestTorch, "test_torch_autograd", test_torch_autograd, devices=torch_compatible_devices)
+         add_function_test(TestTorch, "test_direct", test_direct, devices=torch_compatible_devices)
+         add_function_test(
+             TestTorch, "test_tensor_in_warp_kernel", test_tensor_in_warp_kernel, devices=torch_compatible_devices
+         )
+
+     if torch_compatible_cuda_devices:
+         add_function_test(
+             TestTorch,
+             "test_torch_graph_torch_stream",
+             test_torch_graph_torch_stream,
+             devices=torch_compatible_cuda_devices,
+         )
+         add_function_test(
+             TestTorch,
+             "test_torch_graph_warp_stream",
+             test_torch_graph_warp_stream,
+             devices=torch_compatible_cuda_devices,
+         )
+         add_function_test(
+             TestTorch,
+             "test_warp_graph_warp_stream",
+             test_warp_graph_warp_stream,
+             devices=torch_compatible_cuda_devices,
+         )
+         add_function_test(
+             TestTorch,
+             "test_warp_graph_torch_stream",
+             test_warp_graph_torch_stream,
+             devices=torch_compatible_cuda_devices,
+         )
+         add_function_test(
+             TestTorch, "test_cuda_array_interface", test_cuda_array_interface, devices=torch_compatible_cuda_devices
+         )
+
+     # multi-GPU tests
+     if len(torch_compatible_cuda_devices) > 1:
+         add_function_test(TestTorch, "test_torch_mgpu_from_torch", test_torch_mgpu_from_torch)
+         add_function_test(TestTorch, "test_torch_mgpu_to_torch", test_torch_mgpu_to_torch)
+         add_function_test(TestTorch, "test_torch_mgpu_interop", test_torch_mgpu_interop)
+
+ except Exception as e:
+     print(f"Skipping Torch tests due to exception: {e}")
+
+
+ if __name__ == "__main__":
+     wp.clear_kernel_cache()
+     unittest.main(verbosity=2)
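
For readers skimming this release diff, the pattern the new test_torch.py exercises is the zero-copy bridge between Warp arrays and PyTorch tensors. Below is a minimal sketch of that round trip, not part of the package; it assumes a working torch install and uses only calls that appear in the diff above (wp.from_torch, wp.to_torch, wp.device_to_torch, wp.launch), plus wp.get_device() to pick the current device.

import torch
import warp as wp


@wp.kernel
def scale(x: wp.array(dtype=float), y: wp.array(dtype=float)):
    tid = wp.tid()
    y[tid] = 2.0 * x[tid]


# wrap an existing torch tensor without copying (the Warp array shares its memory)
device = wp.get_device()
t = torch.arange(10, dtype=torch.float32, device=wp.device_to_torch(device))
x = wp.from_torch(t)

# allocate an output array on the same device and run the kernel over 10 threads
y = wp.zeros(10, dtype=float, device=device)
wp.launch(scale, dim=10, inputs=[x], outputs=[y], device=device)

# view the result as a torch tensor, again without a copy
print(wp.to_torch(y))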