warp-lang 0.11.0-py3-none-manylinux2014_x86_64.whl → 1.0.0-py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (170)
  1. warp/__init__.py +8 -0
  2. warp/bin/warp-clang.so +0 -0
  3. warp/bin/warp.so +0 -0
  4. warp/build.py +7 -6
  5. warp/build_dll.py +70 -79
  6. warp/builtins.py +10 -6
  7. warp/codegen.py +51 -19
  8. warp/config.py +7 -8
  9. warp/constants.py +3 -0
  10. warp/context.py +948 -245
  11. warp/dlpack.py +198 -113
  12. warp/examples/assets/bunny.usd +0 -0
  13. warp/examples/assets/cartpole.urdf +110 -0
  14. warp/examples/assets/crazyflie.usd +0 -0
  15. warp/examples/assets/cube.usda +42 -0
  16. warp/examples/assets/nv_ant.xml +92 -0
  17. warp/examples/assets/nv_humanoid.xml +183 -0
  18. warp/examples/assets/quadruped.urdf +268 -0
  19. warp/examples/assets/rocks.nvdb +0 -0
  20. warp/examples/assets/rocks.usd +0 -0
  21. warp/examples/assets/sphere.usda +56 -0
  22. warp/examples/assets/torus.usda +105 -0
  23. warp/examples/benchmarks/benchmark_api.py +383 -0
  24. warp/examples/benchmarks/benchmark_cloth.py +279 -0
  25. warp/examples/benchmarks/benchmark_cloth_cupy.py +88 -0
  26. warp/examples/benchmarks/benchmark_cloth_jax.py +100 -0
  27. warp/examples/benchmarks/benchmark_cloth_numba.py +142 -0
  28. warp/examples/benchmarks/benchmark_cloth_numpy.py +77 -0
  29. warp/examples/benchmarks/benchmark_cloth_pytorch.py +86 -0
  30. warp/examples/benchmarks/benchmark_cloth_taichi.py +112 -0
  31. warp/examples/benchmarks/benchmark_cloth_warp.py +146 -0
  32. warp/examples/benchmarks/benchmark_launches.py +295 -0
  33. warp/examples/core/example_dem.py +221 -0
  34. warp/examples/core/example_fluid.py +267 -0
  35. warp/examples/core/example_graph_capture.py +129 -0
  36. warp/examples/core/example_marching_cubes.py +177 -0
  37. warp/examples/core/example_mesh.py +154 -0
  38. warp/examples/core/example_mesh_intersect.py +193 -0
  39. warp/examples/core/example_nvdb.py +169 -0
  40. warp/examples/core/example_raycast.py +89 -0
  41. warp/examples/core/example_raymarch.py +178 -0
  42. warp/examples/core/example_render_opengl.py +141 -0
  43. warp/examples/core/example_sph.py +389 -0
  44. warp/examples/core/example_torch.py +181 -0
  45. warp/examples/core/example_wave.py +249 -0
  46. warp/examples/fem/bsr_utils.py +380 -0
  47. warp/examples/fem/example_apic_fluid.py +391 -0
  48. warp/examples/fem/example_convection_diffusion.py +168 -0
  49. warp/examples/fem/example_convection_diffusion_dg.py +209 -0
  50. warp/examples/fem/example_convection_diffusion_dg0.py +194 -0
  51. warp/examples/fem/example_deformed_geometry.py +159 -0
  52. warp/examples/fem/example_diffusion.py +173 -0
  53. warp/examples/fem/example_diffusion_3d.py +152 -0
  54. warp/examples/fem/example_diffusion_mgpu.py +214 -0
  55. warp/examples/fem/example_mixed_elasticity.py +222 -0
  56. warp/examples/fem/example_navier_stokes.py +243 -0
  57. warp/examples/fem/example_stokes.py +192 -0
  58. warp/examples/fem/example_stokes_transfer.py +249 -0
  59. warp/examples/fem/mesh_utils.py +109 -0
  60. warp/examples/fem/plot_utils.py +287 -0
  61. warp/examples/optim/example_bounce.py +248 -0
  62. warp/examples/optim/example_cloth_throw.py +210 -0
  63. warp/examples/optim/example_diffray.py +535 -0
  64. warp/examples/optim/example_drone.py +850 -0
  65. warp/examples/optim/example_inverse_kinematics.py +169 -0
  66. warp/examples/optim/example_inverse_kinematics_torch.py +170 -0
  67. warp/examples/optim/example_spring_cage.py +234 -0
  68. warp/examples/optim/example_trajectory.py +201 -0
  69. warp/examples/sim/example_cartpole.py +128 -0
  70. warp/examples/sim/example_cloth.py +184 -0
  71. warp/examples/sim/example_granular.py +113 -0
  72. warp/examples/sim/example_granular_collision_sdf.py +185 -0
  73. warp/examples/sim/example_jacobian_ik.py +213 -0
  74. warp/examples/sim/example_particle_chain.py +106 -0
  75. warp/examples/sim/example_quadruped.py +179 -0
  76. warp/examples/sim/example_rigid_chain.py +191 -0
  77. warp/examples/sim/example_rigid_contact.py +176 -0
  78. warp/examples/sim/example_rigid_force.py +126 -0
  79. warp/examples/sim/example_rigid_gyroscopic.py +97 -0
  80. warp/examples/sim/example_rigid_soft_contact.py +124 -0
  81. warp/examples/sim/example_soft_body.py +178 -0
  82. warp/fabric.py +29 -20
  83. warp/fem/cache.py +0 -1
  84. warp/fem/dirichlet.py +0 -2
  85. warp/fem/integrate.py +0 -1
  86. warp/jax.py +45 -0
  87. warp/jax_experimental.py +339 -0
  88. warp/native/builtin.h +12 -0
  89. warp/native/bvh.cu +18 -18
  90. warp/native/clang/clang.cpp +8 -3
  91. warp/native/cuda_util.cpp +94 -5
  92. warp/native/cuda_util.h +35 -6
  93. warp/native/cutlass_gemm.cpp +1 -1
  94. warp/native/cutlass_gemm.cu +4 -1
  95. warp/native/error.cpp +66 -0
  96. warp/native/error.h +27 -0
  97. warp/native/mesh.cu +2 -2
  98. warp/native/reduce.cu +4 -4
  99. warp/native/runlength_encode.cu +2 -2
  100. warp/native/scan.cu +2 -2
  101. warp/native/sparse.cu +0 -1
  102. warp/native/temp_buffer.h +2 -2
  103. warp/native/warp.cpp +95 -60
  104. warp/native/warp.cu +1053 -218
  105. warp/native/warp.h +49 -32
  106. warp/optim/linear.py +33 -16
  107. warp/render/render_opengl.py +202 -101
  108. warp/render/render_usd.py +82 -40
  109. warp/sim/__init__.py +13 -4
  110. warp/sim/articulation.py +4 -5
  111. warp/sim/collide.py +320 -175
  112. warp/sim/import_mjcf.py +25 -30
  113. warp/sim/import_urdf.py +94 -63
  114. warp/sim/import_usd.py +51 -36
  115. warp/sim/inertia.py +3 -2
  116. warp/sim/integrator.py +233 -0
  117. warp/sim/integrator_euler.py +447 -469
  118. warp/sim/integrator_featherstone.py +1991 -0
  119. warp/sim/integrator_xpbd.py +1420 -640
  120. warp/sim/model.py +765 -487
  121. warp/sim/particles.py +2 -1
  122. warp/sim/render.py +35 -13
  123. warp/sim/utils.py +222 -11
  124. warp/stubs.py +8 -0
  125. warp/tape.py +16 -1
  126. warp/tests/aux_test_grad_customs.py +23 -0
  127. warp/tests/test_array.py +190 -1
  128. warp/tests/test_async.py +656 -0
  129. warp/tests/test_bool.py +50 -0
  130. warp/tests/test_dlpack.py +164 -11
  131. warp/tests/test_examples.py +166 -74
  132. warp/tests/test_fem.py +8 -1
  133. warp/tests/test_generics.py +15 -5
  134. warp/tests/test_grad.py +1 -1
  135. warp/tests/test_grad_customs.py +172 -12
  136. warp/tests/test_jax.py +254 -0
  137. warp/tests/test_large.py +29 -6
  138. warp/tests/test_launch.py +25 -0
  139. warp/tests/test_linear_solvers.py +20 -3
  140. warp/tests/test_matmul.py +61 -16
  141. warp/tests/test_matmul_lite.py +13 -13
  142. warp/tests/test_mempool.py +186 -0
  143. warp/tests/test_multigpu.py +3 -0
  144. warp/tests/test_options.py +16 -2
  145. warp/tests/test_peer.py +137 -0
  146. warp/tests/test_print.py +3 -1
  147. warp/tests/test_quat.py +23 -0
  148. warp/tests/test_sim_kinematics.py +97 -0
  149. warp/tests/test_snippet.py +126 -3
  150. warp/tests/test_streams.py +108 -79
  151. warp/tests/test_torch.py +16 -8
  152. warp/tests/test_utils.py +32 -27
  153. warp/tests/test_verify_fp.py +65 -0
  154. warp/tests/test_volume.py +1 -1
  155. warp/tests/unittest_serial.py +2 -0
  156. warp/tests/unittest_suites.py +12 -0
  157. warp/tests/unittest_utils.py +14 -7
  158. warp/thirdparty/unittest_parallel.py +15 -3
  159. warp/torch.py +10 -8
  160. warp/types.py +363 -246
  161. warp/utils.py +143 -19
  162. warp_lang-1.0.0.dist-info/LICENSE.md +126 -0
  163. warp_lang-1.0.0.dist-info/METADATA +394 -0
  164. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/RECORD +167 -86
  165. warp/sim/optimizer.py +0 -138
  166. warp_lang-0.11.0.dist-info/LICENSE.md +0 -36
  167. warp_lang-0.11.0.dist-info/METADATA +0 -238
  168. /warp/tests/{walkthough_debug.py → walkthrough_debug.py} +0 -0
  169. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/WHEEL +0 -0
  170. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/top_level.txt +0 -0
warp/tests/test_matmul.py CHANGED
@@ -10,6 +10,7 @@ import unittest
 import numpy as np
 
 import warp as wp
+from typing import Any
 from warp.tests.unittest_utils import *
 
 wp.init()
@@ -76,7 +77,7 @@ class gemm_test_bed_runner:
         if batch_count == 1:
             tape = wp.Tape()
             with tape:
-                wp.matmul(A, B, C, D, alpha, beta, False, self.device)
+                wp.matmul(A, B, C, D, alpha, beta, False)
             tape.backward(grads={D: ones})
 
             D_np = alpha * (A.numpy() @ B.numpy()) + beta * C.numpy()
@@ -89,7 +90,7 @@ class gemm_test_bed_runner:
         else:
             tape = wp.Tape()
             with tape:
-                wp.batched_matmul(A, B, C, D, alpha, beta, False, self.device)
+                wp.batched_matmul(A, B, C, D, alpha, beta, False)
             tape.backward(grads={D: ones})
 
             D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C.numpy()
@@ -196,9 +197,9 @@ class gemm_test_bed_runner_transpose:
         BTT2 = BT2.transpose([1, 0])
         tape = wp.Tape()
         with tape:
-            wp.matmul(A, BTT1, C1, D1, alpha, beta, False, self.device)
-            wp.matmul(ATT1, B, C2, D2, alpha, beta, False, self.device)
-            wp.matmul(ATT2, BTT2, C3, D3, alpha, beta, False, self.device)
+            wp.matmul(A, BTT1, C1, D1, alpha, beta, False)
+            wp.matmul(ATT1, B, C2, D2, alpha, beta, False)
+            wp.matmul(ATT2, BTT2, C3, D3, alpha, beta, False)
         tape.backward(grads={D1: ones1, D2: ones2, D3: ones3})
 
         D_np = alpha * (A.numpy() @ B.numpy()) + beta * C1.numpy()
@@ -217,9 +218,9 @@ class gemm_test_bed_runner_transpose:
         BTT2 = BT2.transpose([0, 2, 1])
         tape = wp.Tape()
         with tape:
-            wp.batched_matmul(A, BTT1, C1, D1, alpha, beta, False, self.device)
-            wp.batched_matmul(ATT1, B, C2, D2, alpha, beta, False, self.device)
-            wp.batched_matmul(ATT2, BTT2, C3, D3, alpha, beta, False, self.device)
+            wp.batched_matmul(A, BTT1, C1, D1, alpha, beta, False)
+            wp.batched_matmul(ATT1, B, C2, D2, alpha, beta, False)
+            wp.batched_matmul(ATT2, BTT2, C3, D3, alpha, beta, False)
         tape.backward(grads={D1: ones1, D2: ones2, D3: ones3})
 
         D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C1.numpy()
@@ -300,7 +301,7 @@ def test_tape(test, device):
     # test tape
     tape = wp.Tape()
     with tape:
-        wp.matmul(A, B, C, D, device=device)
+        wp.matmul(A, B, C, D)
         wp.launch(matrix_sum_kernel, dim=(m, n), inputs=[D, loss], device=device)
 
     tape.backward(loss=loss)
@@ -308,8 +309,8 @@ def test_tape(test, device):
     tape.reset()
 
     # test adjoint
-    D.grad = wp.array2d(np.ones((m, n)), dtype=float, device=device)
-    wp.adj_matmul(A, B, C, A.grad, B.grad, C.grad, D.grad, device=device)
+    D.grad = wp.ones((m, n), dtype=float, device=device)
+    wp.adj_matmul(A, B, C, A.grad, B.grad, C.grad, D.grad)
     assert_np_equal(A_grad, A.grad.numpy())
 
     # test zero
@@ -342,7 +343,7 @@ def test_operator(test, device):
     tape.backward(loss=loss)
 
     # test adjoint
-    D.grad = wp.array2d(np.ones((m, n)), dtype=float, device=device)
+    D.grad = wp.ones((m, n), dtype=float, device=device)
     B_transpose = wp.array2d(B.transpose().numpy(), dtype=float, device=device)
 
     adj_A = D.grad @ B_transpose
@@ -389,7 +390,7 @@ def test_large_batch_count(test, device):
 
     tape = wp.Tape()
     with tape:
-        wp.batched_matmul(A, B, C, D, alpha=alpha, beta=beta, allow_tf32x3_arith=False, device=device)
+        wp.batched_matmul(A, B, C, D, alpha=alpha, beta=beta, allow_tf32x3_arith=False)
     tape.backward(grads={D: ones})
 
     D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C.numpy()
@@ -420,8 +421,8 @@ def test_adjoint_accumulation(test, device):
     tape = wp.Tape()
 
     with tape:
-        wp.matmul(a_wp, b_wp, c_wp, d1_wp, alpha=1.0, beta=1.0, device=device)
-        wp.matmul(a_wp, b_wp, d1_wp, d2_wp, alpha=1.0, beta=1.0, device=device)
+        wp.matmul(a_wp, b_wp, c_wp, d1_wp, alpha=1.0, beta=1.0)
+        wp.matmul(a_wp, b_wp, d1_wp, d2_wp, alpha=1.0, beta=1.0)
 
     d_grad = wp.zeros_like(d2_wp, device=device)
     d_grad.fill_(1.0)
@@ -433,8 +434,51 @@ def test_adjoint_accumulation(test, device):
     assert np.array_equal(c_wp.grad.numpy(), np.ones(shape=(2, 2)))
 
 
-devices = get_test_devices()
+@unittest.skipUnless(runtime.core.is_cutlass_enabled(), "Warp was not built with CUTLASS support")
+def test_cuda_graph_capture(test, device):
+    @wp.kernel
+    def mat_sum(mat: wp.array2d(dtype=Any), loss: wp.array(dtype=Any)):
+        i, j = wp.tid()
+        e = mat[i, j]
+        wp.atomic_add(loss, 0, e)
+
+    for T in [wp.float16, wp.float32, wp.float64]:
+        wp.overload(mat_sum, [wp.array2d(dtype=T), wp.array(dtype=T)])
+
+    wp.load_module(device=device)
+    wp.load_module(module="warp.utils", device=device)
+
+    for T in [wp.float16, wp.float32, wp.float64]:
+        m = 8
+        n = 8
+        k = 8
+
+        A = wp.ones((m, n), dtype=T, device=device, requires_grad=True)
+        B = wp.ones((n, k), dtype=T, device=device, requires_grad=True)
+        C = wp.zeros((m, k), dtype=T, device=device, requires_grad=True)
+        D = wp.zeros((m, k), dtype=T, device=device, requires_grad=True)
+
+        loss = wp.zeros(1, dtype=T, device=device, requires_grad=True)
+
+        wp.capture_begin(device, force_module_load=False)
+        try:
+            tape = wp.Tape()
+
+            with tape:
+                wp.matmul(A, B, C, D)
+                wp.launch(mat_sum, dim=(m, k), inputs=[D, loss], device=device)
 
+            tape.backward(loss=loss)
+        finally:
+            graph = wp.capture_end(device)
+
+        wp.capture_launch(graph)
+
+        assert_np_equal(A.grad.numpy(), 8.0 * np.ones((m, n), dtype=T))
+
+
+devices = get_test_devices()
+cuda_devices = get_unique_cuda_test_devices()
 
 
 class TestMatmul(unittest.TestCase):
     pass
@@ -447,6 +491,7 @@ add_function_test(TestMatmul, "test_tape", test_tape, devices=devices)
 add_function_test(TestMatmul, "test_operator", test_operator, devices=devices)
 add_function_test(TestMatmul, "test_large_batch_count", test_large_batch_count, devices=devices)
 add_function_test(TestMatmul, "test_adjoint_accumulation", test_adjoint_accumulation, devices=devices)
+add_function_test(TestMatmul, "test_cuda_graph_capture", test_cuda_graph_capture, devices=cuda_devices)
 
 
 if __name__ == "__main__":
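
The pattern above repeats throughout the test suite: in 1.0.0, wp.matmul(), wp.batched_matmul(), and wp.adj_matmul() no longer take a trailing device argument, and the device is taken from the arrays that are passed in. A minimal sketch of the updated call pattern, using only calls that appear in this diff (the shapes and the device string are illustrative, not taken from the diff):

import numpy as np
import warp as wp

wp.init()

device = "cuda:0"  # illustrative; any device the arrays were allocated on works
m, n, k = 8, 8, 8

A = wp.ones((m, n), dtype=wp.float32, device=device, requires_grad=True)
B = wp.ones((n, k), dtype=wp.float32, device=device, requires_grad=True)
C = wp.zeros((m, k), dtype=wp.float32, device=device, requires_grad=True)
D = wp.zeros((m, k), dtype=wp.float32, device=device, requires_grad=True)

tape = wp.Tape()
with tape:
    # 1.0.0 style: no trailing device argument, the arrays' device is used
    wp.matmul(A, B, C, D, alpha=1.0, beta=1.0)

# seed the output gradient and propagate it back through the GEMM
tape.backward(grads={D: wp.ones((m, k), dtype=wp.float32, device=device)})
print(np.allclose(D.numpy(), A.numpy() @ B.numpy() + C.numpy()))
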
warp/tests/test_matmul_lite.py CHANGED
@@ -76,7 +76,7 @@ class gemm_test_bed_runner:
         if batch_count == 1:
             tape = wp.Tape()
             with tape:
-                wp.matmul(A, B, C, D, alpha, beta, False, self.device)
+                wp.matmul(A, B, C, D, alpha, beta, False)
             tape.backward(grads={D: ones})
 
             D_np = alpha * (A.numpy() @ B.numpy()) + beta * C.numpy()
@@ -89,7 +89,7 @@ class gemm_test_bed_runner:
         else:
             tape = wp.Tape()
             with tape:
-                wp.batched_matmul(A, B, C, D, alpha, beta, False, self.device)
+                wp.batched_matmul(A, B, C, D, alpha, beta, False)
             tape.backward(grads={D: ones})
 
             D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C.numpy()
@@ -196,9 +196,9 @@ class gemm_test_bed_runner_transpose:
         BTT2 = BT2.transpose([1, 0])
         tape = wp.Tape()
         with tape:
-            wp.matmul(A, BTT1, C1, D1, alpha, beta, False, self.device)
-            wp.matmul(ATT1, B, C2, D2, alpha, beta, False, self.device)
-            wp.matmul(ATT2, BTT2, C3, D3, alpha, beta, False, self.device)
+            wp.matmul(A, BTT1, C1, D1, alpha, beta, False)
+            wp.matmul(ATT1, B, C2, D2, alpha, beta, False)
+            wp.matmul(ATT2, BTT2, C3, D3, alpha, beta, False)
         tape.backward(grads={D1: ones1, D2: ones2, D3: ones3})
 
         D_np = alpha * (A.numpy() @ B.numpy()) + beta * C1.numpy()
@@ -217,9 +217,9 @@ class gemm_test_bed_runner_transpose:
         BTT2 = BT2.transpose([0, 2, 1])
         tape = wp.Tape()
         with tape:
-            wp.batched_matmul(A, BTT1, C1, D1, alpha, beta, False, self.device)
-            wp.batched_matmul(ATT1, B, C2, D2, alpha, beta, False, self.device)
-            wp.batched_matmul(ATT2, BTT2, C3, D3, alpha, beta, False, self.device)
+            wp.batched_matmul(A, BTT1, C1, D1, alpha, beta, False)
+            wp.batched_matmul(ATT1, B, C2, D2, alpha, beta, False)
+            wp.batched_matmul(ATT2, BTT2, C3, D3, alpha, beta, False)
         tape.backward(grads={D1: ones1, D2: ones2, D3: ones3})
 
         D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C1.numpy()
@@ -288,7 +288,7 @@ def test_tape(test, device):
     # test tape
     tape = wp.Tape()
    with tape:
-        wp.matmul(A, B, C, D, device=device)
+        wp.matmul(A, B, C, D)
         wp.launch(matrix_sum_kernel, dim=(m, n), inputs=[D, loss], device=device)
 
     tape.backward(loss=loss)
@@ -296,8 +296,8 @@ def test_tape(test, device):
     tape.reset()
 
     # test adjoint
-    D.grad = wp.array2d(np.ones((m, n)), dtype=float, device=device)
-    wp.adj_matmul(A, B, C, A.grad, B.grad, C.grad, D.grad, device=device)
+    D.grad = wp.ones((m, n), dtype=float, device=device)
+    wp.adj_matmul(A, B, C, A.grad, B.grad, C.grad, D.grad)
     assert_np_equal(A_grad, A.grad.numpy())
 
     # test zero
@@ -330,7 +330,7 @@ def test_operator(test, device):
     tape.backward(loss=loss)
 
     # test adjoint
-    D.grad = wp.array2d(np.ones((m, n)), dtype=float, device=device)
+    D.grad = wp.ones((m, n), dtype=float, device=device)
     B_transpose = wp.array2d(B.transpose().numpy(), dtype=float, device=device)
 
     adj_A = D.grad @ B_transpose
@@ -377,7 +377,7 @@ def test_large_batch_count(test, device):
 
     tape = wp.Tape()
     with tape:
-        wp.batched_matmul(A, B, C, D, alpha=alpha, beta=beta, allow_tf32x3_arith=False, device=device)
+        wp.batched_matmul(A, B, C, D, alpha=alpha, beta=beta, allow_tf32x3_arith=False)
     tape.backward(grads={D: ones})
 
     D_np = alpha * np.matmul(A.numpy(), B.numpy()) + beta * C.numpy()
warp/tests/test_mempool.py ADDED
@@ -0,0 +1,186 @@
+# Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import unittest
+
+import warp as wp
+from warp.tests.unittest_utils import *
+
+wp.init()
+
+
+def get_device_pair_with_mempool_access_support():
+    devices = wp.get_cuda_devices()
+    for target_device in devices:
+        for peer_device in devices:
+            if target_device != peer_device:
+                if wp.is_mempool_access_supported(target_device, peer_device):
+                    return (target_device, peer_device)
+    return None
+
+
+def get_device_pair_without_mempool_access_support():
+    devices = wp.get_cuda_devices()
+    for target_device in devices:
+        for peer_device in devices:
+            if target_device != peer_device:
+                if not wp.is_mempool_access_supported(target_device, peer_device):
+                    return (target_device, peer_device)
+    return None
+
+
+def test_mempool_release_threshold(test, device):
+
+    device = wp.get_device(device)
+
+    assert device.is_mempool_supported
+
+    test.assertEqual(wp.is_mempool_supported(device), device.is_mempool_supported)
+
+    was_enabled = wp.is_mempool_enabled(device)
+
+    # toggle
+    wp.set_mempool_enabled(device, not was_enabled)
+    test.assertEqual(wp.is_mempool_enabled(device), not was_enabled)
+
+    # restore
+    wp.set_mempool_enabled(device, was_enabled)
+    test.assertEqual(wp.is_mempool_enabled(device), was_enabled)
+
+    saved_threshold = wp.get_mempool_release_threshold(device)
+
+    # set new threshold
+    wp.set_mempool_release_threshold(device, 42000)
+    test.assertEqual(wp.get_mempool_release_threshold(device), 42000)
+
+    # restore threshold
+    wp.set_mempool_release_threshold(device, saved_threshold)
+    test.assertEqual(wp.get_mempool_release_threshold(device), saved_threshold)
+
+
+def test_mempool_exceptions(test, device):
+
+    device = wp.get_device(device)
+
+    assert not device.is_mempool_supported
+
+    if device.is_cuda:
+        expected_error = RuntimeError
+    else:
+        expected_error = ValueError
+
+    with test.assertRaises(expected_error):
+        wp.get_mempool_release_threshold(device)
+
+    with test.assertRaises(expected_error):
+        wp.set_mempool_release_threshold(device, 42000)
+
+
+def test_mempool_access_self(test, device):
+
+    device = wp.get_device(device)
+
+    assert device.is_mempool_supported
+
+    # setting mempool access to self is a no-op
+    wp.set_mempool_access_enabled(device, device, True)
+    wp.set_mempool_access_enabled(device, device, False)
+
+    # should always be enabled
+    enabled = wp.is_mempool_access_enabled(device, device)
+    test.assertTrue(enabled)
+
+
+@unittest.skipUnless(get_device_pair_with_mempool_access_support(), "Requires devices with mempool access support")
+def test_mempool_access(test, _):
+
+    target_device, peer_device = get_device_pair_with_mempool_access_support()
+
+    was_enabled = wp.is_mempool_access_enabled(target_device, peer_device)
+
+    if was_enabled:
+        # try disabling
+        wp.set_mempool_access_enabled(target_device, peer_device, False)
+        is_enabled = wp.is_mempool_access_enabled(target_device, peer_device)
+        test.assertFalse(is_enabled)
+
+        # try re-enabling
+        wp.set_mempool_access_enabled(target_device, peer_device, True)
+        is_enabled = wp.is_mempool_access_enabled(target_device, peer_device)
+        test.assertTrue(is_enabled)
+    else:
+        # try enabling
+        wp.set_mempool_access_enabled(target_device, peer_device, True)
+        is_enabled = wp.is_mempool_access_enabled(target_device, peer_device)
+        test.assertTrue(is_enabled)
+
+        # try re-disabling
+        wp.set_mempool_access_enabled(target_device, peer_device, False)
+        is_enabled = wp.is_mempool_access_enabled(target_device, peer_device)
+        test.assertFalse(is_enabled)
+
+
+@unittest.skipUnless(get_device_pair_without_mempool_access_support(), "Requires devices without mempool access support")
+def test_mempool_access_exceptions_unsupported(test, _):
+
+    # get a CUDA device pair without mempool access support
+    target_device, peer_device = get_device_pair_without_mempool_access_support()
+
+    # querying is ok, but must return False
+    test.assertFalse(wp.is_mempool_access_enabled(target_device, peer_device))
+
+    # enabling should raise RuntimeError
+    with test.assertRaises(RuntimeError):
+        wp.set_mempool_access_enabled(target_device, peer_device, True)
+
+    # disabling should not raise an error
+    wp.set_mempool_access_enabled(target_device, peer_device, False)
+
+
+@unittest.skipUnless(wp.is_cpu_available() and wp.is_cuda_available(), "Requires both CUDA and CPU devices")
+def test_mempool_access_exceptions_cpu(test, _):
+
+    # querying is ok, but must return False
+    test.assertFalse(wp.is_mempool_access_enabled("cuda:0", "cpu"))
+    test.assertFalse(wp.is_mempool_access_enabled("cpu", "cuda:0"))
+
+    # enabling should raise ValueError
+    with test.assertRaises(ValueError):
+        wp.set_mempool_access_enabled("cpu", "cuda:0", True)
+    with test.assertRaises(ValueError):
+        wp.set_mempool_access_enabled("cuda:0", "cpu", True)
+
+    # disabling should not raise an error
+    wp.set_mempool_access_enabled("cpu", "cuda:0", False)
+    wp.set_mempool_access_enabled("cuda:0", "cpu", False)
+
+
+class TestMempool(unittest.TestCase):
+    pass
+
+
+devices_with_mempools = [d for d in get_test_devices() if d.is_mempool_supported]
+devices_without_mempools = [d for d in get_test_devices() if not d.is_mempool_supported]
+
+# test devices with mempool support
+add_function_test(TestMempool, "test_mempool_release_threshold", test_mempool_release_threshold, devices=devices_with_mempools)
+add_function_test(TestMempool, "test_mempool_access_self", test_mempool_access_self, devices=devices_with_mempools)
+
+# test devices without mempool support
+add_function_test(TestMempool, "test_mempool_exceptions", test_mempool_exceptions, devices=devices_without_mempools)
+
+# mempool access tests
+add_function_test(TestMempool, "test_mempool_access", test_mempool_access)
+
+# mempool access exceptions
+add_function_test(TestMempool, "test_mempool_access_exceptions_unsupported", test_mempool_access_exceptions_unsupported)
+add_function_test(TestMempool, "test_mempool_access_exceptions_cpu", test_mempool_access_exceptions_cpu)
+
+
+if __name__ == "__main__":
+    wp.build.clear_kernel_cache()
+    unittest.main(verbosity=2)
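
The new test file above exercises the CUDA memory-pool allocator API introduced in 1.0.0. A small usage sketch based only on the calls seen in the test; the 64 MiB threshold is an arbitrary example value, and the assumption is that the threshold is given in bytes (the test uses 42000):

import warp as wp

wp.init()

device = wp.get_device("cuda:0")  # assumes a CUDA device is present

if wp.is_mempool_supported(device):
    # enable pooled allocations on this device
    wp.set_mempool_enabled(device, True)

    # let the pool hold on to freed memory up to roughly this many bytes
    saved_threshold = wp.get_mempool_release_threshold(device)
    wp.set_mempool_release_threshold(device, 64 * 1024 * 1024)

    a = wp.zeros(1_000_000, dtype=wp.float32, device=device)
    del a  # freed blocks below the threshold can be reused by later allocations

    wp.set_mempool_release_threshold(device, saved_threshold)
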
warp/tests/test_multigpu.py CHANGED
@@ -10,6 +10,7 @@ import unittest
 import numpy as np
 
 import warp as wp
+from warp.utils import check_iommu
 from warp.tests.unittest_utils import *
 
 wp.init()
@@ -106,6 +107,7 @@ class TestMultiGPU(unittest.TestCase):
         assert wp.get_cuda_device() == initial_cuda_device
 
     @unittest.skipUnless(len(wp.get_cuda_devices()) > 1, "Requires at least two CUDA devices")
+    @unittest.skipUnless(check_iommu(), "IOMMU seems enabled")
    def test_multigpu_pingpong(self):
         n = 1024 * 1024
 
@@ -129,6 +131,7 @@ class TestMultiGPU(unittest.TestCase):
         assert_np_equal(a1.numpy(), expected)
 
     @unittest.skipUnless(len(wp.get_cuda_devices()) > 1, "Requires at least two CUDA devices")
+    @unittest.skipUnless(check_iommu(), "IOMMU seems enabled")
     def test_multigpu_pingpong_streams(self):
         n = 1024 * 1024
 
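
check_iommu() comes from warp.utils and is used above to skip the multi-GPU ping-pong tests when the IOMMU configuration looks likely to interfere with cross-device transfers. A hedged sketch of guarding a cross-device copy the same way; the device names and values are illustrative:

import warp as wp
from warp.utils import check_iommu

wp.init()

# check_iommu() is truthy when transfers between GPUs look safe;
# the tests above are skipped otherwise ("IOMMU seems enabled")
if len(wp.get_cuda_devices()) > 1 and check_iommu():
    src = wp.zeros(1024, dtype=wp.float32, device="cuda:0")
    src.fill_(42.0)
    dst = wp.zeros(1024, dtype=wp.float32, device="cuda:1")
    wp.copy(dst, src)  # copy between the two GPUs, as in the ping-pong tests
    print(dst.numpy()[:4])
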
warp/tests/test_options.py CHANGED
@@ -6,12 +6,16 @@
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
 import unittest
+import contextlib
+import io
 
 import warp as wp
 from warp.tests.unittest_utils import *
 
 wp.init()
 
+from warp.context import runtime  # noqa: E402
+
 
 @wp.kernel
 def scale(
@@ -47,7 +51,12 @@ def test_options_1(test, device):
     with tape:
         wp.launch(scale, dim=1, inputs=[x, y], device=device)
 
-    tape.backward(y)
+    with contextlib.redirect_stdout(io.StringIO()) as f:
+        tape.backward(y)
+
+    expected = f"Warp UserWarning: Running the tape backwards may produce incorrect gradients because recorded kernel {scale.key} is defined in a module with the option 'enable_backward=False' set.\n"
+
+    assert f.getvalue() == expected
     assert_np_equal(tape.gradients[x].numpy(), np.array(0.0))
 
 
@@ -89,7 +98,12 @@ def test_options_4(test, device):
     with tape:
         wp.launch(scale_2, dim=1, inputs=[x, y], device=device)
 
-    tape.backward(y)
+    with contextlib.redirect_stdout(io.StringIO()) as f:
+        tape.backward(y)
+
+    expected = f"Warp UserWarning: Running the tape backwards may produce incorrect gradients because recorded kernel {scale_2.key} is configured with the option 'enable_backward=False'.\n"
+
+    assert f.getvalue() == expected
     assert_np_equal(tape.gradients[x].numpy(), np.array(0.0))
 
 
warp/tests/test_peer.py ADDED
@@ -0,0 +1,137 @@
+# Copyright (c) 2023 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import unittest
+
+import warp as wp
+from warp.tests.unittest_utils import *
+
+wp.init()
+
+
+def get_device_pair_with_peer_access_support():
+    devices = wp.get_cuda_devices()
+    for target_device in devices:
+        for peer_device in devices:
+            if target_device != peer_device:
+                if wp.is_peer_access_supported(target_device, peer_device):
+                    return (target_device, peer_device)
+    return None
+
+
+def get_device_pair_without_peer_access_support():
+    devices = wp.get_cuda_devices()
+    for target_device in devices:
+        for peer_device in devices:
+            if target_device != peer_device:
+                if not wp.is_peer_access_supported(target_device, peer_device):
+                    return (target_device, peer_device)
+    return None
+
+
+def test_peer_access_self(test, device):
+
+    device = wp.get_device(device)
+
+    assert device.is_cuda
+
+    # device can access self
+    can_access = wp.is_peer_access_supported(device, device)
+    test.assertTrue(can_access)
+
+    # setting peer access to self is a no-op
+    wp.set_peer_access_enabled(device, device, True)
+    wp.set_peer_access_enabled(device, device, False)
+
+    # should always be enabled
+    enabled = wp.is_peer_access_enabled(device, device)
+    test.assertTrue(enabled)
+
+
+@unittest.skipUnless(get_device_pair_with_peer_access_support(), "Requires devices with peer access support")
+def test_peer_access(test, _):
+
+    target_device, peer_device = get_device_pair_with_peer_access_support()
+
+    was_enabled = wp.is_peer_access_enabled(target_device, peer_device)
+
+    if was_enabled:
+        # try disabling
+        wp.set_peer_access_enabled(target_device, peer_device, False)
+        is_enabled = wp.is_peer_access_enabled(target_device, peer_device)
+        test.assertFalse(is_enabled)
+
+        # try re-enabling
+        wp.set_peer_access_enabled(target_device, peer_device, True)
+        is_enabled = wp.is_peer_access_enabled(target_device, peer_device)
+        test.assertTrue(is_enabled)
+    else:
+        # try enabling
+        wp.set_peer_access_enabled(target_device, peer_device, True)
+        is_enabled = wp.is_peer_access_enabled(target_device, peer_device)
+        test.assertTrue(is_enabled)
+
+        # try re-disabling
+        wp.set_peer_access_enabled(target_device, peer_device, False)
+        is_enabled = wp.is_peer_access_enabled(target_device, peer_device)
+        test.assertFalse(is_enabled)
+
+
+@unittest.skipUnless(get_device_pair_without_peer_access_support(), "Requires devices without peer access support")
+def test_peer_access_exceptions_unsupported(test, _):
+
+    # get a CUDA device pair without peer access support
+    target_device, peer_device = get_device_pair_without_peer_access_support()
+
+    # querying is ok, but must return False
+    test.assertFalse(wp.is_peer_access_enabled(target_device, peer_device))
+
+    # enabling should raise RuntimeError
+    with test.assertRaises(RuntimeError):
+        wp.set_peer_access_enabled(target_device, peer_device, True)
+
+    # disabling should not raise an error
+    wp.set_peer_access_enabled(target_device, peer_device, False)
+
+
+@unittest.skipUnless(wp.is_cpu_available() and wp.is_cuda_available(), "Requires both CUDA and CPU devices")
+def test_peer_access_exceptions_cpu(test, _):
+
+    # querying is ok, but must return False
+    test.assertFalse(wp.is_peer_access_enabled("cuda:0", "cpu"))
+    test.assertFalse(wp.is_peer_access_enabled("cpu", "cuda:0"))
+
+    # enabling should raise ValueError
+    with test.assertRaises(ValueError):
+        wp.set_peer_access_enabled("cpu", "cuda:0", True)
+    with test.assertRaises(ValueError):
+        wp.set_peer_access_enabled("cuda:0", "cpu", True)
+
+    # disabling should not raise an error
+    wp.set_peer_access_enabled("cpu", "cuda:0", False)
+    wp.set_peer_access_enabled("cuda:0", "cpu", False)
+
+
+class TestPeer(unittest.TestCase):
+    pass
+
+
+cuda_test_devices = get_cuda_test_devices()
+
+add_function_test(TestPeer, "test_peer_access_self", test_peer_access_self, devices=cuda_test_devices)
+
+# peer access tests
+add_function_test(TestPeer, "test_peer_access", test_peer_access)
+
+# peer access exceptions
+add_function_test(TestPeer, "test_peer_access_exceptions_unsupported", test_peer_access_exceptions_unsupported)
+add_function_test(TestPeer, "test_peer_access_exceptions_cpu", test_peer_access_exceptions_cpu)
+
+
+if __name__ == "__main__":
+    wp.build.clear_kernel_cache()
+    unittest.main(verbosity=2)
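
test_peer.py covers the other half of the new multi-device memory API: classic CUDA peer access between devices' default allocations. A short sketch using only the calls that appear in the test; it assumes two CUDA devices, and the access direction follows the same (target, peer) argument order used above:

import warp as wp

wp.init()

cuda_devices = wp.get_cuda_devices()

if len(cuda_devices) >= 2:
    target, peer = cuda_devices[0], cuda_devices[1]

    if wp.is_peer_access_supported(target, peer):
        # allow `peer` to access allocations made on `target`
        wp.set_peer_access_enabled(target, peer, True)
        print(wp.is_peer_access_enabled(target, peer))  # True

        # turn it back off when done
        wp.set_peer_access_enabled(target, peer, False)
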
warp/tests/test_print.py CHANGED
@@ -5,6 +5,7 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
+import sys
 import unittest
 
 import warp as wp
@@ -30,7 +31,8 @@ def test_print(test, device):
     s = capture.end()
 
     # The CPU kernel printouts don't get captured by StdOutCapture()
-    if device.is_cuda:
+    # We skip the win32 comparison for now since the capture sometimes is an empty string
+    if device.is_cuda and sys.platform != "win32":
         test.assertRegex(
             s,
             rf"1{os.linesep}"
warp/tests/test_quat.py CHANGED
@@ -11,6 +11,7 @@ import numpy as np
 
 import warp as wp
 from warp.tests.unittest_utils import *
+import warp.sim
 
 wp.init()
 
@@ -1871,6 +1872,21 @@ def test_quat_identity(test, device, dtype, register_kernels=False):
     assert_np_equal(output.numpy(), expected)
 
 
+############################################################
+
+
+def test_quat_euler_conversion(test, device, dtype, register_kernels=False):
+    rng = np.random.default_rng(123)
+    N = 3
+
+    rpy_arr = rng.uniform(low=-np.pi, high=np.pi, size=(N, 3))
+
+    quats_from_euler = [list(wp.sim.quat_from_euler(wp.vec3(*rpy), 0, 1, 2)) for rpy in rpy_arr]
+    quats_from_rpy = [list(wp.quat_rpy(rpy[0], rpy[1], rpy[2])) for rpy in rpy_arr]
+
+    assert_np_equal(np.array(quats_from_euler), np.array(quats_from_rpy), tol=1e-4)
+
+
 def test_anon_type_instance(test, device, dtype, register_kernels=False):
     rng = np.random.default_rng(123)
     wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
@@ -2053,6 +2069,13 @@ for dtype in np_float_types:
     add_function_test_register_kernel(
         TestQuat, f"test_quat_to_matrix_{dtype.__name__}", test_quat_to_matrix, devices=devices, dtype=dtype
     )
+    add_function_test_register_kernel(
+        TestQuat,
+        f"test_quat_euler_conversion_{dtype.__name__}",
+        test_quat_euler_conversion,
+        devices=devices,
+        dtype=dtype,
+    )
     add_function_test(
         TestQuat, f"test_py_arithmetic_ops_{dtype.__name__}", test_py_arithmetic_ops, devices=None, dtype=dtype
     )
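
The new quaternion test checks that the XYZ Euler-angle conversion from warp.sim agrees with the built-in roll/pitch/yaw helper. A standalone sketch of that comparison, with arbitrary example angles:

import numpy as np
import warp as wp
import warp.sim

wp.init()

roll, pitch, yaw = 0.1, -0.4, 0.7  # radians, arbitrary example values

q_euler = wp.sim.quat_from_euler(wp.vec3(roll, pitch, yaw), 0, 1, 2)  # XYZ axis order
q_rpy = wp.quat_rpy(roll, pitch, yaw)

print(np.allclose(list(q_euler), list(q_rpy), atol=1e-4))
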