warp_lang-0.11.0-py3-none-manylinux2014_x86_64.whl → warp_lang-1.0.0-py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (170)
  1. warp/__init__.py +8 -0
  2. warp/bin/warp-clang.so +0 -0
  3. warp/bin/warp.so +0 -0
  4. warp/build.py +7 -6
  5. warp/build_dll.py +70 -79
  6. warp/builtins.py +10 -6
  7. warp/codegen.py +51 -19
  8. warp/config.py +7 -8
  9. warp/constants.py +3 -0
  10. warp/context.py +948 -245
  11. warp/dlpack.py +198 -113
  12. warp/examples/assets/bunny.usd +0 -0
  13. warp/examples/assets/cartpole.urdf +110 -0
  14. warp/examples/assets/crazyflie.usd +0 -0
  15. warp/examples/assets/cube.usda +42 -0
  16. warp/examples/assets/nv_ant.xml +92 -0
  17. warp/examples/assets/nv_humanoid.xml +183 -0
  18. warp/examples/assets/quadruped.urdf +268 -0
  19. warp/examples/assets/rocks.nvdb +0 -0
  20. warp/examples/assets/rocks.usd +0 -0
  21. warp/examples/assets/sphere.usda +56 -0
  22. warp/examples/assets/torus.usda +105 -0
  23. warp/examples/benchmarks/benchmark_api.py +383 -0
  24. warp/examples/benchmarks/benchmark_cloth.py +279 -0
  25. warp/examples/benchmarks/benchmark_cloth_cupy.py +88 -0
  26. warp/examples/benchmarks/benchmark_cloth_jax.py +100 -0
  27. warp/examples/benchmarks/benchmark_cloth_numba.py +142 -0
  28. warp/examples/benchmarks/benchmark_cloth_numpy.py +77 -0
  29. warp/examples/benchmarks/benchmark_cloth_pytorch.py +86 -0
  30. warp/examples/benchmarks/benchmark_cloth_taichi.py +112 -0
  31. warp/examples/benchmarks/benchmark_cloth_warp.py +146 -0
  32. warp/examples/benchmarks/benchmark_launches.py +295 -0
  33. warp/examples/core/example_dem.py +221 -0
  34. warp/examples/core/example_fluid.py +267 -0
  35. warp/examples/core/example_graph_capture.py +129 -0
  36. warp/examples/core/example_marching_cubes.py +177 -0
  37. warp/examples/core/example_mesh.py +154 -0
  38. warp/examples/core/example_mesh_intersect.py +193 -0
  39. warp/examples/core/example_nvdb.py +169 -0
  40. warp/examples/core/example_raycast.py +89 -0
  41. warp/examples/core/example_raymarch.py +178 -0
  42. warp/examples/core/example_render_opengl.py +141 -0
  43. warp/examples/core/example_sph.py +389 -0
  44. warp/examples/core/example_torch.py +181 -0
  45. warp/examples/core/example_wave.py +249 -0
  46. warp/examples/fem/bsr_utils.py +380 -0
  47. warp/examples/fem/example_apic_fluid.py +391 -0
  48. warp/examples/fem/example_convection_diffusion.py +168 -0
  49. warp/examples/fem/example_convection_diffusion_dg.py +209 -0
  50. warp/examples/fem/example_convection_diffusion_dg0.py +194 -0
  51. warp/examples/fem/example_deformed_geometry.py +159 -0
  52. warp/examples/fem/example_diffusion.py +173 -0
  53. warp/examples/fem/example_diffusion_3d.py +152 -0
  54. warp/examples/fem/example_diffusion_mgpu.py +214 -0
  55. warp/examples/fem/example_mixed_elasticity.py +222 -0
  56. warp/examples/fem/example_navier_stokes.py +243 -0
  57. warp/examples/fem/example_stokes.py +192 -0
  58. warp/examples/fem/example_stokes_transfer.py +249 -0
  59. warp/examples/fem/mesh_utils.py +109 -0
  60. warp/examples/fem/plot_utils.py +287 -0
  61. warp/examples/optim/example_bounce.py +248 -0
  62. warp/examples/optim/example_cloth_throw.py +210 -0
  63. warp/examples/optim/example_diffray.py +535 -0
  64. warp/examples/optim/example_drone.py +850 -0
  65. warp/examples/optim/example_inverse_kinematics.py +169 -0
  66. warp/examples/optim/example_inverse_kinematics_torch.py +170 -0
  67. warp/examples/optim/example_spring_cage.py +234 -0
  68. warp/examples/optim/example_trajectory.py +201 -0
  69. warp/examples/sim/example_cartpole.py +128 -0
  70. warp/examples/sim/example_cloth.py +184 -0
  71. warp/examples/sim/example_granular.py +113 -0
  72. warp/examples/sim/example_granular_collision_sdf.py +185 -0
  73. warp/examples/sim/example_jacobian_ik.py +213 -0
  74. warp/examples/sim/example_particle_chain.py +106 -0
  75. warp/examples/sim/example_quadruped.py +179 -0
  76. warp/examples/sim/example_rigid_chain.py +191 -0
  77. warp/examples/sim/example_rigid_contact.py +176 -0
  78. warp/examples/sim/example_rigid_force.py +126 -0
  79. warp/examples/sim/example_rigid_gyroscopic.py +97 -0
  80. warp/examples/sim/example_rigid_soft_contact.py +124 -0
  81. warp/examples/sim/example_soft_body.py +178 -0
  82. warp/fabric.py +29 -20
  83. warp/fem/cache.py +0 -1
  84. warp/fem/dirichlet.py +0 -2
  85. warp/fem/integrate.py +0 -1
  86. warp/jax.py +45 -0
  87. warp/jax_experimental.py +339 -0
  88. warp/native/builtin.h +12 -0
  89. warp/native/bvh.cu +18 -18
  90. warp/native/clang/clang.cpp +8 -3
  91. warp/native/cuda_util.cpp +94 -5
  92. warp/native/cuda_util.h +35 -6
  93. warp/native/cutlass_gemm.cpp +1 -1
  94. warp/native/cutlass_gemm.cu +4 -1
  95. warp/native/error.cpp +66 -0
  96. warp/native/error.h +27 -0
  97. warp/native/mesh.cu +2 -2
  98. warp/native/reduce.cu +4 -4
  99. warp/native/runlength_encode.cu +2 -2
  100. warp/native/scan.cu +2 -2
  101. warp/native/sparse.cu +0 -1
  102. warp/native/temp_buffer.h +2 -2
  103. warp/native/warp.cpp +95 -60
  104. warp/native/warp.cu +1053 -218
  105. warp/native/warp.h +49 -32
  106. warp/optim/linear.py +33 -16
  107. warp/render/render_opengl.py +202 -101
  108. warp/render/render_usd.py +82 -40
  109. warp/sim/__init__.py +13 -4
  110. warp/sim/articulation.py +4 -5
  111. warp/sim/collide.py +320 -175
  112. warp/sim/import_mjcf.py +25 -30
  113. warp/sim/import_urdf.py +94 -63
  114. warp/sim/import_usd.py +51 -36
  115. warp/sim/inertia.py +3 -2
  116. warp/sim/integrator.py +233 -0
  117. warp/sim/integrator_euler.py +447 -469
  118. warp/sim/integrator_featherstone.py +1991 -0
  119. warp/sim/integrator_xpbd.py +1420 -640
  120. warp/sim/model.py +765 -487
  121. warp/sim/particles.py +2 -1
  122. warp/sim/render.py +35 -13
  123. warp/sim/utils.py +222 -11
  124. warp/stubs.py +8 -0
  125. warp/tape.py +16 -1
  126. warp/tests/aux_test_grad_customs.py +23 -0
  127. warp/tests/test_array.py +190 -1
  128. warp/tests/test_async.py +656 -0
  129. warp/tests/test_bool.py +50 -0
  130. warp/tests/test_dlpack.py +164 -11
  131. warp/tests/test_examples.py +166 -74
  132. warp/tests/test_fem.py +8 -1
  133. warp/tests/test_generics.py +15 -5
  134. warp/tests/test_grad.py +1 -1
  135. warp/tests/test_grad_customs.py +172 -12
  136. warp/tests/test_jax.py +254 -0
  137. warp/tests/test_large.py +29 -6
  138. warp/tests/test_launch.py +25 -0
  139. warp/tests/test_linear_solvers.py +20 -3
  140. warp/tests/test_matmul.py +61 -16
  141. warp/tests/test_matmul_lite.py +13 -13
  142. warp/tests/test_mempool.py +186 -0
  143. warp/tests/test_multigpu.py +3 -0
  144. warp/tests/test_options.py +16 -2
  145. warp/tests/test_peer.py +137 -0
  146. warp/tests/test_print.py +3 -1
  147. warp/tests/test_quat.py +23 -0
  148. warp/tests/test_sim_kinematics.py +97 -0
  149. warp/tests/test_snippet.py +126 -3
  150. warp/tests/test_streams.py +108 -79
  151. warp/tests/test_torch.py +16 -8
  152. warp/tests/test_utils.py +32 -27
  153. warp/tests/test_verify_fp.py +65 -0
  154. warp/tests/test_volume.py +1 -1
  155. warp/tests/unittest_serial.py +2 -0
  156. warp/tests/unittest_suites.py +12 -0
  157. warp/tests/unittest_utils.py +14 -7
  158. warp/thirdparty/unittest_parallel.py +15 -3
  159. warp/torch.py +10 -8
  160. warp/types.py +363 -246
  161. warp/utils.py +143 -19
  162. warp_lang-1.0.0.dist-info/LICENSE.md +126 -0
  163. warp_lang-1.0.0.dist-info/METADATA +394 -0
  164. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/RECORD +167 -86
  165. warp/sim/optimizer.py +0 -138
  166. warp_lang-0.11.0.dist-info/LICENSE.md +0 -36
  167. warp_lang-0.11.0.dist-info/METADATA +0 -238
  168. /warp/tests/{walkthough_debug.py → walkthrough_debug.py} +0 -0
  169. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/WHEEL +0 -0
  170. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/top_level.txt +0 -0
warp/tests/test_sim_kinematics.py ADDED
@@ -0,0 +1,97 @@
+ # Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ import unittest
+
+ import warp as wp
+ from warp.tests.unittest_utils import *
+
+ import math
+ import os
+
+ import numpy as np
+
+ import warp as wp
+ import warp.sim
+
+ wp.init()
+
+
+ def test_fk_ik(test, device):
+
+     builder = wp.sim.ModelBuilder()
+
+     num_envs = 1
+
+     for i in range(num_envs):
+         wp.sim.parse_mjcf(
+             os.path.join(os.path.dirname(__file__), "../examples/assets/nv_ant.xml"),
+             builder,
+             stiffness=0.0,
+             damping=1.0,
+             armature=0.1,
+             contact_ke=1.0e4,
+             contact_kd=1.0e2,
+             contact_kf=1.0e2,
+             contact_mu=0.75,
+             limit_ke=1.0e3,
+             limit_kd=1.0e1,
+             up_axis="y",
+         )
+
+         coord_count = 15
+         dof_count = 14
+
+         coord_start = i * coord_count
+         dof_start = i * dof_count
+
+         # base
+         builder.joint_q[coord_start : coord_start + 3] = [i * 2.0, 0.70, 0.0]
+         builder.joint_q[coord_start + 3 : coord_start + 7] = wp.quat_from_axis_angle(
+             wp.vec3(1.0, 0.0, 0.0), -math.pi * 0.5
+         )
+
+         # joints
+         builder.joint_q[coord_start + 7 : coord_start + coord_count] = [0.0, 1.0, 0.0, -1.0, 0.0, -1.0, 0.0, 1.0]
+         builder.joint_qd[dof_start + 6 : dof_start + dof_count] = [1.0, 1.0, 1.0, -1.0, 1.0, -1.0, 1.0, 1.0]
+
+     # finalize model
+     model = builder.finalize()
+     model.ground = True
+     model.joint_attach_ke *= 16.0
+     model.joint_attach_kd *= 4.0
+
+     state = model.state()
+
+     # save a copy of joint values
+     q_fk = model.joint_q.numpy()
+     qd_fk = model.joint_qd.numpy()
+
+     wp.sim.eval_fk(model, model.joint_q, model.joint_qd, None, state)
+
+     q_ik = wp.zeros_like(model.joint_q)
+     qd_ik = wp.zeros_like(model.joint_qd)
+
+     wp.sim.eval_ik(model, state, q_ik, qd_ik)
+
+     assert_np_equal(q_fk, q_ik.numpy(), tol=1e-6)
+     assert_np_equal(qd_fk, qd_ik.numpy(), tol=1e-6)
+
+
+ devices = get_test_devices()
+
+
+ class TestSimKinematics(unittest.TestCase):
+     pass
+
+
+ add_function_test(TestSimKinematics, "test_fk_ik", test_fk_ik, devices=devices)
+
+
+ if __name__ == "__main__":
+     wp.build.clear_kernel_cache()
+     unittest.main(verbosity=2, failfast=True)
warp/tests/test_snippet.py CHANGED
@@ -13,9 +13,9 @@ def test_basic(test, device):
      out[tid] = a * x[tid] + y[tid];
      """
      adj_snippet = """
-     adj_a = x[tid] * adj_out[tid];
-     adj_x[tid] = a * adj_out[tid];
-     adj_y[tid] = adj_out[tid];
+     adj_a += x[tid] * adj_out[tid];
+     adj_x[tid] += a * adj_out[tid];
+     adj_y[tid] += adj_out[tid];
      """

      @wp.func_native(snippet, adj_snippet)
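
The fix above replaces plain assignment with `+=` in the adjoint snippet. Adjoints must accumulate: a value that feeds several downstream expressions receives one gradient contribution per use, and assignment would silently keep only the last one. A minimal sketch (not part of the diff; kernel and array names are illustrative) showing the accumulation at work:

import numpy as np
import warp as wp

wp.init()

@wp.kernel
def two_uses(a: wp.array(dtype=float), out: wp.array(dtype=float)):
    tid = wp.tid()
    # a[0] is used twice, so d(out)/d(a[0]) = 2.0 only if both
    # backward contributions are accumulated into adj_a[0]
    out[tid] = a[0] + a[0]

a = wp.array(np.ones(1, dtype=np.float32), requires_grad=True)
out = wp.zeros(1, dtype=float, requires_grad=True)

tape = wp.Tape()
with tape:
    wp.launch(two_uses, dim=1, inputs=[a], outputs=[out])

tape.backward(grads={out: wp.array(np.ones(1, dtype=np.float32))})
print(a.grad.numpy())  # [2.0]; an overwriting adjoint would report 1.0
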
@@ -86,6 +86,7 @@ def test_shared_memory(test, device):

      @wp.func_native(snippet)
      def reverse(d: wp.array(dtype=int), N: int, tid: int):
+         """Reverse the array d in place using shared memory."""
          return

      @wp.kernel
@@ -100,6 +101,7 @@ def test_shared_memory(test, device):
      wp.launch(kernel=reverse_kernel, dim=N, inputs=[x, N], device=device)

      assert_np_equal(x.numpy(), y)
+     assert reverse.__doc__ == "Reverse the array d in place using shared memory."


  def test_cpu_snippet(test, device):
@@ -130,6 +132,124 @@ def test_cpu_snippet(test, device):
      assert_np_equal(out.numpy(), np.arange(1, N + 1, 1, dtype=np.int32))


+ def test_custom_replay_grad(test, device):
+     num_threads = 16
+     counter = wp.zeros(1, dtype=wp.int32, device=device)
+     thread_ids = wp.zeros(num_threads, dtype=wp.int32, device=device)
+     inputs = wp.array(np.arange(num_threads, dtype=np.float32), device=device, requires_grad=True)
+     outputs = wp.zeros_like(inputs)
+
+     snippet = """
+     int next_index = atomicAdd(counter, 1);
+     thread_values[tid] = next_index;
+     """
+     replay_snippet = ""
+
+     @wp.func_native(snippet, replay_snippet=replay_snippet)
+     def reversible_increment(
+         counter: wp.array(dtype=int), thread_values: wp.array(dtype=int), tid: int
+     ):
+         ...
+
+     @wp.kernel
+     def run_atomic_add(
+         input: wp.array(dtype=float),
+         counter: wp.array(dtype=int),
+         thread_values: wp.array(dtype=int),
+         output: wp.array(dtype=float),
+     ):
+         tid = wp.tid()
+         reversible_increment(counter, thread_values, tid)
+         idx = thread_values[tid]
+         output[idx] = input[idx] ** 2.0
+
+     tape = wp.Tape()
+     with tape:
+         wp.launch(
+             run_atomic_add, dim=num_threads, inputs=[inputs, counter, thread_ids], outputs=[outputs], device=device
+         )
+
+     tape.backward(grads={outputs: wp.array(np.ones(num_threads, dtype=np.float32), device=device)})
+     assert_np_equal(inputs.grad.numpy(), 2.0 * inputs.numpy(), tol=1e-4)
+
+
+ def test_replay_simplification(test, device):
+     num_threads = 8
+     x = wp.array(1.0 + np.arange(num_threads, dtype=np.float32), device=device, requires_grad=True)
+     y = wp.zeros_like(x)
+     z = wp.zeros_like(x)
+
+     snippet = "y[tid] = powf(x[tid], 2.0);"
+     replay_snippet = "y[tid] = x[tid];"
+     adj_snippet = "adj_x[tid] += 2.0 * adj_y[tid];"
+
+     @wp.func_native(snippet, adj_snippet=adj_snippet, replay_snippet=replay_snippet)
+     def square(x: wp.array(dtype=float), y: wp.array(dtype=float), tid: int):
+         ...
+
+     @wp.kernel
+     def log_square_kernel(
+         x: wp.array(dtype=float),
+         y: wp.array(dtype=float),
+         z: wp.array(dtype=float)
+     ):
+         tid = wp.tid()
+         square(x, y, tid)
+         z[tid] = wp.log(y[tid])
+
+     tape = wp.Tape()
+     with tape:
+         wp.launch(log_square_kernel, dim=num_threads, inputs=[x, y], outputs=[z], device=device)
+
+     tape.backward(grads={z: wp.array(np.ones(num_threads, dtype=np.float32), device=device)})
+     assert_np_equal(x.grad.numpy(), 2.0 / (1.0 + np.arange(num_threads)), tol=1e-6)
+
+
+ def test_recompile_snippet(test, device):
+     snippet = """
+     int inc = 1;
+     out[tid] = x[tid] + inc;
+     """
+
+     @wp.func_native(snippet)
+     def increment_snippet(
+         x: wp.array(dtype=wp.int32),
+         out: wp.array(dtype=wp.int32),
+         tid: int,
+     ):
+         ...
+
+     @wp.kernel
+     def increment(x: wp.array(dtype=wp.int32), out: wp.array(dtype=wp.int32)):
+         tid = wp.tid()
+         increment_snippet(x, out, tid)
+
+     N = 128
+     x = wp.array(np.arange(N, dtype=np.int32), dtype=wp.int32, device=device)
+     out = wp.zeros(N, dtype=wp.int32, device=device)
+
+     wp.launch(kernel=increment, dim=N, inputs=[x], outputs=[out], device=device)
+
+     assert_np_equal(out.numpy(), np.arange(1, N + 1, 1, dtype=np.int32))
+
+     snippet = """
+     int inc = 2;
+     out[tid] = x[tid] + inc;
+     """
+
+     @wp.func_native(snippet)
+     def increment_snippet(
+         x: wp.array(dtype=wp.int32),
+         out: wp.array(dtype=wp.int32),
+         tid: int,
+     ):
+         ...
+
+     wp.launch(kernel=increment, dim=N, inputs=[x], outputs=[out], device=device)
+
+     assert_np_equal(out.numpy(), 1 + np.arange(1, N + 1, 1, dtype=np.int32))
+
+
  class TestSnippets(unittest.TestCase):
      pass

@@ -137,6 +257,9 @@ class TestSnippets(unittest.TestCase):
  add_function_test(TestSnippets, "test_basic", test_basic, devices=get_unique_cuda_test_devices())
  add_function_test(TestSnippets, "test_shared_memory", test_shared_memory, devices=get_unique_cuda_test_devices())
  add_function_test(TestSnippets, "test_cpu_snippet", test_cpu_snippet, devices=["cpu"])
+ add_function_test(TestSnippets, "test_custom_replay_grad", test_custom_replay_grad, devices=get_unique_cuda_test_devices())
+ add_function_test(TestSnippets, "test_replay_simplification", test_replay_simplification, devices=get_unique_cuda_test_devices())
+ add_function_test(TestSnippets, "test_recompile_snippet", test_recompile_snippet, devices=get_unique_cuda_test_devices())


  if __name__ == "__main__":
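
These new tests pin down `replay_snippet`, the third code slot on `wp.func_native`: during `tape.backward()`, Warp re-runs the forward computation to restore intermediate values before the adjoint pass, and a replay snippet substitutes a custom body for that re-run. `test_custom_replay_grad` supplies an empty replay so the `atomicAdd` is not executed a second time (the `thread_values` written in the forward pass are simply reused), while `test_replay_simplification` replays a cheaper stand-in. The three roles side by side, condensed from the tests above:

import warp as wp

wp.init()

snippet = "y[tid] = powf(x[tid], 2.0);"          # forward pass, recorded on the tape
replay_snippet = "y[tid] = x[tid];"              # re-run during tape.backward()
adj_snippet = "adj_x[tid] += 2.0 * adj_y[tid];"  # adjoint; note the accumulation

@wp.func_native(snippet, adj_snippet=adj_snippet, replay_snippet=replay_snippet)
def square(x: wp.array(dtype=float), y: wp.array(dtype=float), tid: int):
    ...
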
warp/tests/test_streams.py CHANGED
@@ -10,6 +10,7 @@ import unittest
  import numpy as np

  import warp as wp
+ from warp.utils import check_iommu
  from warp.tests.unittest_utils import *

  wp.init()
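
`check_iommu()` comes from `warp/utils.py` (see the files list) and gates the multi-GPU graph tests below: it returns `True` when peer-to-peer transfers look usable and `False` when the IOMMU appears to be enabled, a configuration known to interfere with CUDA peer-to-peer access on Linux. The skip pattern in isolation (the class and method names here are hypothetical):

import unittest

from warp.utils import check_iommu

class MultiGpuTests(unittest.TestCase):  # hypothetical example class
    # skipped when check_iommu() returns False, i.e. when the IOMMU
    # appears enabled and peer-to-peer transfers may fail
    @unittest.skipUnless(check_iommu(), "IOMMU seems enabled")
    def test_requires_p2p(self):
        pass
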
@@ -37,19 +38,34 @@ def sum(a: wp.array(dtype=float), b: wp.array(dtype=float), c: wp.array(dtype=float)):
  N = 10 * 1024 * 1024


- def test_stream_arg_implicit_sync(test, device):
-     # wp.zeros() and array.numpy() should not require explicit sync
+ def test_stream_set(test, device):

+     device = wp.get_device(device)
+
+     old_stream = device.stream
+     new_stream = wp.Stream(device)
+
+     try:
+         wp.set_stream(new_stream, device)
+
+         test.assertTrue(device.has_stream)
+         test.assertEqual(device.stream, new_stream)
+
+     finally:
+         # restore original stream
+         wp.set_stream(old_stream, device)
+
+
+ def test_stream_arg_explicit_sync(test, device):
      a = wp.zeros(N, dtype=float, device=device)
-     b = wp.empty(N, dtype=float, device=device)
+     b = wp.full(N, 42, dtype=float, device=device)
      c = wp.empty(N, dtype=float, device=device)

+     old_stream = wp.get_stream(device)
      new_stream = wp.Stream(device)

-     # Exercise code path
-     wp.set_stream(new_stream, device)
-
-     test.assertTrue(wp.get_device(device).has_stream)
+     # allocations need to be explicitly synced before launching work using stream arguments
+     new_stream.wait_stream(old_stream)

      # launch work on new stream
      wp.launch(inc, dim=a.size, inputs=[a], stream=new_stream)
@@ -64,17 +80,17 @@ def test_stream_arg_implicit_sync(test, device):


  def test_stream_scope_implicit_sync(test, device):
-     # wp.zeros() and array.numpy() should not require explicit sync

      with wp.ScopedDevice(device):
          a = wp.zeros(N, dtype=float)
-         b = wp.empty(N, dtype=float)
+         b = wp.full(N, 42, dtype=float)
          c = wp.empty(N, dtype=float)

          old_stream = wp.get_stream()
          new_stream = wp.Stream()

          # launch work on new stream
+         # allocations are implicitly synced when entering wp.ScopedStream
          with wp.ScopedStream(new_stream):
              assert wp.get_stream() == new_stream

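Taken together, the renamed tests pin down the 1.0 ordering rules: a launch that passes an explicit `stream=` argument does not wait automatically for allocations queued on the device's current stream, so the caller adds the dependency with `wait_stream`, whereas entering `wp.ScopedStream` adds it implicitly. A condensed sketch of the explicit case (assuming a CUDA device; the `inc` kernel mirrors the one defined at the top of this test file):

import warp as wp

wp.init()

@wp.kernel
def inc(a: wp.array(dtype=float)):
    tid = wp.tid()
    a[tid] = a[tid] + 1.0

device = "cuda:0"
a = wp.zeros(1024, dtype=float, device=device)  # queued on the device's current stream

s = wp.Stream(device)
s.wait_stream(wp.get_stream(device))  # order the allocation before the launch
wp.launch(inc, dim=a.size, inputs=[a], stream=s)
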
@@ -309,103 +325,116 @@ class TestStreams(unittest.TestCase):
          cpu_stream = cpu_device.stream  # noqa: F841

      @unittest.skipUnless(len(wp.get_cuda_devices()) > 1, "Requires at least two CUDA devices")
+     @unittest.skipUnless(check_iommu(), "IOMMU seems enabled")
      def test_stream_arg_graph_mgpu(self):
          wp.load_module(device="cuda:0")
          wp.load_module(device="cuda:1")

-         # resources on GPU 0
-         stream0 = wp.get_stream("cuda:0")
-         a0 = wp.zeros(N, dtype=float, device="cuda:0")
-         b0 = wp.empty(N, dtype=float, device="cuda:0")
-         c0 = wp.empty(N, dtype=float, device="cuda:0")
+         # Peer-to-peer copies are not possible during graph capture if the arrays were
+         # allocated using pooled allocators and mempool access is not enabled.
+         # Here, we force default CUDA allocators and pre-allocate the memory.
+         with wp.ScopedMempool("cuda:0", False), wp.ScopedMempool("cuda:1", False):

-         # resources on GPU 1
-         stream1 = wp.get_stream("cuda:1")
-         a1 = wp.zeros(N, dtype=float, device="cuda:1")
+             # resources on GPU 0
+             stream0 = wp.get_stream("cuda:0")
+             a0 = wp.zeros(N, dtype=float, device="cuda:0")
+             b0 = wp.empty(N, dtype=float, device="cuda:0")
+             c0 = wp.empty(N, dtype=float, device="cuda:0")

-         # start recording on stream0
-         wp.capture_begin(stream=stream0, force_module_load=False)
-         try:
-             # branch into stream1
-             stream1.wait_stream(stream0)
+             # resources on GPU 1
+             stream1 = wp.get_stream("cuda:1")
+             a1 = wp.zeros(N, dtype=float, device="cuda:1")

-             # launch concurrent kernels on each stream
-             wp.launch(inc, dim=N, inputs=[a0], stream=stream0)
-             wp.launch(inc, dim=N, inputs=[a1], stream=stream1)
+             # start recording on stream0
+             wp.capture_begin(stream=stream0, force_module_load=False)
+             try:
+                 # branch into stream1
+                 stream1.wait_stream(stream0)

-             # wait for stream1 to finish
-             stream0.wait_stream(stream1)
+                 # launch concurrent kernels on each stream
+                 wp.launch(inc, dim=N, inputs=[a0], stream=stream0)
+                 wp.launch(inc, dim=N, inputs=[a1], stream=stream1)

-             # copy values from stream1
-             wp.copy(b0, a1, stream=stream0)
+                 # wait for stream1 to finish
+                 stream0.wait_stream(stream1)
+
+                 # copy values from stream1
+                 wp.copy(b0, a1, stream=stream0)

-             # compute sum
-             wp.launch(sum, dim=N, inputs=[a0, b0, c0], stream=stream0)
-         finally:
-             # finish recording on stream0
-             g = wp.capture_end(stream=stream0)
+                 # compute sum
+                 wp.launch(sum, dim=N, inputs=[a0, b0, c0], stream=stream0)
+             finally:
+                 # finish recording on stream0
+                 g = wp.capture_end(stream=stream0)

-         # replay
-         num_iters = 10
-         for _ in range(num_iters):
-             wp.capture_launch(g, stream=stream0)
+             # replay
+             num_iters = 10
+             for _ in range(num_iters):
+                 wp.capture_launch(g, stream=stream0)

-         # check results
-         assert_np_equal(c0.numpy(), np.full(N, fill_value=2 * num_iters))
+             # check results
+             assert_np_equal(c0.numpy(), np.full(N, fill_value=2 * num_iters))

      @unittest.skipUnless(len(wp.get_cuda_devices()) > 1, "Requires at least two CUDA devices")
+     @unittest.skipUnless(check_iommu(), "IOMMU seems enabled")
      def test_stream_scope_graph_mgpu(self):
          wp.load_module(device="cuda:0")
          wp.load_module(device="cuda:1")

-         # resources on GPU 0
-         with wp.ScopedDevice("cuda:0"):
-             stream0 = wp.get_stream()
-             a0 = wp.zeros(N, dtype=float)
-             b0 = wp.empty(N, dtype=float)
-             c0 = wp.empty(N, dtype=float)
-
-         # resources on GPU 1
-         with wp.ScopedDevice("cuda:1"):
-             stream1 = wp.get_stream()
-             a1 = wp.zeros(N, dtype=float)
-
-         # capture graph
-         with wp.ScopedDevice("cuda:0"):
-             # start recording
-             wp.capture_begin(force_module_load=False)
-             try:
-                 with wp.ScopedDevice("cuda:1"):
-                     # branch into stream1
-                     wp.wait_stream(stream0)
+         # Peer-to-peer copies are not possible during graph capture if the arrays were
+         # allocated using pooled allocators and mempool access is not enabled.
+         # Here, we force default CUDA allocators and pre-allocate the memory.
+         with wp.ScopedMempool("cuda:0", False), wp.ScopedMempool("cuda:1", False):

-                     wp.launch(inc, dim=N, inputs=[a1])
+             # resources on GPU 0
+             with wp.ScopedDevice("cuda:0"):
+                 stream0 = wp.get_stream()
+                 a0 = wp.zeros(N, dtype=float)
+                 b0 = wp.empty(N, dtype=float)
+                 c0 = wp.empty(N, dtype=float)

-                 wp.launch(inc, dim=N, inputs=[a0])
+             # resources on GPU 1
+             with wp.ScopedDevice("cuda:1"):
+                 stream1 = wp.get_stream()
+                 a1 = wp.zeros(N, dtype=float)

-                 # wait for stream1 to finish
-                 wp.wait_stream(stream1)
+             # capture graph
+             with wp.ScopedDevice("cuda:0"):
+                 # start recording
+                 wp.capture_begin(force_module_load=False)
+                 try:
+                     with wp.ScopedDevice("cuda:1"):
+                         # branch into stream1
+                         wp.wait_stream(stream0)

-                 # copy values from stream1
-                 wp.copy(b0, a1)
+                         wp.launch(inc, dim=N, inputs=[a1])

-                 # compute sum
-                 wp.launch(sum, dim=N, inputs=[a0, b0, c0])
-             finally:
-                 # finish recording
-                 g = wp.capture_end()
+                     wp.launch(inc, dim=N, inputs=[a0])

-             # replay
-             with wp.ScopedDevice("cuda:0"):
-                 num_iters = 10
-                 for _ in range(num_iters):
-                     wp.capture_launch(g)
+                     # wait for stream1 to finish
+                     wp.wait_stream(stream1)
+
+                     # copy values from stream1
+                     wp.copy(b0, a1)
+
+                     # compute sum
+                     wp.launch(sum, dim=N, inputs=[a0, b0, c0])
+                 finally:
+                     # finish recording
+                     g = wp.capture_end()
+
+             # replay
+             with wp.ScopedDevice("cuda:0"):
+                 num_iters = 10
+                 for _ in range(num_iters):
+                     wp.capture_launch(g)

-         # check results
-         assert_np_equal(c0.numpy(), np.full(N, fill_value=2 * num_iters))
+             # check results
+             assert_np_equal(c0.numpy(), np.full(N, fill_value=2 * num_iters))


- add_function_test(TestStreams, "test_stream_arg_implicit_sync", test_stream_arg_implicit_sync, devices=devices)
+ add_function_test(TestStreams, "test_stream_set", test_stream_set, devices=devices)
+ add_function_test(TestStreams, "test_stream_arg_explicit_sync", test_stream_arg_explicit_sync, devices=devices)
  add_function_test(TestStreams, "test_stream_scope_implicit_sync", test_stream_scope_implicit_sync, devices=devices)

  add_function_test(TestStreams, "test_stream_arg_synchronize", test_stream_arg_synchronize, devices=devices)
warp/tests/test_torch.py CHANGED
@@ -490,10 +490,14 @@ def test_torch_graph_torch_stream(test, device):

      # capture graph
      with wp.ScopedStream(warp_stream), torch.cuda.graph(g, stream=torch_stream):
-         t += 1.0
-         wp.launch(inc, dim=n, inputs=[a])
-         t += 1.0
-         wp.launch(inc, dim=n, inputs=[a])
+         wp.capture_begin(force_module_load=False, external=True)
+         try:
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+         finally:
+             wp.capture_end()

      # replay graph
      num_iters = 10
@@ -522,10 +526,14 @@ def test_torch_graph_warp_stream(test, device):

      # capture graph
      with wp.ScopedDevice(device), torch.cuda.graph(g, stream=torch_stream):
-         t += 1.0
-         wp.launch(inc, dim=n, inputs=[a])
-         t += 1.0
-         wp.launch(inc, dim=n, inputs=[a])
+         wp.capture_begin(force_module_load=False, external=True)
+         try:
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+         finally:
+             wp.capture_end()

      # replay graph
      num_iters = 10
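
Both fixes wrap the Warp launches in `wp.capture_begin(..., external=True)` / `wp.capture_end()`: the capture is owned by `torch.cuda.graph`, and the `external=True` flag tells Warp to record into it rather than start (and later own) its own capture. A condensed sketch of the pattern; the stream pairing via `wp.stream_from_torch` is an assumption here, since the real tests set up their streams outside the visible hunks:

import torch
import warp as wp

wp.init()

@wp.kernel
def inc(a: wp.array(dtype=float)):
    tid = wp.tid()
    a[tid] = a[tid] + 1.0

n = 1024
a = wp.zeros(n, dtype=float, device="cuda:0")
wp.load_module(device="cuda:0")   # modules must be loaded before capture
wp.synchronize_device("cuda:0")   # finish pending work before capturing

torch_stream = torch.cuda.Stream()
warp_stream = wp.stream_from_torch(torch_stream)  # assumed interop helper

g = torch.cuda.CUDAGraph()
with wp.ScopedStream(warp_stream), torch.cuda.graph(g, stream=torch_stream):
    # torch owns the capture; external=True makes Warp record into it
    wp.capture_begin(force_module_load=False, external=True)
    try:
        wp.launch(inc, dim=n, inputs=[a])
    finally:
        # closes Warp's bookkeeping without ending torch's capture
        wp.capture_end()

g.replay()  # the captured launch runs only on replay
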
warp/tests/test_utils.py CHANGED
@@ -267,55 +267,60 @@ class TestUtils(unittest.TestCase):
      def test_warn(self):
          # Multiple warnings get printed out each time.
          with contextlib.redirect_stdout(io.StringIO()) as f:
-             frame_info = inspect.getframeinfo(inspect.currentframe())
              wp.utils.warn("hello, world!")
              wp.utils.warn("hello, world!")

          expected = (
-             "{}:{}: {}\n"
-             "{}:{}: {}\n"
-         ).format(
-             frame_info.filename,
-             frame_info.lineno + 1,
-             "UserWarning: hello, world!\n wp.utils.warn(\"hello, world!\")",
-             frame_info.filename,
-             frame_info.lineno + 2,
-             "UserWarning: hello, world!\n wp.utils.warn(\"hello, world!\")",
+             "Warp UserWarning: hello, world!\n"
+             "Warp UserWarning: hello, world!\n"
          )
+
          self.assertEqual(f.getvalue(), expected)

+         # Test verbose warnings
+         saved_verbosity = wp.config.verbose_warnings
+         try:
+             wp.config.verbose_warnings = True
+             with contextlib.redirect_stdout(io.StringIO()) as f:
+                 frame_info = inspect.getframeinfo(inspect.currentframe())
+                 wp.utils.warn("hello, world!")
+                 wp.utils.warn("hello, world!")
+
+             expected = (
+                 f"Warp UserWarning: hello, world! ({frame_info.filename}:{frame_info.lineno + 1})\n"
+                 " wp.utils.warn(\"hello, world!\")\n"
+                 f"Warp UserWarning: hello, world! ({frame_info.filename}:{frame_info.lineno + 2})\n"
+                 " wp.utils.warn(\"hello, world!\")\n"
+             )
+
+             self.assertEqual(f.getvalue(), expected)
+
+         finally:
+             # make sure to restore warning verbosity
+             wp.config.verbose_warnings = saved_verbosity
+
+
          # Multiple similar deprecation warnings get printed out only once.
          with contextlib.redirect_stdout(io.StringIO()) as f:
-             frame_info = inspect.getframeinfo(inspect.currentframe())
              wp.utils.warn("hello, world!", category=DeprecationWarning)
              wp.utils.warn("hello, world!", category=DeprecationWarning)

          expected = (
-             "{}:{}: {}\n"
-         ).format(
-             frame_info.filename,
-             frame_info.lineno + 1,
-             "DeprecationWarning: hello, world!\n wp.utils.warn(\"hello, world!\", category=DeprecationWarning)",
+             "Warp DeprecationWarning: hello, world!\n"
          )
+
          self.assertEqual(f.getvalue(), expected)

          # Multiple different deprecation warnings get printed out each time.
          with contextlib.redirect_stdout(io.StringIO()) as f:
-             frame_info = inspect.getframeinfo(inspect.currentframe())
              wp.utils.warn("foo", category=DeprecationWarning)
              wp.utils.warn("bar", category=DeprecationWarning)

          expected = (
-             "{}:{}: {}\n"
-             "{}:{}: {}\n"
-         ).format(
-             frame_info.filename,
-             frame_info.lineno + 1,
-             "DeprecationWarning: foo\n wp.utils.warn(\"foo\", category=DeprecationWarning)",
-             frame_info.filename,
-             frame_info.lineno + 2,
-             "DeprecationWarning: bar\n wp.utils.warn(\"bar\", category=DeprecationWarning)",
+             "Warp DeprecationWarning: foo\n"
+             "Warp DeprecationWarning: bar\n"
          )
+
          self.assertEqual(f.getvalue(), expected)

      def test_transform_expand(self):
@@ -425,7 +430,7 @@ class TestUtils(unittest.TestCase):
          with wp.ScopedTimer("hello", detailed=True):
              pass

-         self.assertRegex(f.getvalue(), r"^ 2 function calls in \d+\.\d+ seconds")
+         self.assertRegex(f.getvalue(), r"^ 4 function calls in \d+\.\d+ seconds")
          self.assertRegex(f.getvalue(), r"hello took \d+\.\d+ ms$")
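
The rewritten assertions document the new warning format: by default, `wp.utils.warn` prints a single `Warp <Category>: <message>` line (with repeated deprecation warnings deduplicated), and the file, line number, and offending source line appear only when `wp.config.verbose_warnings` is enabled. Opting in looks like:

import warp as wp
import warp.utils

wp.config.verbose_warnings = True
wp.utils.warn("hello, world!")
# prints, e.g.:
#   Warp UserWarning: hello, world! (/path/to/script.py:5)
#     wp.utils.warn("hello, world!")
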