warp-lang 1.4.1-py3-none-manylinux2014_x86_64.whl → 1.5.0-py3-none-manylinux2014_x86_64.whl

This diff shows the content of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between the two versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic.

Files changed (164)
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +21 -2
  6. warp/build_dll.py +23 -6
  7. warp/builtins.py +1920 -111
  8. warp/codegen.py +186 -62
  9. warp/config.py +2 -2
  10. warp/context.py +322 -73
  11. warp/examples/assets/pixel.jpg +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  13. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  14. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  15. warp/examples/benchmarks/benchmark_tile.py +179 -0
  16. warp/examples/core/example_dem.py +2 -1
  17. warp/examples/core/example_mesh_intersect.py +3 -3
  18. warp/examples/fem/example_adaptive_grid.py +37 -10
  19. warp/examples/fem/example_apic_fluid.py +3 -2
  20. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  21. warp/examples/fem/example_deformed_geometry.py +1 -1
  22. warp/examples/fem/example_diffusion_3d.py +47 -4
  23. warp/examples/fem/example_distortion_energy.py +220 -0
  24. warp/examples/fem/example_magnetostatics.py +127 -85
  25. warp/examples/fem/example_nonconforming_contact.py +5 -5
  26. warp/examples/fem/example_stokes.py +3 -1
  27. warp/examples/fem/example_streamlines.py +12 -19
  28. warp/examples/fem/utils.py +38 -15
  29. warp/examples/optim/example_walker.py +2 -2
  30. warp/examples/sim/example_cloth.py +2 -25
  31. warp/examples/sim/example_jacobian_ik.py +6 -2
  32. warp/examples/sim/example_quadruped.py +2 -1
  33. warp/examples/tile/example_tile_convolution.py +58 -0
  34. warp/examples/tile/example_tile_fft.py +47 -0
  35. warp/examples/tile/example_tile_filtering.py +105 -0
  36. warp/examples/tile/example_tile_matmul.py +79 -0
  37. warp/examples/tile/example_tile_mlp.py +375 -0
  38. warp/fem/__init__.py +8 -0
  39. warp/fem/cache.py +16 -12
  40. warp/fem/dirichlet.py +1 -1
  41. warp/fem/domain.py +44 -1
  42. warp/fem/field/__init__.py +1 -2
  43. warp/fem/field/field.py +31 -19
  44. warp/fem/field/nodal_field.py +101 -49
  45. warp/fem/field/virtual.py +794 -0
  46. warp/fem/geometry/__init__.py +2 -2
  47. warp/fem/geometry/deformed_geometry.py +3 -105
  48. warp/fem/geometry/element.py +13 -0
  49. warp/fem/geometry/geometry.py +165 -5
  50. warp/fem/geometry/grid_2d.py +3 -6
  51. warp/fem/geometry/grid_3d.py +31 -28
  52. warp/fem/geometry/hexmesh.py +3 -46
  53. warp/fem/geometry/nanogrid.py +3 -2
  54. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  55. warp/fem/geometry/tetmesh.py +2 -43
  56. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  57. warp/fem/integrate.py +683 -261
  58. warp/fem/linalg.py +404 -0
  59. warp/fem/operator.py +101 -18
  60. warp/fem/polynomial.py +5 -5
  61. warp/fem/quadrature/quadrature.py +45 -21
  62. warp/fem/space/__init__.py +45 -11
  63. warp/fem/space/basis_function_space.py +451 -0
  64. warp/fem/space/basis_space.py +58 -11
  65. warp/fem/space/function_space.py +146 -5
  66. warp/fem/space/grid_2d_function_space.py +80 -66
  67. warp/fem/space/grid_3d_function_space.py +113 -68
  68. warp/fem/space/hexmesh_function_space.py +96 -108
  69. warp/fem/space/nanogrid_function_space.py +62 -110
  70. warp/fem/space/quadmesh_function_space.py +208 -0
  71. warp/fem/space/shape/__init__.py +45 -7
  72. warp/fem/space/shape/cube_shape_function.py +328 -54
  73. warp/fem/space/shape/shape_function.py +10 -1
  74. warp/fem/space/shape/square_shape_function.py +328 -60
  75. warp/fem/space/shape/tet_shape_function.py +269 -19
  76. warp/fem/space/shape/triangle_shape_function.py +238 -19
  77. warp/fem/space/tetmesh_function_space.py +69 -37
  78. warp/fem/space/topology.py +38 -0
  79. warp/fem/space/trimesh_function_space.py +179 -0
  80. warp/fem/utils.py +6 -331
  81. warp/jax_experimental.py +3 -1
  82. warp/native/array.h +55 -40
  83. warp/native/builtin.h +124 -43
  84. warp/native/bvh.h +4 -0
  85. warp/native/coloring.cpp +600 -0
  86. warp/native/cuda_util.cpp +14 -0
  87. warp/native/cuda_util.h +2 -1
  88. warp/native/fabric.h +8 -0
  89. warp/native/hashgrid.h +4 -0
  90. warp/native/marching.cu +8 -0
  91. warp/native/mat.h +14 -3
  92. warp/native/mathdx.cpp +59 -0
  93. warp/native/mesh.h +4 -0
  94. warp/native/range.h +13 -1
  95. warp/native/reduce.cpp +9 -1
  96. warp/native/reduce.cu +7 -0
  97. warp/native/runlength_encode.cpp +9 -1
  98. warp/native/runlength_encode.cu +7 -1
  99. warp/native/scan.cpp +8 -0
  100. warp/native/scan.cu +8 -0
  101. warp/native/scan.h +8 -1
  102. warp/native/sparse.cpp +8 -0
  103. warp/native/sparse.cu +8 -0
  104. warp/native/temp_buffer.h +7 -0
  105. warp/native/tile.h +1857 -0
  106. warp/native/tile_gemm.h +341 -0
  107. warp/native/tile_reduce.h +210 -0
  108. warp/native/volume_builder.cu +8 -0
  109. warp/native/volume_builder.h +8 -0
  110. warp/native/warp.cpp +10 -2
  111. warp/native/warp.cu +369 -15
  112. warp/native/warp.h +12 -2
  113. warp/optim/adam.py +39 -4
  114. warp/paddle.py +29 -12
  115. warp/render/render_opengl.py +137 -65
  116. warp/sim/graph_coloring.py +292 -0
  117. warp/sim/integrator_euler.py +4 -2
  118. warp/sim/integrator_featherstone.py +115 -44
  119. warp/sim/integrator_vbd.py +6 -0
  120. warp/sim/model.py +90 -17
  121. warp/stubs.py +651 -85
  122. warp/tape.py +12 -7
  123. warp/tests/assets/pixel.npy +0 -0
  124. warp/tests/aux_test_instancing_gc.py +18 -0
  125. warp/tests/test_array.py +207 -48
  126. warp/tests/test_closest_point_edge_edge.py +8 -8
  127. warp/tests/test_codegen.py +120 -1
  128. warp/tests/test_codegen_instancing.py +30 -0
  129. warp/tests/test_collision.py +110 -0
  130. warp/tests/test_coloring.py +241 -0
  131. warp/tests/test_context.py +34 -0
  132. warp/tests/test_examples.py +18 -4
  133. warp/tests/test_fabricarray.py +33 -0
  134. warp/tests/test_fem.py +453 -113
  135. warp/tests/test_func.py +48 -1
  136. warp/tests/test_generics.py +52 -0
  137. warp/tests/test_iter.py +68 -0
  138. warp/tests/test_mat_scalar_ops.py +1 -1
  139. warp/tests/test_mesh_query_point.py +5 -4
  140. warp/tests/test_module_hashing.py +23 -0
  141. warp/tests/test_paddle.py +27 -87
  142. warp/tests/test_print.py +191 -1
  143. warp/tests/test_spatial.py +1 -1
  144. warp/tests/test_tile.py +700 -0
  145. warp/tests/test_tile_mathdx.py +144 -0
  146. warp/tests/test_tile_mlp.py +383 -0
  147. warp/tests/test_tile_reduce.py +374 -0
  148. warp/tests/test_tile_shared_memory.py +190 -0
  149. warp/tests/test_vbd.py +12 -20
  150. warp/tests/test_volume.py +43 -0
  151. warp/tests/unittest_suites.py +23 -2
  152. warp/tests/unittest_utils.py +4 -0
  153. warp/types.py +339 -73
  154. warp/utils.py +22 -1
  155. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
  156. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/RECORD +159 -132
  157. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
  158. warp/fem/field/test.py +0 -180
  159. warp/fem/field/trial.py +0 -183
  160. warp/fem/space/collocated_function_space.py +0 -102
  161. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  162. warp/fem/space/trimesh_2d_function_space.py +0 -153
  163. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
  164. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
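Note on the headline change: most of the new surface area in 1.5.0 is the tile-programming preview (warp/native/tile.h, tile_gemm.h, tile_reduce.h, the warp/examples/tile/ directory, and the test_tile*.py suites). As a rough, non-authoritative sketch of what that API looks like, here is a tiled GEMM in the spirit of example_tile_matmul.py; the exact names and signatures (wp.tile_load, wp.tile_matmul, wp.tile_store, wp.launch_tiled, block_dim) are assumptions inferred from the new file names and the 1.5.0 documentation, not copied from this diff:

    import numpy as np
    import warp as wp

    TILE_M, TILE_N, TILE_K = 8, 8, 8
    TILE_THREADS = 64  # cooperating threads per output tile (assumed launch parameter)

    @wp.kernel
    def tile_gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
        i, j = wp.tid()
        acc = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
        count = A.shape[1] // TILE_K
        for k in range(count):
            a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)  # cooperative load of one input tile
            b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
            wp.tile_matmul(a, b, acc)                      # accumulate a @ b into acc
        wp.tile_store(C, i, j, acc)

    M = N = K = 32
    A = wp.array(np.random.rand(M, K), dtype=float)
    B = wp.array(np.random.rand(K, N), dtype=float)
    C = wp.zeros((M, N), dtype=float)

    # one block of TILE_THREADS threads cooperates on each output tile
    wp.launch_tiled(tile_gemm, dim=(M // TILE_M, N // TILE_N), inputs=[A, B, C], block_dim=TILE_THREADS)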
warp/tests/test_func.py CHANGED
@@ -7,7 +7,7 @@
 
 import math
 import unittest
-from typing import Tuple
+from typing import Any, Tuple
 
 import numpy as np
 
@@ -191,6 +191,47 @@ def test_user_func_return_multiple_values():
     wp.expect_eq(b, 54756.0)
 
 
+@wp.func
+def user_func_overload(
+    b: wp.array(dtype=Any),
+    i: int,
+):
+    return b[i] * 2.0
+
+
+@wp.kernel
+def user_func_overload_resolution_kernel(
+    a: wp.array(dtype=Any),
+    b: wp.array(dtype=Any),
+):
+    i = wp.tid()
+    a[i] = user_func_overload(b, i)
+
+
+def test_user_func_overload_resolution(test, device):
+    a0 = wp.array((1, 2, 3), dtype=wp.vec3)
+    b0 = wp.array((2, 3, 4), dtype=wp.vec3)
+
+    a1 = wp.array((5,), dtype=float)
+    b1 = wp.array((6,), dtype=float)
+
+    wp.launch(user_func_overload_resolution_kernel, a0.shape, (a0, b0))
+    wp.launch(user_func_overload_resolution_kernel, a1.shape, (a1, b1))
+
+    assert_np_equal(a0.numpy()[0], (4, 6, 8))
+    assert a1.numpy()[0] == 12
+
+
+@wp.func
+def user_func_return_none() -> None:
+    pass
+
+
+@wp.kernel
+def test_return_annotation_none() -> None:
+    user_func_return_none()
+
+
 devices = get_test_devices()
 
 
@@ -375,6 +416,12 @@ add_kernel_test(
     dim=1,
     devices=devices,
 )
+add_function_test(
+    TestFunc, func=test_user_func_overload_resolution, name="test_user_func_overload_resolution", devices=devices
+)
+add_kernel_test(
+    TestFunc, kernel=test_return_annotation_none, name="test_return_annotation_none", dim=1, devices=devices
+)
 
 
 if __name__ == "__main__":
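Context for the test_func.py additions: Warp resolves user-function overloads at module build time, and the new test checks that resolution still picks the right concrete instantiation when the arguments are generic (wp.array(dtype=Any)). For comparison, a minimal sketch of explicit overloading with the public @wp.func decorator (hypothetical names, not part of the diff):

    import warp as wp

    @wp.func
    def double(x: float):
        return x * 2.0

    @wp.func
    def double(x: wp.vec3):
        # same name, different argument type: an explicit overload
        return x * 2.0

    @wp.kernel
    def double_kernel(a: wp.array(dtype=float), v: wp.array(dtype=wp.vec3)):
        i = wp.tid()
        a[i] = double(a[i])  # resolves to the float overload
        v[i] = double(v[i])  # resolves to the vec3 overload

    a = wp.array([1.0, 2.0], dtype=float)
    v = wp.array([(1.0, 2.0, 3.0), (4.0, 5.0, 6.0)], dtype=wp.vec3)
    wp.launch(double_kernel, dim=2, inputs=[a, v])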
warp/tests/test_generics.py CHANGED
@@ -522,6 +522,57 @@ def test_type_attribute_error(test, device):
     )
 
 
+@wp.func
+def vec_int_annotation_func(v: wp.vec(3, wp.Int)) -> wp.Int:
+    return v[0] + v[1] + v[2]
+
+
+@wp.func
+def vec_float_annotation_func(v: wp.vec(3, wp.Float)) -> wp.Float:
+    return v[0] + v[1] + v[2]
+
+
+@wp.func
+def vec_scalar_annotation_func(v: wp.vec(3, wp.Scalar)) -> wp.Scalar:
+    return v[0] + v[1] + v[2]
+
+
+@wp.func
+def mat_int_annotation_func(m: wp.mat((2, 2), wp.Int)) -> wp.Int:
+    return m[0, 0] + m[0, 1] + m[1, 0] + m[1, 1]
+
+
+@wp.func
+def mat_float_annotation_func(m: wp.mat((2, 2), wp.Float)) -> wp.Float:
+    return m[0, 0] + m[0, 1] + m[1, 0] + m[1, 1]
+
+
+@wp.func
+def mat_scalar_annotation_func(m: wp.mat((2, 2), wp.Scalar)) -> wp.Scalar:
+    return m[0, 0] + m[0, 1] + m[1, 0] + m[1, 1]
+
+
+mat22s = wp.mat((2, 2), wp.int16)
+mat22d = wp.mat((2, 2), wp.float64)
+
+
+@wp.kernel
+def test_annotations_kernel():
+    vi16 = wp.vec3s(wp.int16(1), wp.int16(2), wp.int16(3))
+    vf64 = wp.vec3d(wp.float64(1), wp.float64(2), wp.float64(3))
+    wp.expect_eq(vec_int_annotation_func(vi16), wp.int16(6))
+    wp.expect_eq(vec_float_annotation_func(vf64), wp.float64(6))
+    wp.expect_eq(vec_scalar_annotation_func(vi16), wp.int16(6))
+    wp.expect_eq(vec_scalar_annotation_func(vf64), wp.float64(6))
+
+    mi16 = mat22s(wp.int16(1), wp.int16(2), wp.int16(3), wp.int16(4))
+    mf64 = mat22d(wp.float64(1), wp.float64(2), wp.float64(3), wp.float64(4))
+    wp.expect_eq(mat_int_annotation_func(mi16), wp.int16(10))
+    wp.expect_eq(mat_float_annotation_func(mf64), wp.float64(10))
+    wp.expect_eq(mat_scalar_annotation_func(mi16), wp.int16(10))
+    wp.expect_eq(mat_scalar_annotation_func(mf64), wp.float64(10))
+
+
 class TestGenerics(unittest.TestCase):
     pass
 
@@ -590,6 +641,7 @@ add_kernel_test(
 )
 add_function_test(TestGenerics, "test_type_operator_misspell", test_type_operator_misspell, devices=devices)
 add_function_test(TestGenerics, "test_type_attribute_error", test_type_attribute_error, devices=devices)
+add_kernel_test(TestGenerics, name="test_annotations_kernel", kernel=test_annotations_kernel, dim=1, devices=devices)
 
 if __name__ == "__main__":
     wp.clear_kernel_cache()
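Context for the test_generics.py additions: wp.vec(length, wp.Int | wp.Float | wp.Scalar) and the matching wp.mat annotations let a user function constrain a generic parameter by scalar category rather than by a concrete dtype, with the return annotation resolving to the caller's scalar type. A minimal sketch (hypothetical function, assuming the usual wp.abs builtin):

    import warp as wp

    @wp.func
    def norm1(v: wp.vec(3, wp.Float)) -> wp.Float:
        # accepts vec3h/vec3f/vec3d; the return type matches the input's scalar type
        return wp.abs(v[0]) + wp.abs(v[1]) + wp.abs(v[2])

    @wp.kernel
    def norm1_kernel(a: wp.array(dtype=wp.vec3d), out: wp.array(dtype=wp.float64)):
        i = wp.tid()
        out[i] = norm1(a[i])

    a = wp.array([(1.0, -2.0, 3.0)], dtype=wp.vec3d)
    out = wp.zeros(1, dtype=wp.float64)
    wp.launch(norm1_kernel, dim=1, inputs=[a, out])
    print(out.numpy())  # [6.]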
warp/tests/test_iter.py ADDED
@@ -0,0 +1,68 @@
+# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import unittest
+
+import warp as wp
+from warp.tests.unittest_utils import *
+
+
+@wp.kernel
+def reversed_kernel(
+    start: wp.int32,
+    end: wp.int32,
+    step: wp.int32,
+    out_count: wp.array(dtype=wp.int32),
+    out_values: wp.array(dtype=wp.int32),
+):
+    count = wp.int32(0)
+    for i in reversed(range(start, end, step)):
+        out_values[count] = i
+        count += 1
+
+    out_count[0] = count
+
+
+def test_reversed(test, device):
+    count = wp.empty(1, dtype=wp.int32)
+    values = wp.empty(32, dtype=wp.int32)
+
+    start, end, step = (-2, 8, 3)
+    wp.launch(
+        reversed_kernel,
+        dim=1,
+        inputs=(start, end, step),
+        outputs=(count, values),
+    )
+    expected = tuple(reversed(range(start, end, step)))
+    assert count.numpy()[0] == len(expected)
+    assert_np_equal(values.numpy()[: len(expected)], expected)
+
+    start, end, step = (9, -3, -2)
+    wp.launch(
+        reversed_kernel,
+        dim=1,
+        inputs=(start, end, step),
+        outputs=(count, values),
+    )
+    expected = tuple(reversed(range(start, end, step)))
+    assert count.numpy()[0] == len(expected)
+    assert_np_equal(values.numpy()[: len(expected)], expected)
+
+
+devices = get_test_devices()
+
+
+class TestIter(unittest.TestCase):
+    pass
+
+
+add_function_test(TestIter, "test_reversed", test_reversed, devices=devices)
+
+if __name__ == "__main__":
+    wp.clear_kernel_cache()
+    unittest.main(verbosity=2)
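The new test_iter.py covers reversed() over range() iterators inside kernels, including negative steps, mirroring Python semantics. A minimal usage sketch (hypothetical kernel, not from the diff):

    import warp as wp

    @wp.kernel
    def countdown(out: wp.array(dtype=wp.int32)):
        n = wp.int32(0)
        # Python-style reversed(range(...)) now works inside kernel code
        for i in reversed(range(0, 10, 2)):  # yields 8, 6, 4, 2, 0
            out[n] = i
            n += 1

    out = wp.zeros(5, dtype=wp.int32)
    wp.launch(countdown, dim=1, inputs=[out])
    print(out.numpy())  # [8 6 4 2 0]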
warp/tests/test_mat_scalar_ops.py CHANGED
@@ -1501,7 +1501,7 @@ def test_matmat_multiplication(test, device, dtype, register_kernels=False):
     tol = {
         np.float16: 2.0e-2,
         np.float32: 5.0e-6,
-        np.float64: 1.0e-8,
+        np.float64: 5.0e-7,
     }.get(dtype, 0)
 
     wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
warp/tests/test_mesh_query_point.py CHANGED
@@ -769,20 +769,21 @@ def point_query_aabb_and_closest(
 
 @unittest.skipUnless(USD_AVAILABLE, "Requires usd-core")
 def test_set_mesh_points(test, device):
+    rng = np.random.default_rng(123)
+
     vs, fs = load_mesh()
 
     vertices1 = wp.array(vs, dtype=wp.vec3, device=device)
-    velocities1_np = np.random.randn(vertices1.shape[0], 3)
+    velocities1_np = rng.standard_normal(size=(vertices1.shape[0], 3))
     velocities1 = wp.array(velocities1_np, dtype=wp.vec3, device=device)
 
     faces = wp.array(fs, dtype=wp.int32, device=device)
     mesh = wp.Mesh(vertices1, faces, velocities=velocities1)
     fs_2D = faces.reshape((-1, 3))
-    np.random.seed(12345)
     n = 1000
     query_radius = 0.2
 
-    pts1 = wp.array(np.random.randn(n, 3), dtype=wp.vec3, device=device)
+    pts1 = wp.array(rng.standard_normal(size=(n, 3)), dtype=wp.vec3, device=device)
 
     query_results_num_cols1 = wp.zeros(n, dtype=wp.int32, device=device)
     query_results_min_dist1 = wp.zeros(n, dtype=float, device=device)
@@ -804,7 +805,7 @@ def test_set_mesh_points(test, device):
         device=device,
     )
 
-    shift = np.random.randn(3)
+    shift = rng.standard_normal(size=3)
 
     vs_higher = vs + shift
     vertices2 = wp.array(vs_higher, dtype=wp.vec3, device=device)
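The test_mesh_query_point.py change is a NumPy RNG modernization: the legacy global-state API (np.random.seed / np.random.randn) is replaced with a local Generator, which keeps the test deterministic without mutating global state. The two patterns side by side (plain NumPy, nothing Warp-specific):

    import numpy as np

    # legacy pattern (global state, removed by the diff):
    np.random.seed(12345)
    x = np.random.randn(3)

    # Generator pattern (local, order-independent), as adopted above:
    rng = np.random.default_rng(123)
    y = rng.standard_normal(size=3)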
warp/tests/test_module_hashing.py CHANGED
@@ -214,12 +214,35 @@ def test_function_generic_overload_hashing(test, device):
     test.assertNotEqual(hash4, hash1)
 
 
+SIMPLE_MODULE = """# -*- coding: utf-8 -*-
+import warp as wp
+
+@wp.kernel
+def k():
+    pass
+"""
+
+
+def test_module_load(test, device):
+    """Ensure that loading a module does not change its hash"""
+    m = load_code_as_module(SIMPLE_MODULE, "simple_module")
+
+    hash1 = m.hash_module()
+    m.load(device)
+    hash2 = m.hash_module()
+
+    test.assertEqual(hash1, hash2)
+
+
 class TestModuleHashing(unittest.TestCase):
     pass
 
 
+devices = get_test_devices()
+
 add_function_test(TestModuleHashing, "test_function_overload_hashing", test_function_overload_hashing)
 add_function_test(TestModuleHashing, "test_function_generic_overload_hashing", test_function_generic_overload_hashing)
+add_function_test(TestModuleHashing, "test_module_load", test_module_load, devices=devices)
 
 
 if __name__ == "__main__":
warp/tests/test_paddle.py CHANGED
@@ -7,8 +7,6 @@
 
 import unittest
 
-import numpy as np
-
 import warp as wp
 from warp.tests.unittest_utils import *
 
@@ -444,7 +442,7 @@ def test_from_paddle_slices(test, device):
     assert a.ptr == t.data_ptr()
     assert a.is_contiguous
     assert a.shape == tuple(t.shape)
-    assert_np_equal(a.numpy(), t.cpu().numpy())
+    assert_np_equal(a.numpy(), t.numpy())
 
     # 1D slice with non-contiguous stride
     t_base = paddle.arange(10, dtype=paddle.float32).to(device=paddle_device)
@@ -456,7 +454,7 @@ def test_from_paddle_slices(test, device):
     # copy contents to contiguous array
     a_contiguous = wp.empty_like(a)
     wp.launch(copy1d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
-    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+    assert_np_equal(a_contiguous.numpy(), t.numpy())
 
     # 2D slices (non-contiguous)
     t_base = paddle.arange(24, dtype=paddle.float32).to(device=paddle_device).reshape((4, 6))
@@ -468,7 +466,7 @@ def test_from_paddle_slices(test, device):
     # copy contents to contiguous array
     a_contiguous = wp.empty_like(a)
     wp.launch(copy2d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
-    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+    assert_np_equal(a_contiguous.numpy(), t.numpy())
 
     # 3D slices (non-contiguous)
     t_base = paddle.arange(36, dtype=paddle.float32).to(device=paddle_device).reshape((4, 3, 3))
@@ -480,7 +478,7 @@ def test_from_paddle_slices(test, device):
     # copy contents to contiguous array
     a_contiguous = wp.empty_like(a)
     wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
-    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+    assert_np_equal(a_contiguous.numpy(), t.numpy())
 
     # 2D slices of vec3 (inner contiguous, outer non-contiguous)
     t_base = paddle.arange(150, dtype=paddle.float32).to(device=paddle_device).reshape((10, 5, 3))
@@ -492,7 +490,7 @@ def test_from_paddle_slices(test, device):
     # copy contents to contiguous array
     a_contiguous = wp.empty_like(a)
     wp.launch(copy2d_vec3_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
-    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+    assert_np_equal(a_contiguous.numpy(), t.numpy())
 
     # 2D slices of mat22 (inner contiguous, outer non-contiguous)
     t_base = paddle.arange(200, dtype=paddle.float32).to(device=paddle_device).reshape((10, 5, 2, 2))
@@ -504,7 +502,7 @@ def test_from_paddle_slices(test, device):
     # copy contents to contiguous array
     a_contiguous = wp.empty_like(a)
     wp.launch(copy2d_mat22_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
-    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+    assert_np_equal(a_contiguous.numpy(), t.numpy())
 
 
 def test_from_paddle_zero_strides(test, device):
@@ -522,7 +520,7 @@ def test_from_paddle_zero_strides(test, device):
     assert a.shape == tuple(t.shape)
     a_contiguous = wp.empty_like(a)
     wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
-    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+    assert_np_equal(a_contiguous.numpy(), t.numpy())
 
     # expand middle dimension
     t = t_base.unsqueeze(1).expand([-1, 3, -1])
@@ -532,7 +530,7 @@ def test_from_paddle_zero_strides(test, device):
     assert a.shape == tuple(t.shape)
     a_contiguous = wp.empty_like(a)
     wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
-    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+    assert_np_equal(a_contiguous.numpy(), t.numpy())
 
     # expand innermost dimension
     t = t_base.unsqueeze(2).expand([-1, -1, 3])
@@ -542,77 +540,7 @@ def test_from_paddle_zero_strides(test, device):
     assert a.shape == tuple(t.shape)
     a_contiguous = wp.empty_like(a)
     wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
-    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
-
-
-def test_paddle_mgpu_from_paddle(test, device):
-    import paddle
-
-    n = 32
-
-    t0 = paddle.arange(0, n, 1, dtype=paddle.int32).to(device="gpu:0")
-    t1 = paddle.arange(0, n * 2, 2, dtype=paddle.int32).to(device="gpu:1")
-
-    a0 = wp.from_paddle(t0, dtype=wp.int32)
-    a1 = wp.from_paddle(t1, dtype=wp.int32)
-
-    assert a0.device == "gpu:0"
-    assert a1.device == "gpu:1"
-
-    expected0 = np.arange(0, n, 1)
-    expected1 = np.arange(0, n * 2, 2)
-
-    assert_np_equal(a0.numpy(), expected0)
-    assert_np_equal(a1.numpy(), expected1)
-
-
-def test_paddle_mgpu_to_paddle(test, device):
-    n = 32
-
-    with wp.ScopedDevice("gpu:0"):
-        a0 = wp.empty(n, dtype=wp.int32)
-        wp.launch(arange, dim=a0.size, inputs=[0, 1, a0])
-
-    with wp.ScopedDevice("gpu:1"):
-        a1 = wp.empty(n, dtype=wp.int32)
-        wp.launch(arange, dim=a1.size, inputs=[0, 2, a1])
-
-    t0 = wp.to_paddle(a0)
-    t1 = wp.to_paddle(a1)
-
-    assert str(t0.device) == "gpu:0"
-    assert str(t1.device) == "gpu:1"
-
-    expected0 = np.arange(0, n, 1, dtype=np.int32)
-    expected1 = np.arange(0, n * 2, 2, dtype=np.int32)
-
-    assert_np_equal(t0.cpu().numpy(), expected0)
-    assert_np_equal(t1.cpu().numpy(), expected1)
-
-
-def test_paddle_mgpu_interop(test, device):
-    import paddle
-
-    n = 1024 * 1024
-
-    with paddle.cuda.device(0):
-        t0 = paddle.arange(n, dtype=paddle.float32).to(device="gpu")
-        a0 = wp.from_paddle(t0)
-        wp.launch(inc, dim=a0.size, inputs=[a0], stream=wp.stream_from_paddle())
-
-    with paddle.cuda.device(1):
-        t1 = paddle.arange(n, dtype=paddle.float32).to(device="gpu")
-        a1 = wp.from_paddle(t1)
-        wp.launch(inc, dim=a1.size, inputs=[a1], stream=wp.stream_from_paddle())
-
-    assert a0.device == "gpu:0"
-    assert a1.device == "gpu:1"
-
-    expected = np.arange(n, dtype=int) + 1
-
-    # ensure the paddle tensors were modified by warp
-    assert_np_equal(t0.cpu().numpy(), expected)
-    assert_np_equal(t1.cpu().numpy(), expected)
+    assert_np_equal(a_contiguous.numpy(), t.numpy())
 
 
 def test_paddle_autograd(test, device):
@@ -624,6 +552,9 @@ def test_paddle_autograd(test, device):
     class TestFunc(paddle.autograd.PyLayer):
         @staticmethod
         def forward(ctx, x):
+            # ensure Paddle operations complete before running Warp
+            wp.synchronize_device()
+
             # allocate output array
             y = paddle.empty_like(x)
 
@@ -632,10 +563,16 @@ def test_paddle_autograd(test, device):
 
             wp.launch(kernel=op_kernel, dim=len(x), inputs=[wp.from_paddle(x)], outputs=[wp.from_paddle(y)])
 
+            # ensure Warp operations complete before returning data to Paddle
+            wp.synchronize_device()
+
             return y
 
         @staticmethod
        def backward(ctx, adj_y):
+            # ensure Paddle operations complete before running Warp
+            wp.synchronize_device()
+
             # adjoints should be allocated as zero initialized
             adj_x = paddle.zeros_like(ctx.x).contiguous()
             adj_y = adj_y.contiguous()
@@ -655,6 +592,9 @@ def test_paddle_autograd(test, device):
                 adjoint=True,
             )
 
+            # ensure Warp operations complete before returning data to Paddle
+            wp.synchronize_device()
+
             return adj_x
 
     # run autograd on given device
@@ -691,7 +631,7 @@ def test_warp_graph_warp_stream(test, device):
     paddle_stream = wp.stream_to_paddle(device)
 
     # capture graph
-    with wp.ScopedDevice(device), paddle.device.stream(paddle_stream):
+    with wp.ScopedDevice(device), paddle.device.stream_guard(paddle.device.Stream(paddle_stream)):
         wp.capture_begin(force_module_load=False)
         try:
             t += 1.0
@@ -837,11 +777,11 @@ try:
    #     devices=paddle_compatible_cuda_devices,
    # )
 
-    # multi-GPU tests
-    if len(paddle_compatible_cuda_devices) > 1:
-        add_function_test(TestPaddle, "test_paddle_mgpu_from_paddle", test_paddle_mgpu_from_paddle)
-        add_function_test(TestPaddle, "test_paddle_mgpu_to_paddle", test_paddle_mgpu_to_paddle)
-        add_function_test(TestPaddle, "test_paddle_mgpu_interop", test_paddle_mgpu_interop)
+    # multi-GPU not supported yet.
+    # if len(paddle_compatible_cuda_devices) > 1:
+    #     add_function_test(TestPaddle, "test_paddle_mgpu_from_paddle", test_paddle_mgpu_from_paddle)
+    #     add_function_test(TestPaddle, "test_paddle_mgpu_to_paddle", test_paddle_mgpu_to_paddle)
+    #     add_function_test(TestPaddle, "test_paddle_mgpu_interop", test_paddle_mgpu_interop)
 
 except Exception as e:
     print(f"Skipping Paddle tests due to exception: {e}")