warp-lang 1.3.2-py3-none-win_amd64.whl → 1.4.0-py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (108)
  1. warp/__init__.py +6 -0
  2. warp/autograd.py +59 -6
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build_dll.py +8 -10
  6. warp/builtins.py +126 -4
  7. warp/codegen.py +435 -53
  8. warp/config.py +1 -1
  9. warp/context.py +678 -403
  10. warp/dlpack.py +2 -0
  11. warp/examples/benchmarks/benchmark_cloth.py +10 -0
  12. warp/examples/core/example_render_opengl.py +12 -10
  13. warp/examples/fem/example_adaptive_grid.py +251 -0
  14. warp/examples/fem/example_apic_fluid.py +1 -1
  15. warp/examples/fem/example_diffusion_3d.py +2 -2
  16. warp/examples/fem/example_magnetostatics.py +1 -1
  17. warp/examples/fem/example_streamlines.py +1 -0
  18. warp/examples/fem/utils.py +23 -4
  19. warp/examples/sim/example_cloth.py +50 -6
  20. warp/fem/__init__.py +2 -0
  21. warp/fem/adaptivity.py +493 -0
  22. warp/fem/field/field.py +2 -1
  23. warp/fem/field/nodal_field.py +18 -26
  24. warp/fem/field/test.py +4 -4
  25. warp/fem/field/trial.py +4 -4
  26. warp/fem/geometry/__init__.py +1 -0
  27. warp/fem/geometry/adaptive_nanogrid.py +843 -0
  28. warp/fem/geometry/nanogrid.py +55 -28
  29. warp/fem/space/__init__.py +1 -1
  30. warp/fem/space/nanogrid_function_space.py +69 -35
  31. warp/fem/utils.py +113 -107
  32. warp/jax_experimental.py +28 -15
  33. warp/native/array.h +0 -1
  34. warp/native/builtin.h +103 -6
  35. warp/native/bvh.cu +2 -0
  36. warp/native/cuda_util.cpp +14 -0
  37. warp/native/cuda_util.h +2 -0
  38. warp/native/error.cpp +4 -2
  39. warp/native/exports.h +99 -17
  40. warp/native/mat.h +97 -0
  41. warp/native/mesh.cpp +36 -0
  42. warp/native/mesh.cu +51 -0
  43. warp/native/mesh.h +1 -0
  44. warp/native/quat.h +43 -0
  45. warp/native/spatial.h +6 -0
  46. warp/native/vec.h +74 -0
  47. warp/native/warp.cpp +2 -1
  48. warp/native/warp.cu +10 -3
  49. warp/native/warp.h +8 -1
  50. warp/paddle.py +382 -0
  51. warp/sim/__init__.py +1 -0
  52. warp/sim/collide.py +519 -0
  53. warp/sim/integrator_euler.py +18 -5
  54. warp/sim/integrator_featherstone.py +5 -5
  55. warp/sim/integrator_vbd.py +1026 -0
  56. warp/sim/model.py +49 -23
  57. warp/stubs.py +459 -0
  58. warp/tape.py +2 -0
  59. warp/tests/aux_test_dependent.py +1 -0
  60. warp/tests/aux_test_name_clash1.py +32 -0
  61. warp/tests/aux_test_name_clash2.py +32 -0
  62. warp/tests/aux_test_square.py +1 -0
  63. warp/tests/test_array.py +222 -0
  64. warp/tests/test_async.py +3 -3
  65. warp/tests/test_atomic.py +6 -0
  66. warp/tests/test_closest_point_edge_edge.py +93 -1
  67. warp/tests/test_codegen.py +62 -15
  68. warp/tests/test_codegen_instancing.py +1457 -0
  69. warp/tests/test_collision.py +486 -0
  70. warp/tests/test_compile_consts.py +3 -28
  71. warp/tests/test_dlpack.py +170 -0
  72. warp/tests/test_examples.py +22 -8
  73. warp/tests/test_fast_math.py +10 -4
  74. warp/tests/test_fem.py +64 -0
  75. warp/tests/test_func.py +46 -0
  76. warp/tests/test_implicit_init.py +49 -0
  77. warp/tests/test_jax.py +58 -0
  78. warp/tests/test_mat.py +84 -0
  79. warp/tests/test_mesh_query_point.py +188 -0
  80. warp/tests/test_module_hashing.py +40 -0
  81. warp/tests/test_multigpu.py +3 -3
  82. warp/tests/test_overwrite.py +8 -0
  83. warp/tests/test_paddle.py +852 -0
  84. warp/tests/test_print.py +89 -0
  85. warp/tests/test_quat.py +111 -0
  86. warp/tests/test_reload.py +31 -1
  87. warp/tests/test_scalar_ops.py +2 -0
  88. warp/tests/test_static.py +412 -0
  89. warp/tests/test_streams.py +64 -3
  90. warp/tests/test_struct.py +4 -4
  91. warp/tests/test_torch.py +24 -0
  92. warp/tests/test_triangle_closest_point.py +137 -0
  93. warp/tests/test_types.py +1 -1
  94. warp/tests/test_vbd.py +386 -0
  95. warp/tests/test_vec.py +143 -0
  96. warp/tests/test_vec_scalar_ops.py +139 -0
  97. warp/tests/test_volume.py +30 -0
  98. warp/tests/unittest_suites.py +12 -0
  99. warp/tests/unittest_utils.py +9 -5
  100. warp/thirdparty/dlpack.py +3 -1
  101. warp/types.py +157 -34
  102. warp/utils.py +37 -14
  103. {warp_lang-1.3.2.dist-info → warp_lang-1.4.0.dist-info}/METADATA +10 -8
  104. {warp_lang-1.3.2.dist-info → warp_lang-1.4.0.dist-info}/RECORD +107 -95
  105. warp/tests/test_point_triangle_closest_point.py +0 -143
  106. {warp_lang-1.3.2.dist-info → warp_lang-1.4.0.dist-info}/LICENSE.md +0 -0
  107. {warp_lang-1.3.2.dist-info → warp_lang-1.4.0.dist-info}/WHEEL +0 -0
  108. {warp_lang-1.3.2.dist-info → warp_lang-1.4.0.dist-info}/top_level.txt +0 -0
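
Among the files listed above, warp/paddle.py (+382 lines) and warp/tests/test_paddle.py (+852 lines, shown in full below) introduce Paddle interoperability. As a rough orientation only (this snippet is not code from the package; the kernel name `scale` and the array sizes are illustrative), the round-trip that these tests exercise looks like this:

    import paddle
    import warp as wp

    @wp.kernel
    def scale(a: wp.array(dtype=float)):
        tid = wp.tid()
        a[tid] = a[tid] * 2.0

    device = wp.get_device()                     # current Warp device
    paddle_device = wp.device_to_paddle(device)  # e.g. "gpu:0" or "cpu"

    # zero-copy wrap of a Paddle tensor as a Warp array
    t = paddle.arange(10, dtype=paddle.float32).to(device=paddle_device)
    a = wp.from_paddle(t)
    wp.launch(scale, dim=a.size, inputs=[a], device=device)

    # and back: expose a Warp array as a Paddle tensor
    b = wp.zeros(10, dtype=wp.float32, device=device)
    tb = wp.to_paddle(b)

Both directions share the underlying buffer (the tests below assert a.ptr == t.data_ptr()), so no copies are made.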
@@ -0,0 +1,852 @@
+# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import unittest
+
+import numpy as np
+
+import warp as wp
+from warp.tests.unittest_utils import *
+
+
+@wp.kernel
+def op_kernel(x: wp.array(dtype=float), y: wp.array(dtype=float)):
+    tid = wp.tid()
+    y[tid] = 0.5 - x[tid] * 2.0
+
+
+@wp.kernel
+def inc(a: wp.array(dtype=float)):
+    tid = wp.tid()
+    a[tid] = a[tid] + 1.0
+
+
+@wp.kernel
+def inc_vector(a: wp.array(dtype=wp.vec3f)):
+    tid = wp.tid()
+    a[tid] = a[tid] + wp.vec3f(1.0)
+
+
+@wp.kernel
+def inc_matrix(a: wp.array(dtype=wp.mat22f)):
+    tid = wp.tid()
+    a[tid] = a[tid] + wp.mat22f(1.0)
+
+
+@wp.kernel
+def arange(start: int, step: int, a: wp.array(dtype=int)):
+    tid = wp.tid()
+    a[tid] = start + step * tid
+
+
+# copy elements between non-contiguous 1d arrays of float
+@wp.kernel
+def copy1d_float_kernel(dst: wp.array(dtype=float), src: wp.array(dtype=float)):
+    i = wp.tid()
+    dst[i] = src[i]
+
+
+# copy elements between non-contiguous 2d arrays of float
+@wp.kernel
+def copy2d_float_kernel(dst: wp.array2d(dtype=float), src: wp.array2d(dtype=float)):
+    i, j = wp.tid()
+    dst[i, j] = src[i, j]
+
+
+# copy elements between non-contiguous 3d arrays of float
+@wp.kernel
+def copy3d_float_kernel(dst: wp.array3d(dtype=float), src: wp.array3d(dtype=float)):
+    i, j, k = wp.tid()
+    dst[i, j, k] = src[i, j, k]
+
+
+# copy elements between non-contiguous 2d arrays of vec3
+@wp.kernel
+def copy2d_vec3_kernel(dst: wp.array2d(dtype=wp.vec3), src: wp.array2d(dtype=wp.vec3)):
+    i, j = wp.tid()
+    dst[i, j] = src[i, j]
+
+
+# copy elements between non-contiguous 2d arrays of mat22
+@wp.kernel
+def copy2d_mat22_kernel(dst: wp.array2d(dtype=wp.mat22), src: wp.array2d(dtype=wp.mat22)):
+    i, j = wp.tid()
+    dst[i, j] = src[i, j]
+
+
+def test_dtype_from_paddle(test, device):
+    import paddle
+
+    def test_conversions(paddle_type, warp_type):
+        test.assertEqual(wp.dtype_from_paddle(paddle_type), warp_type)
+
+    test_conversions(paddle.float16, wp.float16)
+    test_conversions(paddle.float32, wp.float32)
+    test_conversions(paddle.float64, wp.float64)
+    test_conversions(paddle.int8, wp.int8)
+    test_conversions(paddle.int16, wp.int16)
+    test_conversions(paddle.int32, wp.int32)
+    test_conversions(paddle.int64, wp.int64)
+    test_conversions(paddle.uint8, wp.uint8)
+    test_conversions(paddle.bool, wp.bool)
+
+
+def test_dtype_to_paddle(test, device):
+    import paddle
+
+    def test_conversions(warp_type, paddle_type):
+        test.assertEqual(wp.dtype_to_paddle(warp_type), paddle_type)
+
+    test_conversions(wp.float16, paddle.float16)
+    test_conversions(wp.float32, paddle.float32)
+    test_conversions(wp.float64, paddle.float64)
+    test_conversions(wp.int8, paddle.int8)
+    test_conversions(wp.int16, paddle.int16)
+    test_conversions(wp.int32, paddle.int32)
+    test_conversions(wp.int64, paddle.int64)
+    test_conversions(wp.uint8, paddle.uint8)
+    test_conversions(wp.uint16, paddle.int16)
+    test_conversions(wp.uint32, paddle.int32)
+    test_conversions(wp.uint64, paddle.int64)
+    test_conversions(wp.bool, paddle.bool)
+
+
+def test_device_conversion(test, device):
+    paddle_device = wp.device_to_paddle(device)
+    warp_device = wp.device_from_paddle(paddle_device)
+    test.assertEqual(warp_device, device)
+
+
+def test_paddle_zerocopy(test, device):
+    import paddle
+
+    a = wp.zeros(10, dtype=wp.float32, device=device)
+    t = wp.to_paddle(a)
+    assert a.ptr == t.data_ptr()
+
+    paddle_device = wp.device_to_paddle(device)
+
+    t = paddle.zeros([10], dtype=paddle.float32).to(device=paddle_device)
+    a = wp.from_paddle(t)
+    assert a.ptr == t.data_ptr()
+
+
+def test_from_paddle(test, device):
+    import paddle
+
+    paddle_device = wp.device_to_paddle(device)
+
+    # automatically determine warp dtype
+    def wrap_scalar_tensor_implicit(paddle_dtype, expected_warp_dtype):
+        t = paddle.zeros([10], dtype=paddle_dtype).to(device=paddle_device)
+        a = wp.from_paddle(t)
+        assert a.dtype == expected_warp_dtype
+        assert a.shape == tuple(t.shape)
+
+    wrap_scalar_tensor_implicit(paddle.float64, wp.float64)
+    wrap_scalar_tensor_implicit(paddle.float32, wp.float32)
+    wrap_scalar_tensor_implicit(paddle.float16, wp.float16)
+    wrap_scalar_tensor_implicit(paddle.int64, wp.int64)
+    wrap_scalar_tensor_implicit(paddle.int32, wp.int32)
+    wrap_scalar_tensor_implicit(paddle.int16, wp.int16)
+    wrap_scalar_tensor_implicit(paddle.int8, wp.int8)
+    wrap_scalar_tensor_implicit(paddle.uint8, wp.uint8)
+    wrap_scalar_tensor_implicit(paddle.bool, wp.bool)
+
+    # explicitly specify warp dtype
+    def wrap_scalar_tensor_explicit(paddle_dtype, expected_warp_dtype):
+        t = paddle.zeros([10], dtype=paddle_dtype).to(device=paddle_device)
+        a = wp.from_paddle(t, expected_warp_dtype)
+        assert a.dtype == expected_warp_dtype
+        assert a.shape == tuple(t.shape)
+
+    wrap_scalar_tensor_explicit(paddle.float64, wp.float64)
+    wrap_scalar_tensor_explicit(paddle.float32, wp.float32)
+    wrap_scalar_tensor_explicit(paddle.float16, wp.float16)
+    wrap_scalar_tensor_explicit(paddle.int64, wp.int64)
+    wrap_scalar_tensor_explicit(paddle.int64, wp.uint64)
+    wrap_scalar_tensor_explicit(paddle.int32, wp.int32)
+    wrap_scalar_tensor_explicit(paddle.int32, wp.uint32)
+    wrap_scalar_tensor_explicit(paddle.int16, wp.int16)
+    wrap_scalar_tensor_explicit(paddle.int16, wp.uint16)
+    wrap_scalar_tensor_explicit(paddle.int8, wp.int8)
+    wrap_scalar_tensor_explicit(paddle.int8, wp.uint8)
+    wrap_scalar_tensor_explicit(paddle.uint8, wp.uint8)
+    wrap_scalar_tensor_explicit(paddle.uint8, wp.int8)
+    wrap_scalar_tensor_explicit(paddle.bool, wp.uint8)
+    wrap_scalar_tensor_explicit(paddle.bool, wp.int8)
+    wrap_scalar_tensor_explicit(paddle.bool, wp.bool)
+
+    def wrap_vec_tensor(n, desired_warp_dtype):
+        t = paddle.zeros((10, n), dtype=paddle.float32).to(device=paddle_device)
+        a = wp.from_paddle(t, desired_warp_dtype)
+        assert a.dtype == desired_warp_dtype
+        assert a.shape == (10,)
+
+    wrap_vec_tensor(2, wp.vec2)
+    wrap_vec_tensor(3, wp.vec3)
+    wrap_vec_tensor(4, wp.vec4)
+    wrap_vec_tensor(6, wp.spatial_vector)
+    wrap_vec_tensor(7, wp.transform)
+
+    def wrap_mat_tensor(n, m, desired_warp_dtype):
+        t = paddle.zeros((10, n, m), dtype=paddle.float32).to(device=paddle_device)
+        a = wp.from_paddle(t, desired_warp_dtype)
+        assert a.dtype == desired_warp_dtype
+        assert a.shape == (10,)
+
+    wrap_mat_tensor(2, 2, wp.mat22)
+    wrap_mat_tensor(3, 3, wp.mat33)
+    wrap_mat_tensor(4, 4, wp.mat44)
+    wrap_mat_tensor(6, 6, wp.spatial_matrix)
+
+    def wrap_vec_tensor_with_grad(n, desired_warp_dtype):
+        t = paddle.zeros((10, n), dtype=paddle.float32).to(device=paddle_device)
+        a = wp.from_paddle(t, desired_warp_dtype)
+        a.requires_grad = True
+        assert a.dtype == desired_warp_dtype
+        assert a.shape == (10,)
+
+    wrap_vec_tensor_with_grad(2, wp.vec2)
+    wrap_vec_tensor_with_grad(3, wp.vec3)
+    wrap_vec_tensor_with_grad(4, wp.vec4)
+    wrap_vec_tensor_with_grad(6, wp.spatial_vector)
+    wrap_vec_tensor_with_grad(7, wp.transform)
+
+    def wrap_mat_tensor_with_grad(n, m, desired_warp_dtype):
+        t = paddle.zeros((10, n, m), dtype=paddle.float32).to(device=paddle_device)
+        a = wp.from_paddle(t, desired_warp_dtype, requires_grad=True)
+        assert a.dtype == desired_warp_dtype
+        assert a.shape == (10,)
+
+    wrap_mat_tensor_with_grad(2, 2, wp.mat22)
+    wrap_mat_tensor_with_grad(3, 3, wp.mat33)
+    wrap_mat_tensor_with_grad(4, 4, wp.mat44)
+    wrap_mat_tensor_with_grad(6, 6, wp.spatial_matrix)
+
+
+def test_array_ctype_from_paddle(test, device):
+    import paddle
+
+    paddle_device = wp.device_to_paddle(device)
+
+    # automatically determine warp dtype
+    def wrap_scalar_tensor_implicit(paddle_dtype):
+        t = paddle.zeros([10], dtype=paddle_dtype).to(device=paddle_device)
+        a = wp.from_paddle(t, return_ctype=True)
+        warp_dtype = wp.dtype_from_paddle(paddle_dtype)
+        ctype_size = ctypes.sizeof(warp_dtype._type_)
+        assert a.data == t.data_ptr()
+        assert a.grad == 0
+        assert a.ndim == 1
+        assert a.shape[0] == t.shape[0]
+        assert a.strides[0] == t.strides[0] * ctype_size
+
+    wrap_scalar_tensor_implicit(paddle.float64)
+    wrap_scalar_tensor_implicit(paddle.float32)
+    wrap_scalar_tensor_implicit(paddle.float16)
+    wrap_scalar_tensor_implicit(paddle.int64)
+    wrap_scalar_tensor_implicit(paddle.int32)
+    wrap_scalar_tensor_implicit(paddle.int16)
+    wrap_scalar_tensor_implicit(paddle.int8)
+    wrap_scalar_tensor_implicit(paddle.uint8)
+    wrap_scalar_tensor_implicit(paddle.bool)
+
+    # explicitly specify warp dtype
+    def wrap_scalar_tensor_explicit(paddle_dtype, warp_dtype):
+        t = paddle.zeros([10], dtype=paddle_dtype).to(device=paddle_device)
+        a = wp.from_paddle(t, dtype=warp_dtype, return_ctype=True)
+        ctype_size = ctypes.sizeof(warp_dtype._type_)
+        assert a.data == t.data_ptr()
+        assert a.grad == 0
+        assert a.ndim == 1
+        assert a.shape[0] == t.shape[0]
+        assert a.strides[0] == t.strides[0] * ctype_size
+
+    wrap_scalar_tensor_explicit(paddle.float64, wp.float64)
+    wrap_scalar_tensor_explicit(paddle.float32, wp.float32)
+    wrap_scalar_tensor_explicit(paddle.float16, wp.float16)
+    wrap_scalar_tensor_explicit(paddle.int64, wp.int64)
+    wrap_scalar_tensor_explicit(paddle.int64, wp.uint64)
+    wrap_scalar_tensor_explicit(paddle.int32, wp.int32)
+    wrap_scalar_tensor_explicit(paddle.int32, wp.uint32)
+    wrap_scalar_tensor_explicit(paddle.int16, wp.int16)
+    wrap_scalar_tensor_explicit(paddle.int16, wp.uint16)
+    wrap_scalar_tensor_explicit(paddle.int8, wp.int8)
+    wrap_scalar_tensor_explicit(paddle.int8, wp.uint8)
+    wrap_scalar_tensor_explicit(paddle.uint8, wp.uint8)
+    wrap_scalar_tensor_explicit(paddle.uint8, wp.int8)
+    wrap_scalar_tensor_explicit(paddle.bool, wp.uint8)
+    wrap_scalar_tensor_explicit(paddle.bool, wp.int8)
+    wrap_scalar_tensor_explicit(paddle.bool, wp.bool)
+
+    def wrap_vec_tensor(vec_dtype):
+        t = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device)
+        a = wp.from_paddle(t, dtype=vec_dtype, return_ctype=True)
+        ctype_size = ctypes.sizeof(vec_dtype._type_)
+        assert a.data == t.data_ptr()
+        assert a.grad == 0
+        assert a.ndim == 1
+        assert a.shape[0] == t.shape[0]
+        assert a.strides[0] == t.strides[0] * ctype_size
+
+    wrap_vec_tensor(wp.vec2)
+    wrap_vec_tensor(wp.vec3)
+    wrap_vec_tensor(wp.vec4)
+    wrap_vec_tensor(wp.spatial_vector)
+    wrap_vec_tensor(wp.transform)
+
+    def wrap_mat_tensor(mat_dtype):
+        t = paddle.zeros((10, *mat_dtype._shape_), dtype=paddle.float32).to(device=paddle_device)
+        a = wp.from_paddle(t, dtype=mat_dtype, return_ctype=True)
+        ctype_size = ctypes.sizeof(mat_dtype._type_)
+        assert a.data == t.data_ptr()
+        assert a.grad == 0
+        assert a.ndim == 1
+        assert a.shape[0] == t.shape[0]
+        assert a.strides[0] == t.strides[0] * ctype_size
+
+    wrap_mat_tensor(wp.mat22)
+    wrap_mat_tensor(wp.mat33)
+    wrap_mat_tensor(wp.mat44)
+    wrap_mat_tensor(wp.spatial_matrix)
+
+    def wrap_vec_tensor_with_existing_grad(vec_dtype):
+        t = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device)
+        t.stop_gradient = False
+        t.grad_ = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device)
+        a = wp.from_paddle(t, dtype=vec_dtype, return_ctype=True)
+        ctype_size = ctypes.sizeof(vec_dtype._type_)
+        assert a.data == t.data_ptr()
+        assert a.grad == t.grad.data_ptr()
+        assert a.ndim == 1
+        assert a.shape[0] == t.shape[0]
+        assert a.strides[0] == t.strides[0] * ctype_size
+
+    wrap_vec_tensor_with_existing_grad(wp.vec2)
+    wrap_vec_tensor_with_existing_grad(wp.vec3)
+    wrap_vec_tensor_with_existing_grad(wp.vec4)
+    wrap_vec_tensor_with_existing_grad(wp.spatial_vector)
+    wrap_vec_tensor_with_existing_grad(wp.transform)
+
+    def wrap_vec_tensor_with_new_grad(vec_dtype):
+        t = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device)
+        a = wp.from_paddle(t, dtype=vec_dtype, requires_grad=True, return_ctype=True)
+        ctype_size = ctypes.sizeof(vec_dtype._type_)
+        assert a.data == t.data_ptr()
+        assert a.grad == t.grad.data_ptr()
+        assert a.ndim == 1
+        assert a.shape[0] == t.shape[0]
+        assert a.strides[0] == t.strides[0] * ctype_size
+
+    wrap_vec_tensor_with_new_grad(wp.vec2)
+    wrap_vec_tensor_with_new_grad(wp.vec3)
+    wrap_vec_tensor_with_new_grad(wp.vec4)
+    wrap_vec_tensor_with_new_grad(wp.spatial_vector)
+    wrap_vec_tensor_with_new_grad(wp.transform)
+
+    def wrap_vec_tensor_with_paddle_grad(vec_dtype):
+        t = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device)
+        grad = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device)
+        a = wp.from_paddle(t, dtype=vec_dtype, grad=grad, return_ctype=True)
+        ctype_size = ctypes.sizeof(vec_dtype._type_)
+        assert a.data == t.data_ptr()
+        assert a.grad == grad.data_ptr()
+        assert a.ndim == 1
+        assert a.shape[0] == t.shape[0]
+        assert a.strides[0] == t.strides[0] * ctype_size
+
+    wrap_vec_tensor_with_paddle_grad(wp.vec2)
+    wrap_vec_tensor_with_paddle_grad(wp.vec3)
+    wrap_vec_tensor_with_paddle_grad(wp.vec4)
+    wrap_vec_tensor_with_paddle_grad(wp.spatial_vector)
+    wrap_vec_tensor_with_paddle_grad(wp.transform)
+
+    def wrap_vec_tensor_with_warp_grad(vec_dtype):
+        t = paddle.zeros((10, vec_dtype._length_), dtype=paddle.float32).to(device=paddle_device)
+        grad = wp.zeros(10, dtype=vec_dtype, device=device)
+        a = wp.from_paddle(t, dtype=vec_dtype, grad=grad, return_ctype=True)
+        ctype_size = ctypes.sizeof(vec_dtype._type_)
+        assert a.data == t.data_ptr()
+        assert a.grad == grad.ptr
+        assert a.ndim == 1
+        assert a.shape[0] == t.shape[0]
+        assert a.strides[0] == t.strides[0] * ctype_size
+
+    wrap_vec_tensor_with_warp_grad(wp.vec2)
+    wrap_vec_tensor_with_warp_grad(wp.vec3)
+    wrap_vec_tensor_with_warp_grad(wp.vec4)
+    wrap_vec_tensor_with_warp_grad(wp.spatial_vector)
+    wrap_vec_tensor_with_warp_grad(wp.transform)
+
+
+def test_to_paddle(test, device):
+    import paddle
+
+    def wrap_scalar_array(warp_dtype, expected_paddle_dtype):
+        a = wp.zeros(10, dtype=warp_dtype, device=device)
+        t = wp.to_paddle(a)
+        assert t.dtype == expected_paddle_dtype
+        assert tuple(t.shape) == a.shape
+
+    wrap_scalar_array(wp.float64, paddle.float64)
+    wrap_scalar_array(wp.float32, paddle.float32)
+    wrap_scalar_array(wp.float16, paddle.float16)
+    wrap_scalar_array(wp.int64, paddle.int64)
+    wrap_scalar_array(wp.int32, paddle.int32)
+    wrap_scalar_array(wp.int16, paddle.int16)
+    wrap_scalar_array(wp.int8, paddle.int8)
+    wrap_scalar_array(wp.uint8, paddle.uint8)
+    wrap_scalar_array(wp.bool, paddle.bool)
+
+    # not supported by paddle
+    # wrap_scalar_array(wp.uint64, paddle.int64)
+    # wrap_scalar_array(wp.uint32, paddle.int32)
+    # wrap_scalar_array(wp.uint16, paddle.int16)
+
+    def wrap_vec_array(n, warp_dtype):
+        a = wp.zeros(10, dtype=warp_dtype, device=device)
+        t = wp.to_paddle(a)
+        assert t.dtype == paddle.float32
+        assert tuple(t.shape) == (10, n)
+
+    wrap_vec_array(2, wp.vec2)
+    wrap_vec_array(3, wp.vec3)
+    wrap_vec_array(4, wp.vec4)
+    wrap_vec_array(6, wp.spatial_vector)
+    wrap_vec_array(7, wp.transform)
+
+    def wrap_mat_array(n, m, warp_dtype):
+        a = wp.zeros(10, dtype=warp_dtype, device=device)
+        t = wp.to_paddle(a)
+        assert t.dtype == paddle.float32
+        assert tuple(t.shape) == (10, n, m)
+
+    wrap_mat_array(2, 2, wp.mat22)
+    wrap_mat_array(3, 3, wp.mat33)
+    wrap_mat_array(4, 4, wp.mat44)
+    wrap_mat_array(6, 6, wp.spatial_matrix)
+
+
+def test_from_paddle_slices(test, device):
+    import paddle
+
+    paddle_device = wp.device_to_paddle(device)
+
+    # 1D slice, contiguous
+    t_base = paddle.arange(10, dtype=paddle.float32).to(device=paddle_device)
+    t = t_base[2:9]
+    a = wp.from_paddle(t)
+    assert a.ptr == t.data_ptr()
+    assert a.is_contiguous
+    assert a.shape == tuple(t.shape)
+    assert_np_equal(a.numpy(), t.cpu().numpy())
+
+    # 1D slice with non-contiguous stride
+    t_base = paddle.arange(10, dtype=paddle.float32).to(device=paddle_device)
+    t = t_base[2:9:2]
+    a = wp.from_paddle(t)
+    assert a.ptr == t.data_ptr()
+    assert not a.is_contiguous
+    assert a.shape == tuple(t.shape)
+    # copy contents to contiguous array
+    a_contiguous = wp.empty_like(a)
+    wp.launch(copy1d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+    # 2D slices (non-contiguous)
+    t_base = paddle.arange(24, dtype=paddle.float32).to(device=paddle_device).reshape((4, 6))
+    t = t_base[1:3, 2:5]
+    a = wp.from_paddle(t)
+    assert a.ptr == t.data_ptr()
+    assert not a.is_contiguous
+    assert a.shape == tuple(t.shape)
+    # copy contents to contiguous array
+    a_contiguous = wp.empty_like(a)
+    wp.launch(copy2d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+    # 3D slices (non-contiguous)
+    t_base = paddle.arange(36, dtype=paddle.float32).to(device=paddle_device).reshape((4, 3, 3))
+    t = t_base[::2, 0:1, 1:2]
+    a = wp.from_paddle(t)
+    assert a.ptr == t.data_ptr()
+    assert not a.is_contiguous
+    assert a.shape == tuple(t.shape)
+    # copy contents to contiguous array
+    a_contiguous = wp.empty_like(a)
+    wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+    # 2D slices of vec3 (inner contiguous, outer non-contiguous)
+    t_base = paddle.arange(150, dtype=paddle.float32).to(device=paddle_device).reshape((10, 5, 3))
+    t = t_base[1:7:2, 2:5]
+    a = wp.from_paddle(t, dtype=wp.vec3)
+    assert a.ptr == t.data_ptr()
+    assert not a.is_contiguous
+    assert a.shape == tuple(t.shape[:-1])
+    # copy contents to contiguous array
+    a_contiguous = wp.empty_like(a)
+    wp.launch(copy2d_vec3_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+    # 2D slices of mat22 (inner contiguous, outer non-contiguous)
+    t_base = paddle.arange(200, dtype=paddle.float32).to(device=paddle_device).reshape((10, 5, 2, 2))
+    t = t_base[1:7:2, 2:5]
+    a = wp.from_paddle(t, dtype=wp.mat22)
+    assert a.ptr == t.data_ptr()
+    assert not a.is_contiguous
+    assert a.shape == tuple(t.shape[:-2])
+    # copy contents to contiguous array
+    a_contiguous = wp.empty_like(a)
+    wp.launch(copy2d_mat22_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+
+def test_from_paddle_zero_strides(test, device):
+    import paddle
+
+    paddle_device = wp.device_to_paddle(device)
+
+    t_base = paddle.arange(9, dtype=paddle.float32).to(device=paddle_device).reshape((3, 3))
+
+    # expand outermost dimension
+    t = t_base.unsqueeze(0).expand([3, -1, -1])
+    a = wp.from_paddle(t)
+    assert a.ptr == t.data_ptr()
+    assert a.is_contiguous
+    assert a.shape == tuple(t.shape)
+    a_contiguous = wp.empty_like(a)
+    wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+    # expand middle dimension
+    t = t_base.unsqueeze(1).expand([-1, 3, -1])
+    a = wp.from_paddle(t)
+    assert a.ptr == t.data_ptr()
+    assert a.is_contiguous
+    assert a.shape == tuple(t.shape)
+    a_contiguous = wp.empty_like(a)
+    wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+    # expand innermost dimension
+    t = t_base.unsqueeze(2).expand([-1, -1, 3])
+    a = wp.from_paddle(t)
+    assert a.ptr == t.data_ptr()
+    assert a.is_contiguous
+    assert a.shape == tuple(t.shape)
+    a_contiguous = wp.empty_like(a)
+    wp.launch(copy3d_float_kernel, dim=a.shape, inputs=[a_contiguous, a], device=device)
+    assert_np_equal(a_contiguous.numpy(), t.cpu().numpy())
+
+
+def test_paddle_mgpu_from_paddle(test, device):
+    import paddle
+
+    n = 32
+
+    t0 = paddle.arange(0, n, 1, dtype=paddle.int32).to(device="gpu:0")
+    t1 = paddle.arange(0, n * 2, 2, dtype=paddle.int32).to(device="gpu:1")
+
+    a0 = wp.from_paddle(t0, dtype=wp.int32)
+    a1 = wp.from_paddle(t1, dtype=wp.int32)
+
+    assert a0.device == "gpu:0"
+    assert a1.device == "gpu:1"
+
+    expected0 = np.arange(0, n, 1)
+    expected1 = np.arange(0, n * 2, 2)
+
+    assert_np_equal(a0.numpy(), expected0)
+    assert_np_equal(a1.numpy(), expected1)
+
+
+def test_paddle_mgpu_to_paddle(test, device):
+    n = 32
+
+    with wp.ScopedDevice("gpu:0"):
+        a0 = wp.empty(n, dtype=wp.int32)
+        wp.launch(arange, dim=a0.size, inputs=[0, 1, a0])
+
+    with wp.ScopedDevice("gpu:1"):
+        a1 = wp.empty(n, dtype=wp.int32)
+        wp.launch(arange, dim=a1.size, inputs=[0, 2, a1])
+
+    t0 = wp.to_paddle(a0)
+    t1 = wp.to_paddle(a1)
+
+    assert str(t0.device) == "gpu:0"
+    assert str(t1.device) == "gpu:1"
+
+    expected0 = np.arange(0, n, 1, dtype=np.int32)
+    expected1 = np.arange(0, n * 2, 2, dtype=np.int32)
+
+    assert_np_equal(t0.cpu().numpy(), expected0)
+    assert_np_equal(t1.cpu().numpy(), expected1)
+
+
+def test_paddle_mgpu_interop(test, device):
+    import paddle
+
+    n = 1024 * 1024
+
+    with paddle.cuda.device(0):
+        t0 = paddle.arange(n, dtype=paddle.float32).to(device="gpu")
+        a0 = wp.from_paddle(t0)
+        wp.launch(inc, dim=a0.size, inputs=[a0], stream=wp.stream_from_paddle())
+
+    with paddle.cuda.device(1):
+        t1 = paddle.arange(n, dtype=paddle.float32).to(device="gpu")
+        a1 = wp.from_paddle(t1)
+        wp.launch(inc, dim=a1.size, inputs=[a1], stream=wp.stream_from_paddle())
+
+    assert a0.device == "gpu:0"
+    assert a1.device == "gpu:1"
+
+    expected = np.arange(n, dtype=int) + 1
+
+    # ensure the paddle tensors were modified by warp
+    assert_np_equal(t0.cpu().numpy(), expected)
+    assert_np_equal(t1.cpu().numpy(), expected)
+
+
+def test_paddle_autograd(test, device):
+    """Test paddle autograd with a custom Warp op"""
+
+    import paddle
+
+    # custom autograd op
+    class TestFunc(paddle.autograd.PyLayer):
+        @staticmethod
+        def forward(ctx, x):
+            # allocate output array
+            y = paddle.empty_like(x)
+
+            ctx.x = x
+            ctx.y = y
+
+            wp.launch(kernel=op_kernel, dim=len(x), inputs=[wp.from_paddle(x)], outputs=[wp.from_paddle(y)])
+
+            return y
+
+        @staticmethod
+        def backward(ctx, adj_y):
+            # adjoints should be allocated as zero initialized
+            adj_x = paddle.zeros_like(ctx.x).contiguous()
+            adj_y = adj_y.contiguous()
+
+            wp_x = wp.from_paddle(ctx.x, grad=adj_x)
+            wp_y = wp.from_paddle(ctx.y, grad=adj_y)
+
+            wp.launch(
+                kernel=op_kernel,
+                dim=len(ctx.x),
+                # fwd inputs
+                inputs=[wp_x],
+                outputs=[wp_y],
+                # adj inputs (already stored in input/output arrays, passing null pointers)
+                adj_inputs=[None],
+                adj_outputs=[None],
+                adjoint=True,
+            )
+
+            return adj_x
+
+    # run autograd on given device
+    with wp.ScopedDevice(device):
+        paddle_device = wp.device_to_paddle(device)
+
+        # input data
+        x = paddle.ones(16, dtype=paddle.float32).to(device=paddle_device)
+        x.stop_gradient = False
+
+        # execute op
+        y = TestFunc.apply(x)
+
+        # compute grads
+        l = y.sum()
+        l.backward()
+
+        passed = (x.grad == -2.0).all()
+        assert passed.item()
+
+
+def test_warp_graph_warp_stream(test, device):
+    """Capture Warp graph on Warp stream"""
+
+    import paddle
+
+    paddle_device = wp.device_to_paddle(device)
+
+    n = 1024 * 1024
+    t = paddle.zeros(n, dtype=paddle.float32).to(device=paddle_device)
+    a = wp.from_paddle(t)
+
+    # make paddle use the warp stream from the given device
+    paddle_stream = wp.stream_to_paddle(device)
+
+    # capture graph
+    with wp.ScopedDevice(device), paddle.device.stream(paddle_stream):
+        wp.capture_begin(force_module_load=False)
+        try:
+            t += 1.0
+            wp.launch(inc, dim=n, inputs=[a])
+            t += 1.0
+            wp.launch(inc, dim=n, inputs=[a])
+        finally:
+            g = wp.capture_end()
+
+    # replay graph
+    num_iters = 10
+    for _i in range(num_iters):
+        wp.capture_launch(g)
+
+    passed = (t == num_iters * 4.0).all()
+    assert passed.item()
+
+
+def test_warp_graph_paddle_stream(test, device):
+    """Capture Warp graph on Paddle stream"""
+
+    wp.load_module(device=device)
+
+    import paddle
+
+    paddle_device = wp.device_to_paddle(device)
+
+    n = 1024 * 1024
+    t = paddle.zeros(n, dtype=paddle.float32).to(device=paddle_device)
+    a = wp.from_paddle(t)
+
+    # create a device-specific paddle stream to use for capture
+    # (the default paddle stream is not suitable for graph capture)
+    paddle_stream = paddle.device.Stream(device=paddle_device)
+
+    # make warp use the same stream
+    warp_stream = wp.stream_from_paddle(paddle_stream)
+
+    # capture graph
+    with wp.ScopedStream(warp_stream):
+        wp.capture_begin(force_module_load=False)
+        try:
+            t += 1.0
+            wp.launch(inc, dim=n, inputs=[a])
+            t += 1.0
+            wp.launch(inc, dim=n, inputs=[a])
+        finally:
+            g = wp.capture_end()
+
+    # replay graph
+    num_iters = 10
+    for _i in range(num_iters):
+        wp.capture_launch(g)
+
+    passed = (t == num_iters * 4.0).all()
+    assert passed.item()
+
+
+def test_direct(test, device):
+    """Pass Paddle tensors to Warp kernels directly"""
+
+    import paddle
+
+    paddle_device = wp.device_to_paddle(device)
+    n = 12
+
+    s = paddle.arange(n, dtype=paddle.float32).to(device=paddle_device)
+    v = paddle.arange(n, dtype=paddle.float32).to(device=paddle_device).reshape((n // 3, 3))
+    m = paddle.arange(n, dtype=paddle.float32).to(device=paddle_device).reshape((n // 4, 2, 2))
+
+    wp.launch(inc, dim=n, inputs=[s], device=device)
+    wp.launch(inc_vector, dim=n // 3, inputs=[v], device=device)
+    wp.launch(inc_matrix, dim=n // 4, inputs=[m], device=device)
+
+    expected = paddle.arange(1, n + 1, dtype=paddle.float32).to(device=paddle_device)
+
+    assert paddle.equal_all(s, expected).item()
+    assert paddle.equal_all(v.reshape([n]), expected).item()
+    assert paddle.equal_all(m.reshape([n]), expected).item()
+
+
+class TestPaddle(unittest.TestCase):
+    pass
+
+
+test_devices = get_test_devices()
+
+try:
+    import paddle
+
+    # check which Warp devices work with Paddle
+    # CUDA devices may fail if Paddle was not compiled with CUDA support
+    paddle_compatible_devices = []
+    paddle_compatible_cuda_devices = []
+
+    for d in test_devices:
+        try:
+            t = paddle.arange(10).to(device=wp.device_to_paddle(d))
+            t += 1
+            paddle_compatible_devices.append(d)
+            if d.is_cuda:
+                paddle_compatible_cuda_devices.append(d)
+        except Exception as e:
+            print(f"Skipping Paddle tests on device '{d}' due to exception: {e}")
+
+    add_function_test(TestPaddle, "test_dtype_from_paddle", test_dtype_from_paddle, devices=None)
+    add_function_test(TestPaddle, "test_dtype_to_paddle", test_dtype_to_paddle, devices=None)
+
+    if paddle_compatible_devices:
+        add_function_test(
+            TestPaddle, "test_device_conversion", test_device_conversion, devices=paddle_compatible_devices
+        )
+        add_function_test(TestPaddle, "test_from_paddle", test_from_paddle, devices=paddle_compatible_devices)
+        add_function_test(
+            TestPaddle, "test_from_paddle_slices", test_from_paddle_slices, devices=paddle_compatible_devices
+        )
+        add_function_test(
+            TestPaddle, "test_array_ctype_from_paddle", test_array_ctype_from_paddle, devices=paddle_compatible_devices
+        )
+        add_function_test(
+            TestPaddle,
+            "test_from_paddle_zero_strides",
+            test_from_paddle_zero_strides,
+            devices=paddle_compatible_devices,
+        )
+        add_function_test(TestPaddle, "test_to_paddle", test_to_paddle, devices=paddle_compatible_devices)
+        add_function_test(TestPaddle, "test_paddle_zerocopy", test_paddle_zerocopy, devices=paddle_compatible_devices)
+        add_function_test(TestPaddle, "test_paddle_autograd", test_paddle_autograd, devices=paddle_compatible_devices)
+        add_function_test(TestPaddle, "test_direct", test_direct, devices=paddle_compatible_devices)
+
+    # NOTE: Graph not supported now
+    # if paddle_compatible_cuda_devices:
+    #     add_function_test(
+    #         TestPaddle,
+    #         "test_warp_graph_warp_stream",
+    #         test_warp_graph_warp_stream,
+    #         devices=paddle_compatible_cuda_devices,
+    #     )
+    #     add_function_test(
+    #         TestPaddle,
+    #         "test_warp_graph_paddle_stream",
+    #         test_warp_graph_paddle_stream,
+    #         devices=paddle_compatible_cuda_devices,
+    #     )
+
+    # multi-GPU tests
+    if len(paddle_compatible_cuda_devices) > 1:
+        add_function_test(TestPaddle, "test_paddle_mgpu_from_paddle", test_paddle_mgpu_from_paddle)
+        add_function_test(TestPaddle, "test_paddle_mgpu_to_paddle", test_paddle_mgpu_to_paddle)
+        add_function_test(TestPaddle, "test_paddle_mgpu_interop", test_paddle_mgpu_interop)
+
+except Exception as e:
+    print(f"Skipping Paddle tests due to exception: {e}")
+
+
+if __name__ == "__main__":
+    wp.clear_kernel_cache()
+    unittest.main(verbosity=2)
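
The autograd test above wires Paddle adjoint buffers into Warp through the grad argument of wp.from_paddle and then relaunches the kernel with adjoint=True. A condensed sketch of that pattern outside any PyLayer follows; it is illustrative only (the kernel `double` and the buffer names are not part of the package), and it assumes the same launch arguments the test uses:

    import paddle
    import warp as wp

    @wp.kernel
    def double(x: wp.array(dtype=float), y: wp.array(dtype=float)):
        tid = wp.tid()
        y[tid] = 2.0 * x[tid]

    device = wp.get_device()
    paddle_device = wp.device_to_paddle(device)

    x = paddle.ones([16], dtype=paddle.float32).to(device=paddle_device)
    y = paddle.zeros([16], dtype=paddle.float32).to(device=paddle_device)

    # adjoint buffers: zero-initialized input adjoint, seed dL/dy = 1
    adj_x = paddle.zeros_like(x).contiguous()
    adj_y = paddle.ones_like(y).contiguous()

    wp_x = wp.from_paddle(x, grad=adj_x)
    wp_y = wp.from_paddle(y, grad=adj_y)

    # forward launch, then adjoint launch accumulating into adj_x
    wp.launch(double, dim=16, inputs=[wp_x], outputs=[wp_y])
    wp.launch(double, dim=16, inputs=[wp_x], outputs=[wp_y],
              adj_inputs=[None], adj_outputs=[None], adjoint=True)
    # adj_x should now hold dL/dx = 2.0 per element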