warp-lang 1.8.0-py3-none-manylinux_2_34_aarch64.whl → 1.8.1-py3-none-manylinux_2_34_aarch64.whl

This diff shows the changes between two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.


Files changed (59)
  1. warp/bin/warp-clang.so +0 -0
  2. warp/bin/warp.so +0 -0
  3. warp/build_dll.py +5 -0
  4. warp/codegen.py +15 -3
  5. warp/config.py +1 -1
  6. warp/context.py +122 -24
  7. warp/examples/interop/example_jax_callable.py +34 -4
  8. warp/examples/interop/example_jax_kernel.py +27 -1
  9. warp/fem/field/virtual.py +2 -0
  10. warp/fem/integrate.py +78 -47
  11. warp/jax_experimental/ffi.py +201 -53
  12. warp/native/array.h +4 -4
  13. warp/native/builtin.h +8 -4
  14. warp/native/coloring.cpp +5 -1
  15. warp/native/cuda_util.cpp +1 -1
  16. warp/native/intersect.h +2 -2
  17. warp/native/mat.h +3 -3
  18. warp/native/mesh.h +1 -1
  19. warp/native/quat.h +6 -2
  20. warp/native/rand.h +7 -7
  21. warp/native/sparse.cu +1 -1
  22. warp/native/svd.h +23 -8
  23. warp/native/tile.h +20 -1
  24. warp/native/tile_radix_sort.h +5 -1
  25. warp/native/tile_reduce.h +16 -25
  26. warp/native/tuple.h +2 -2
  27. warp/native/vec.h +4 -4
  28. warp/native/warp.cpp +1 -1
  29. warp/native/warp.cu +15 -2
  30. warp/native/warp.h +1 -1
  31. warp/render/render_opengl.py +52 -51
  32. warp/render/render_usd.py +0 -1
  33. warp/sim/collide.py +1 -2
  34. warp/sim/integrator_vbd.py +10 -2
  35. warp/sparse.py +1 -1
  36. warp/tape.py +2 -0
  37. warp/tests/sim/test_cloth.py +89 -6
  38. warp/tests/sim/test_coloring.py +76 -1
  39. warp/tests/test_assert.py +53 -0
  40. warp/tests/test_atomic_cas.py +127 -114
  41. warp/tests/test_mat.py +22 -0
  42. warp/tests/test_quat.py +22 -0
  43. warp/tests/test_sparse.py +32 -0
  44. warp/tests/test_static.py +48 -0
  45. warp/tests/test_tape.py +38 -0
  46. warp/tests/test_vec.py +38 -408
  47. warp/tests/test_vec_constructors.py +325 -0
  48. warp/tests/tile/test_tile.py +31 -143
  49. warp/tests/tile/test_tile_mathdx.py +2 -2
  50. warp/tests/tile/test_tile_matmul.py +179 -0
  51. warp/tests/tile/test_tile_reduce.py +100 -11
  52. warp/tests/tile/test_tile_shared_memory.py +12 -12
  53. warp/tests/tile/test_tile_sort.py +59 -55
  54. warp/tests/unittest_suites.py +10 -0
  55. {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/METADATA +4 -4
  56. {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/RECORD +59 -57
  57. {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/WHEEL +0 -0
  58. {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/licenses/LICENSE.md +0 -0
  59. {warp_lang-1.8.0.dist-info → warp_lang-1.8.1.dist-info}/top_level.txt +0 -0
warp/tests/test_vec_constructors.py (new file):

@@ -0,0 +1,325 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import unittest
+
+ import numpy as np
+
+ import warp as wp
+ from warp.tests.unittest_utils import *
+
+ np_float_types = [np.float16, np.float32, np.float64]
+
+ kernel_cache = {}
+
+
+ def getkernel(func, suffix=""):
+     key = func.__name__ + "_" + suffix
+     if key not in kernel_cache:
+         kernel_cache[key] = wp.Kernel(func=func, key=key)
+     return kernel_cache[key]
+
+
+ def test_anon_constructor_error_length_mismatch(test, device):
+     @wp.kernel
+     def kernel():
+         wp.vector(wp.vector(length=2, dtype=float), length=3, dtype=float)
+
+     with test.assertRaisesRegex(
+         RuntimeError,
+         r"incompatible vector of length 3 given when copy constructing a vector of length 2$",
+     ):
+         wp.launch(kernel, dim=1, inputs=[], device=device)
+
+
+ def test_anon_constructor_error_numeric_arg_missing(test, device):
+     @wp.kernel
+     def kernel():
+         wp.vector(1.0, 2.0, length=12345)
+
+     with test.assertRaisesRegex(
+         RuntimeError,
+         r"incompatible number of values given \(2\) when constructing a vector of length 12345$",
+     ):
+         wp.launch(kernel, dim=1, inputs=[], device=device)
+
+
+ def test_anon_constructor_error_length_arg_missing(test, device):
+     @wp.kernel
+     def kernel():
+         wp.vector()
+
+     with test.assertRaisesRegex(
+         RuntimeError,
+         r"the `length` argument must be specified when zero-initializing a vector$",
+     ):
+         wp.launch(kernel, dim=1, inputs=[], device=device)
+
+
+ def test_anon_constructor_error_numeric_args_mismatch(test, device):
+     @wp.kernel
+     def kernel():
+         wp.vector(1.0, 2)
+
+     with test.assertRaisesRegex(
+         RuntimeError,
+         r"all values given when constructing a vector must have the same type$",
+     ):
+         wp.launch(kernel, dim=1, inputs=[], device=device)
+
+
+ def test_tpl_constructor_error_incompatible_sizes(test, device):
+     @wp.kernel
+     def kernel():
+         wp.vec3(wp.vec2(1.0, 2.0))
+
+     with test.assertRaisesRegex(
+         RuntimeError, "incompatible vector of length 3 given when copy constructing a vector of length 2"
+     ):
+         wp.launch(kernel, dim=1, inputs=[], device=device)
+
+
+ def test_tpl_constructor_error_numeric_args_mismatch(test, device):
+     @wp.kernel
+     def kernel():
+         wp.vec2(1.0, 2)
+
+     with test.assertRaisesRegex(
+         RuntimeError,
+         r"all values given when constructing a vector must have the same type$",
+     ):
+         wp.launch(kernel, dim=1, inputs=[], device=device)
+
+
+ def test_casting_constructors(test, device, dtype, register_kernels=False):
+     np_type = np.dtype(dtype)
+     wp_type = wp.types.np_dtype_to_warp_type[np_type]
+     vec3 = wp.types.vector(length=3, dtype=wp_type)
+
+     np16 = np.dtype(np.float16)
+     wp16 = wp.types.np_dtype_to_warp_type[np16]
+
+     np32 = np.dtype(np.float32)
+     wp32 = wp.types.np_dtype_to_warp_type[np32]
+
+     np64 = np.dtype(np.float64)
+     wp64 = wp.types.np_dtype_to_warp_type[np64]
+
+     def cast_float16(a: wp.array(dtype=wp_type, ndim=2), b: wp.array(dtype=wp16, ndim=2)):
+         tid = wp.tid()
+
+         v1 = vec3(a[tid, 0], a[tid, 1], a[tid, 2])
+         v2 = wp.vector(v1, dtype=wp16)
+
+         b[tid, 0] = v2[0]
+         b[tid, 1] = v2[1]
+         b[tid, 2] = v2[2]
+
+     def cast_float32(a: wp.array(dtype=wp_type, ndim=2), b: wp.array(dtype=wp32, ndim=2)):
+         tid = wp.tid()
+
+         v1 = vec3(a[tid, 0], a[tid, 1], a[tid, 2])
+         v2 = wp.vector(v1, dtype=wp32)
+
+         b[tid, 0] = v2[0]
+         b[tid, 1] = v2[1]
+         b[tid, 2] = v2[2]
+
+     def cast_float64(a: wp.array(dtype=wp_type, ndim=2), b: wp.array(dtype=wp64, ndim=2)):
+         tid = wp.tid()
+
+         v1 = vec3(a[tid, 0], a[tid, 1], a[tid, 2])
+         v2 = wp.vector(v1, dtype=wp64)
+
+         b[tid, 0] = v2[0]
+         b[tid, 1] = v2[1]
+         b[tid, 2] = v2[2]
+
+     kernel_16 = getkernel(cast_float16, suffix=dtype.__name__)
+     kernel_32 = getkernel(cast_float32, suffix=dtype.__name__)
+     kernel_64 = getkernel(cast_float64, suffix=dtype.__name__)
+
+     if register_kernels:
+         return
+
+     # check casting to float16
+     a = wp.array(np.ones((1, 3), dtype=np_type), dtype=wp_type, requires_grad=True, device=device)
+     b = wp.array(np.zeros((1, 3), dtype=np16), dtype=wp16, requires_grad=True, device=device)
+     b_result = np.ones((1, 3), dtype=np16)
+     b_grad = wp.array(np.ones((1, 3), dtype=np16), dtype=wp16, device=device)
+     a_grad = wp.array(np.ones((1, 3), dtype=np_type), dtype=wp_type, device=device)
+
+     tape = wp.Tape()
+     with tape:
+         wp.launch(kernel=kernel_16, dim=1, inputs=[a, b], device=device)
+
+     tape.backward(grads={b: b_grad})
+     out = tape.gradients[a].numpy()
+
+     assert_np_equal(b.numpy(), b_result)
+     assert_np_equal(out, a_grad.numpy())
+
+     # check casting to float32
+     a = wp.array(np.ones((1, 3), dtype=np_type), dtype=wp_type, requires_grad=True, device=device)
+     b = wp.array(np.zeros((1, 3), dtype=np32), dtype=wp32, requires_grad=True, device=device)
+     b_result = np.ones((1, 3), dtype=np32)
+     b_grad = wp.array(np.ones((1, 3), dtype=np32), dtype=wp32, device=device)
+     a_grad = wp.array(np.ones((1, 3), dtype=np_type), dtype=wp_type, device=device)
+
+     tape = wp.Tape()
+     with tape:
+         wp.launch(kernel=kernel_32, dim=1, inputs=[a, b], device=device)
+
+     tape.backward(grads={b: b_grad})
+     out = tape.gradients[a].numpy()
+
+     assert_np_equal(b.numpy(), b_result)
+     assert_np_equal(out, a_grad.numpy())
+
+     # check casting to float64
+     a = wp.array(np.ones((1, 3), dtype=np_type), dtype=wp_type, requires_grad=True, device=device)
+     b = wp.array(np.zeros((1, 3), dtype=np64), dtype=wp64, requires_grad=True, device=device)
+     b_result = np.ones((1, 3), dtype=np64)
+     b_grad = wp.array(np.ones((1, 3), dtype=np64), dtype=wp64, device=device)
+     a_grad = wp.array(np.ones((1, 3), dtype=np_type), dtype=wp_type, device=device)
+
+     tape = wp.Tape()
+     with tape:
+         wp.launch(kernel=kernel_64, dim=1, inputs=[a, b], device=device)
+
+     tape.backward(grads={b: b_grad})
+     out = tape.gradients[a].numpy()
+
+     assert_np_equal(b.numpy(), b_result)
+     assert_np_equal(out, a_grad.numpy())
+
+
+ @wp.kernel
+ def test_vector_constructors_value_func():
+     a = wp.vec2()
+     b = wp.vector(a, dtype=wp.float16)
+     c = wp.vector(a)
+     d = wp.vector(a, length=2)
+     e = wp.vector(1.0, 2.0, 3.0, dtype=float)
+
+
+ # Test vector constructors using an explicit type (float16).
+ # Note that these tests deliberately do not use generics / closure
+ # args to create kernels dynamically (unlike the rest of this file),
+ # as those use different code paths to resolve arg types, which
+ # has led to regressions.
+ @wp.kernel
+ def test_vector_constructors_explicit_precision():
+     # construction for custom vector types
+     ones = wp.vector(wp.float16(1.0), length=2)
+     zeros = wp.vector(length=2, dtype=wp.float16)
+     custom = wp.vector(wp.float16(0.0), wp.float16(1.0))
+
+     for i in range(2):
+         wp.expect_eq(ones[i], wp.float16(1.0))
+         wp.expect_eq(zeros[i], wp.float16(0.0))
+         wp.expect_eq(custom[i], wp.float16(i))
+
+
+ # Same as above but with a default (float/int) type,
+ # which tests some different code paths that
+ # need to ensure types are correctly canonicalized
+ # during codegen.
+ @wp.kernel
+ def test_vector_constructors_default_precision():
+     # construction for custom vector types
+     ones = wp.vector(1.0, length=2)
+     zeros = wp.vector(length=2, dtype=float)
+     custom = wp.vector(0.0, 1.0)
+
+     for i in range(2):
+         wp.expect_eq(ones[i], 1.0)
+         wp.expect_eq(zeros[i], 0.0)
+         wp.expect_eq(custom[i], float(i))
+
+
+ CONSTANT_LENGTH = wp.constant(10)
+
+
+ # tests that we can use global constants in the `length` keyword argument
+ # of the vector constructor
+ @wp.kernel
+ def test_vector_constructors_constant_length():
+     v = wp.vector(length=(CONSTANT_LENGTH), dtype=float)
+
+     for i in range(CONSTANT_LENGTH):
+         v[i] = float(i)
+
+
+ devices = get_test_devices()
+
+
+ class TestVecConstructors(unittest.TestCase):
+     pass
+
+
+ add_function_test(
+     TestVecConstructors,
+     "test_anon_constructor_error_length_mismatch",
+     test_anon_constructor_error_length_mismatch,
+     devices=devices,
+ )
+ add_function_test(
+     TestVecConstructors,
+     "test_anon_constructor_error_numeric_arg_missing",
+     test_anon_constructor_error_numeric_arg_missing,
+     devices=devices,
+ )
+ add_function_test(
+     TestVecConstructors,
+     "test_anon_constructor_error_length_arg_missing",
+     test_anon_constructor_error_length_arg_missing,
+     devices=devices,
+ )
+ add_function_test(
+     TestVecConstructors,
+     "test_anon_constructor_error_numeric_args_mismatch",
+     test_anon_constructor_error_numeric_args_mismatch,
+     devices=devices,
+ )
+ add_function_test(
+     TestVecConstructors,
+     "test_tpl_constructor_error_incompatible_sizes",
+     test_tpl_constructor_error_incompatible_sizes,
+     devices=devices,
+ )
+ add_function_test(
+     TestVecConstructors,
+     "test_tpl_constructor_error_numeric_args_mismatch",
+     test_tpl_constructor_error_numeric_args_mismatch,
+     devices=devices,
+ )
+ add_kernel_test(TestVecConstructors, test_vector_constructors_value_func, dim=1, devices=devices)
+ add_kernel_test(TestVecConstructors, test_vector_constructors_explicit_precision, dim=1, devices=devices)
+ add_kernel_test(TestVecConstructors, test_vector_constructors_default_precision, dim=1, devices=devices)
+ add_kernel_test(TestVecConstructors, test_vector_constructors_constant_length, dim=1, devices=devices)
+
+ for dtype in np_float_types:
+     add_function_test_register_kernel(
+         TestVecConstructors,
+         f"test_casting_constructors_{dtype.__name__}",
+         test_casting_constructors,
+         devices=devices,
+         dtype=dtype,
+     )
+
+ if __name__ == "__main__":
+     wp.clear_kernel_cache()
+     unittest.main(verbosity=2, failfast=True)
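
For readers skimming this diff: the new file above (warp/tests/test_vec_constructors.py, the only wholly added file in the list, +325 -0) pins down the exact error messages Warp raises for malformed vector-constructor calls. A minimal sketch of the user-facing behavior these tests lock in, using only the wp.vector / wp.launch API exercised above (the kernel name is illustrative):

import warp as wp

@wp.kernel
def bad_vec_kernel():
    # copy-constructing a length-2 vector into a length-3 vector is rejected
    wp.vector(wp.vector(length=2, dtype=float), length=3, dtype=float)

try:
    # per the tests above, the error surfaces as a RuntimeError when the
    # kernel is compiled at launch time
    wp.launch(bad_vec_kernel, dim=1, inputs=[])
except RuntimeError as exc:
    print(exc)  # "incompatible vector of length 3 given when copy constructing a vector of length 2"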
warp/tests/tile/test_tile.py:

@@ -215,103 +215,6 @@ def test_tile_binary_map(test, device):
      assert_np_equal(B_wp.grad.numpy(), B_grad)


- def test_tile_grouped_gemm(test, device):
-     @wp.kernel
-     def tile_grouped_gemm(A: wp.array3d(dtype=float), B: wp.array3d(dtype=float), C: wp.array3d(dtype=float)):
-         # output tile index
-         i = wp.tid()
-
-         a = wp.tile_load(A[i], shape=(TILE_M, TILE_K))
-         b = wp.tile_load(B[i], shape=(TILE_K, TILE_N))
-
-         sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float32)
-
-         wp.tile_matmul(a, b, sum)
-
-         wp.tile_store(C[i], sum)
-
-     batch_count = 56
-
-     M = TILE_M
-     N = TILE_N
-     K = TILE_K
-
-     rng = np.random.default_rng(42)
-     A = rng.random((batch_count, M, K), dtype=np.float32)
-     B = rng.random((batch_count, K, N), dtype=np.float32)
-     C = A @ B
-
-     A_wp = wp.array(A, requires_grad=True, device=device)
-     B_wp = wp.array(B, requires_grad=True, device=device)
-     C_wp = wp.zeros((batch_count, TILE_M, TILE_N), requires_grad=True, device=device)
-
-     with wp.Tape() as tape:
-         wp.launch_tiled(
-             tile_grouped_gemm, dim=[batch_count], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM, device=device
-         )
-
-     # TODO: 32 mismatched elements
-     assert_np_equal(C_wp.numpy(), C, 1e-6)
-
-
- def test_tile_gemm(dtype):
-     def test(test, device):
-         @wp.kernel
-         def tile_gemm(A: wp.array2d(dtype=dtype), B: wp.array2d(dtype=dtype), C: wp.array2d(dtype=dtype)):
-             # output tile index
-             i, j = wp.tid()
-
-             sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=dtype)
-
-             M = A.shape[0]
-             N = B.shape[1]
-             K = A.shape[1]
-
-             count = int(K / TILE_K)
-
-             for k in range(0, count):
-                 a = wp.tile_load(A, shape=(TILE_M, TILE_K), offset=(i * TILE_M, k * TILE_K))
-                 b = wp.tile_load(B, shape=(TILE_K, TILE_N), offset=(k * TILE_K, j * TILE_N))
-
-                 # sum += a*b
-                 wp.tile_matmul(a, b, sum)
-
-             wp.tile_store(C, sum, offset=(i * TILE_M, j * TILE_N))
-
-         M = TILE_M * 7
-         K = TILE_K * 6
-         N = TILE_N * 5
-
-         rng = np.random.default_rng(42)
-         A = rng.random((M, K), dtype=float).astype(wp.dtype_to_numpy(dtype))
-         B = rng.random((K, N), dtype=float).astype(wp.dtype_to_numpy(dtype))
-         C = np.zeros((M, N), dtype=float).astype(wp.dtype_to_numpy(dtype))
-
-         A_wp = wp.array(A, requires_grad=True, device=device)
-         B_wp = wp.array(B, requires_grad=True, device=device)
-         C_wp = wp.array(C, requires_grad=True, device=device)
-
-         with wp.Tape() as tape:
-             wp.launch_tiled(
-                 tile_gemm,
-                 dim=(int(M / TILE_M), int(N / TILE_N)),
-                 inputs=[A_wp, B_wp, C_wp],
-                 block_dim=TILE_DIM,
-                 device=device,
-             )
-
-         assert_np_equal(C_wp.numpy(), A @ B, tol=1.0e-1)
-
-         adj_C = np.ones_like(C)
-
-         tape.backward(grads={C_wp: wp.array(adj_C, device=device)})
-
-         assert_np_equal(A_wp.grad.numpy(), adj_C @ B.T, tol=1.0e-1)
-         assert_np_equal(B_wp.grad.numpy(), A.T @ adj_C, 1.0e-1)
-
-     return test
-
-
  @wp.kernel
  def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=float)):
      # output tile index
@@ -368,6 +271,12 @@ def test_tile_tile_preserve_type_kernel(x: wp.array(dtype=Any), y: wp.array(dtype=Any)):
      wp.tile_store(y, t)


+ wp.overload(test_tile_tile_preserve_type_kernel, {"x": wp.array(dtype=float), "y": wp.array(dtype=float)})
+ wp.overload(test_tile_tile_preserve_type_kernel, {"x": wp.array(dtype=wp.vec3), "y": wp.array(dtype=wp.vec3)})
+ wp.overload(test_tile_tile_preserve_type_kernel, {"x": wp.array(dtype=wp.quat), "y": wp.array(dtype=wp.quat)})
+ wp.overload(test_tile_tile_preserve_type_kernel, {"x": wp.array(dtype=wp.mat33), "y": wp.array(dtype=wp.mat33)})
+
+
  @wp.kernel
  def test_tile_tile_scalar_expansion_kernel(x: wp.array(dtype=float), y: wp.array(dtype=float)):
      a = x[0]
@@ -494,6 +403,12 @@ def test_tile_untile_preserve_type_kernel(x: wp.array(dtype=Any), y: wp.array(dtype=Any)):
      y[i] = b


+ wp.overload(test_tile_untile_preserve_type_kernel, {"x": wp.array(dtype=float), "y": wp.array(dtype=float)})
+ wp.overload(test_tile_untile_preserve_type_kernel, {"x": wp.array(dtype=wp.vec3), "y": wp.array(dtype=wp.vec3)})
+ wp.overload(test_tile_untile_preserve_type_kernel, {"x": wp.array(dtype=wp.quat), "y": wp.array(dtype=wp.quat)})
+ wp.overload(test_tile_untile_preserve_type_kernel, {"x": wp.array(dtype=wp.mat33), "y": wp.array(dtype=wp.mat33)})
+
+
  @wp.kernel
  def test_tile_untile_kernel(x: wp.array(dtype=Any), y: wp.array(dtype=Any)):
      i = wp.tid()
@@ -503,6 +418,11 @@ def test_tile_untile_kernel(x: wp.array(dtype=Any), y: wp.array(dtype=Any)):
      y[i] = b


+ wp.overload(test_tile_untile_kernel, {"x": wp.array(dtype=float), "y": wp.array(dtype=float)})
+ wp.overload(test_tile_untile_kernel, {"x": wp.array(dtype=wp.vec3), "y": wp.array(dtype=wp.vec3)})
+ wp.overload(test_tile_untile_kernel, {"x": wp.array(dtype=wp.mat33), "y": wp.array(dtype=wp.mat33)})
+
+
  def test_tile_untile(test, device):
      def test_func_preserve_type(type: Any):
          x = wp.ones(TILE_DIM, dtype=type, requires_grad=True, device=device)
@@ -644,7 +564,7 @@ def test_tile_sum_launch(test, device):
      assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5)


- @wp.kernel
+ @wp.kernel(module="unique")
  def test_tile_extract_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
      i, j, x, y = wp.tid()

@@ -680,7 +600,7 @@ def test_tile_extract(test, device):
      assert_np_equal(a.grad.numpy(), expected_grad)


- @wp.kernel
+ @wp.kernel(module="unique")
  def test_tile_extract_repeated_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
      i, j, x, y = wp.tid()

@@ -744,7 +664,7 @@ def test_tile_assign(test, device):

      tape = wp.Tape()
      with tape:
-         wp.launch(test_tile_assign_kernel, dim=[1, TILE_M], inputs=[x], outputs=[y], block_dim=64, device=device)
+         wp.launch(test_tile_assign_kernel, dim=[1, TILE_M], inputs=[x], outputs=[y], block_dim=TILE_DIM, device=device)

      y.grad = wp.ones_like(y)
      tape.backward()
@@ -766,31 +686,11 @@ def test_tile_transpose(test, device):
      input = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32), device=device)
      output = wp.zeros_like(input.transpose(), device=device)

-     wp.launch_tiled(test_tile_transpose_kernel, dim=[1], inputs=[input, output], block_dim=32, device=device)
+     wp.launch_tiled(test_tile_transpose_kernel, dim=[1], inputs=[input, output], block_dim=TILE_DIM, device=device)

      assert_np_equal(output.numpy(), input.numpy().T)


- def test_tile_transpose_matmul(test, device):
-     @wp.kernel
-     def test_tile_transpose_matmul_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
-         x = wp.tile_load(input, shape=(TILE_M, TILE_N))
-         y = wp.tile_transpose(x)
-
-         z = wp.tile_zeros(dtype=float, shape=(TILE_N, TILE_N))
-         wp.tile_matmul(y, x, z)
-
-         wp.tile_store(output, z)
-
-     rng = np.random.default_rng(42)
-     input = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32), device=device)
-     output = wp.zeros((TILE_N, TILE_N), dtype=float, device=device)
-
-     wp.launch_tiled(test_tile_transpose_matmul_kernel, dim=[1], inputs=[input, output], block_dim=32, device=device)
-
-     assert_np_equal(output.numpy(), input.numpy().T @ input.numpy())
-
-
  @wp.kernel
  def test_tile_broadcast_add_1d_kernel(
      input_a: wp.array(dtype=float), input_b: wp.array(dtype=float), output: wp.array(dtype=float)
@@ -812,7 +712,7 @@ def test_tile_broadcast_add_1d(test, device):
      b = wp.array(np.ones(1, dtype=np.float32), device=device)
      out = wp.zeros((N,), dtype=float, device=device)

-     wp.launch_tiled(test_tile_broadcast_add_1d_kernel, dim=[1], inputs=[a, b, out], block_dim=32, device=device)
+     wp.launch_tiled(test_tile_broadcast_add_1d_kernel, dim=[1], inputs=[a, b, out], block_dim=TILE_DIM, device=device)

      assert_np_equal(out.numpy(), a.numpy() + b.numpy())

@@ -839,7 +739,7 @@ def test_tile_broadcast_add_2d(test, device):
      b = wp.array(np.arange(0, N, dtype=np.float32), device=device)
      out = wp.zeros((M, N), dtype=float, device=device)

-     wp.launch_tiled(test_tile_broadcast_add_2d_kernel, dim=[1], inputs=[a, b, out], block_dim=32, device=device)
+     wp.launch_tiled(test_tile_broadcast_add_2d_kernel, dim=[1], inputs=[a, b, out], block_dim=TILE_DIM, device=device)

      assert_np_equal(out.numpy(), a.numpy() + b.numpy())

@@ -867,7 +767,7 @@ def test_tile_broadcast_add_3d(test, device):
      b = wp.array(np.arange(0, M * N, dtype=np.float32).reshape((M, N, 1)), device=device)
      out = wp.zeros((M, N, O), dtype=float, device=device)

-     wp.launch_tiled(test_tile_broadcast_add_3d_kernel, dim=[1], inputs=[a, b, out], block_dim=32, device=device)
+     wp.launch_tiled(test_tile_broadcast_add_3d_kernel, dim=[1], inputs=[a, b, out], block_dim=TILE_DIM, device=device)
      assert_np_equal(out.numpy(), a.numpy() + b.numpy())


@@ -894,7 +794,7 @@ def test_tile_broadcast_add_4d(test, device):
      b = wp.array(np.arange(0, M * O, dtype=np.float32).reshape((M, 1, O, 1)), device=device)
      out = wp.zeros((M, N, O, P), dtype=float, device=device)

-     wp.launch_tiled(test_tile_broadcast_add_4d_kernel, dim=[1], inputs=[a, b, out], block_dim=32, device=device)
+     wp.launch_tiled(test_tile_broadcast_add_4d_kernel, dim=[1], inputs=[a, b, out], block_dim=TILE_DIM, device=device)

      assert_np_equal(out.numpy(), a.numpy() + b.numpy())

@@ -915,7 +815,7 @@ def test_tile_broadcast_grad(test, device):
      b = wp.array(np.ones((5, 5), dtype=np.float32), requires_grad=True, device=device)

      with wp.Tape() as tape:
-         wp.launch_tiled(test_tile_broadcast_grad_kernel, dim=[1], inputs=[a, b], block_dim=32, device=device)
+         wp.launch_tiled(test_tile_broadcast_grad_kernel, dim=[1], inputs=[a, b], block_dim=TILE_DIM, device=device)

      b.grad = wp.ones_like(b, device=device)
      tape.backward()
@@ -1049,14 +949,7 @@ def tile_len_kernel(
  def test_tile_len(test, device):
      a = wp.zeros((TILE_M, TILE_N), dtype=float, device=device)
      out = wp.empty(1, dtype=int, device=device)
-     wp.launch_tiled(
-         tile_len_kernel,
-         dim=(1,),
-         inputs=(a,),
-         outputs=(out,),
-         block_dim=32,
-         device=device,
-     )
+     wp.launch_tiled(tile_len_kernel, dim=(1,), inputs=(a,), outputs=(out,), block_dim=TILE_DIM, device=device)

      test.assertEqual(out.numpy()[0], TILE_M)

@@ -1193,12 +1086,7 @@ add_function_test(TestTile, "test_tile_copy_1d", test_tile_copy_1d, devices=devices)
  add_function_test(TestTile, "test_tile_copy_2d", test_tile_copy_2d, devices=devices)
  add_function_test(TestTile, "test_tile_unary_map", test_tile_unary_map, devices=devices)
  add_function_test(TestTile, "test_tile_binary_map", test_tile_binary_map, devices=devices)
- add_function_test(TestTile, "test_tile_grouped_gemm", test_tile_grouped_gemm, devices=devices)
- add_function_test(TestTile, "test_tile_gemm_fp16", test_tile_gemm(wp.float16), devices=devices)
- add_function_test(TestTile, "test_tile_gemm_fp32", test_tile_gemm(wp.float32), devices=devices)
- add_function_test(TestTile, "test_tile_gemm_fp64", test_tile_gemm(wp.float64), devices=devices)
  add_function_test(TestTile, "test_tile_transpose", test_tile_transpose, devices=devices)
- add_function_test(TestTile, "test_tile_transpose_matmul", test_tile_transpose_matmul, devices=devices)
  add_function_test(TestTile, "test_tile_operators", test_tile_operators, devices=devices)
  add_function_test(TestTile, "test_tile_tile", test_tile_tile, devices=get_cuda_test_devices())
  add_function_test(TestTile, "test_tile_untile", test_tile_untile, devices=devices)
@@ -1215,10 +1103,10 @@ add_function_test(TestTile, "test_tile_broadcast_grad", test_tile_broadcast_grad, devices=devices)
  add_function_test(TestTile, "test_tile_squeeze", test_tile_squeeze, devices=devices)
  add_function_test(TestTile, "test_tile_reshape", test_tile_reshape, devices=devices)
  add_function_test(TestTile, "test_tile_len", test_tile_len, devices=devices)
- add_function_test(TestTile, "test_tile_print", test_tile_print, devices=devices, check_output=False)
- add_function_test(TestTile, "test_tile_inplace", test_tile_inplace, devices=devices)
- add_function_test(TestTile, "test_tile_astype", test_tile_astype, devices=devices)
- add_function_test(TestTile, "test_tile_func_return", test_tile_func_return, devices=devices)
+ # add_function_test(TestTile, "test_tile_print", test_tile_print, devices=devices, check_output=False)
+ # add_function_test(TestTile, "test_tile_inplace", test_tile_inplace, devices=devices)
+ # add_function_test(TestTile, "test_tile_astype", test_tile_astype, devices=devices)
+ # add_function_test(TestTile, "test_tile_func_return", test_tile_func_return, devices=devices)


  if __name__ == "__main__":
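
One pattern worth noting before the next file: several test kernels above, and both kernels in the warp/tests/tile/test_tile_mathdx.py hunks below, switch from @wp.kernel to @wp.kernel(module="unique"). As I understand Warp's module system, this gives the kernel its own dedicated module so it compiles in isolation rather than as part of the defining Python module; a minimal sketch of the decorator usage (the kernel body here is illustrative, not taken from this diff):

import warp as wp

# module="unique" places the kernel in its own module, isolating its
# compilation from other kernels defined in the same .py file
@wp.kernel(module="unique")
def scale_kernel(x: wp.array(dtype=float), s: float):
    i = wp.tid()
    x[i] = x[i] * s

data = wp.array([1.0, 2.0, 3.0], dtype=float)
wp.launch(scale_kernel, dim=data.shape[0], inputs=[data, 2.0])
print(data.numpy())  # [2. 4. 6.]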
warp/tests/tile/test_tile_mathdx.py:

@@ -450,7 +450,7 @@ def test_tile_math_back_substitution_multiple_rhs(test, device):
  def test_tile_math_block_cholesky(test, device):
      BLOCK_SIZE = wp.constant(TILE_M // 2)

-     @wp.kernel
+     @wp.kernel(module="unique")
      def block_cholesky_kernel(
          A: wp.array2d(dtype=float),
          L: wp.array2d(dtype=float),
@@ -496,7 +496,7 @@ def test_tile_math_block_cholesky(test, device):

          wp.tile_store(L, sol_tile, offset=(i, k))

-     @wp.kernel
+     @wp.kernel(module="unique")
      def block_cholesky_solve_kernel(
          L: wp.array2d(dtype=float),
          b: wp.array2d(dtype=float),