PyPI - warp-lang - Versions diffs - 1.8.0__py3-none-manylinux_2_34_aarch64.whl → 1.9.0__py3-none-manylinux_2_34_aarch64.whl - Mend

warp-lang 1.8.0__py3-none-manylinux_2_34_aarch64.whl → 1.9.0__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (153) hide show

warp/__init__.py +282 -103
warp/__init__.pyi +482 -110
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +93 -30
warp/build_dll.py +48 -63
warp/builtins.py +955 -137
warp/codegen.py +327 -209
warp/config.py +1 -1
warp/context.py +1363 -800
warp/examples/core/example_marching_cubes.py +1 -0
warp/examples/core/example_render_opengl.py +100 -3
warp/examples/fem/example_apic_fluid.py +98 -52
warp/examples/fem/example_convection_diffusion_dg.py +25 -4
warp/examples/fem/example_diffusion_mgpu.py +8 -3
warp/examples/fem/utils.py +68 -22
warp/examples/interop/example_jax_callable.py +34 -4
warp/examples/interop/example_jax_kernel.py +27 -1
warp/fabric.py +1 -1
warp/fem/cache.py +27 -19
warp/fem/domain.py +2 -2
warp/fem/field/nodal_field.py +2 -2
warp/fem/field/virtual.py +266 -166
warp/fem/geometry/geometry.py +5 -5
warp/fem/integrate.py +200 -91
warp/fem/space/restriction.py +4 -0
warp/fem/space/shape/tet_shape_function.py +3 -10
warp/jax_experimental/custom_call.py +1 -1
warp/jax_experimental/ffi.py +203 -54
warp/marching_cubes.py +708 -0
warp/native/array.h +103 -8
warp/native/builtin.h +90 -9
warp/native/bvh.cpp +64 -28
warp/native/bvh.cu +58 -58
warp/native/bvh.h +2 -2
warp/native/clang/clang.cpp +7 -7
warp/native/coloring.cpp +13 -3
warp/native/crt.cpp +2 -2
warp/native/crt.h +3 -5
warp/native/cuda_util.cpp +42 -11
warp/native/cuda_util.h +10 -4
warp/native/exports.h +1842 -1908
warp/native/fabric.h +2 -1
warp/native/hashgrid.cpp +37 -37
warp/native/hashgrid.cu +2 -2
warp/native/initializer_array.h +1 -1
warp/native/intersect.h +4 -4
warp/native/mat.h +1913 -119
warp/native/mathdx.cpp +43 -43
warp/native/mesh.cpp +24 -24
warp/native/mesh.cu +26 -26
warp/native/mesh.h +5 -3
warp/native/nanovdb/GridHandle.h +179 -12
warp/native/nanovdb/HostBuffer.h +8 -7
warp/native/nanovdb/NanoVDB.h +517 -895
warp/native/nanovdb/NodeManager.h +323 -0
warp/native/nanovdb/PNanoVDB.h +2 -2
warp/native/quat.h +337 -16
warp/native/rand.h +7 -7
warp/native/range.h +7 -1
warp/native/reduce.cpp +10 -10
warp/native/reduce.cu +13 -14
warp/native/runlength_encode.cpp +2 -2
warp/native/runlength_encode.cu +5 -5
warp/native/scan.cpp +3 -3
warp/native/scan.cu +4 -4
warp/native/sort.cpp +10 -10
warp/native/sort.cu +22 -22
warp/native/sparse.cpp +8 -8
warp/native/sparse.cu +14 -14
warp/native/spatial.h +366 -17
warp/native/svd.h +23 -8
warp/native/temp_buffer.h +2 -2
warp/native/tile.h +303 -70
warp/native/tile_radix_sort.h +5 -1
warp/native/tile_reduce.h +16 -25
warp/native/tuple.h +2 -2
warp/native/vec.h +385 -18
warp/native/volume.cpp +54 -54
warp/native/volume.cu +1 -1
warp/native/volume.h +2 -1
warp/native/volume_builder.cu +30 -37
warp/native/warp.cpp +150 -149
warp/native/warp.cu +337 -193
warp/native/warp.h +227 -226
warp/optim/linear.py +736 -271
warp/render/imgui_manager.py +289 -0
warp/render/render_opengl.py +137 -57
warp/render/render_usd.py +0 -1
warp/sim/collide.py +1 -2
warp/sim/graph_coloring.py +2 -2
warp/sim/integrator_vbd.py +10 -2
warp/sparse.py +559 -176
warp/tape.py +2 -0
warp/tests/aux_test_module_aot.py +7 -0
warp/tests/cuda/test_async.py +3 -3
warp/tests/cuda/test_conditional_captures.py +101 -0
warp/tests/geometry/test_marching_cubes.py +233 -12
warp/tests/sim/test_cloth.py +89 -6
warp/tests/sim/test_coloring.py +82 -7
warp/tests/test_array.py +56 -5
warp/tests/test_assert.py +53 -0
warp/tests/test_atomic_cas.py +127 -114
warp/tests/test_codegen.py +3 -2
warp/tests/test_context.py +8 -15
warp/tests/test_enum.py +136 -0
warp/tests/test_examples.py +2 -2
warp/tests/test_fem.py +45 -2
warp/tests/test_fixedarray.py +229 -0
warp/tests/test_func.py +18 -15
warp/tests/test_future_annotations.py +7 -5
warp/tests/test_linear_solvers.py +30 -0
warp/tests/test_map.py +1 -1
warp/tests/test_mat.py +1540 -378
warp/tests/test_mat_assign_copy.py +178 -0
warp/tests/test_mat_constructors.py +574 -0
warp/tests/test_module_aot.py +287 -0
warp/tests/test_print.py +69 -0
warp/tests/test_quat.py +162 -34
warp/tests/test_quat_assign_copy.py +145 -0
warp/tests/test_reload.py +2 -1
warp/tests/test_sparse.py +103 -0
warp/tests/test_spatial.py +140 -34
warp/tests/test_spatial_assign_copy.py +160 -0
warp/tests/test_static.py +48 -0
warp/tests/test_struct.py +43 -3
warp/tests/test_tape.py +38 -0
warp/tests/test_types.py +0 -20
warp/tests/test_vec.py +216 -441
warp/tests/test_vec_assign_copy.py +143 -0
warp/tests/test_vec_constructors.py +325 -0
warp/tests/tile/test_tile.py +206 -152
warp/tests/tile/test_tile_cholesky.py +605 -0
warp/tests/tile/test_tile_load.py +169 -0
warp/tests/tile/test_tile_mathdx.py +2 -558
warp/tests/tile/test_tile_matmul.py +179 -0
warp/tests/tile/test_tile_mlp.py +1 -1
warp/tests/tile/test_tile_reduce.py +100 -11
warp/tests/tile/test_tile_shared_memory.py +16 -16
warp/tests/tile/test_tile_sort.py +59 -55
warp/tests/unittest_suites.py +16 -0
warp/tests/walkthrough_debug.py +1 -1
warp/thirdparty/unittest_parallel.py +108 -9
warp/types.py +554 -264
warp/utils.py +68 -86
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
warp/native/marching.cpp +0 -19
warp/native/marching.cu +0 -514
warp/native/marching.h +0 -19
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0

warp/tests/tile/test_tile.py CHANGED Viewed

@@ -109,12 +109,29 @@ def test_tile_copy_2d(test, device):
 @wp.func
-def unary_func(x: float):
+def unary_func(x: wp.float32):
     return wp.sin(x)
+@wp.func
+def unary_func(x: wp.float64):
+    return wp.sin(x)
+@wp.kernel
+def tile_unary_map_user_func(input: wp.array2d(dtype=Any), output: wp.array2d(dtype=Any)):
+    # tile index
+    i, j = wp.tid()
+    a = wp.tile_load(input, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
+    sa = wp.tile_map(unary_func, a)
+    wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
 @wp.kernel
-def tile_unary_map(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
+def tile_unary_map_builtin_func(input: wp.array2d(dtype=Any), output: wp.array2d(dtype=Any)):
     # tile index
     i, j = wp.tid()
@@ -131,185 +148,235 @@ def test_tile_unary_map(test, device):
     M = TILE_M * 7
     N = TILE_N * 5
-    A = rng.random((M, N), dtype=np.float32)
-    B = np.sin(A)
+    def run(kernel, dtype):
+        A = rng.random((M, N), dtype=dtype)
+        B = np.sin(A)
-    A_grad = np.cos(A)
+        A_grad = np.cos(A)
-    A_wp = wp.array(A, requires_grad=True, device=device)
-    B_wp = wp.zeros_like(A_wp, requires_grad=True, device=device)
+        A_wp = wp.array(A, requires_grad=True, device=device)
+        B_wp = wp.zeros_like(A_wp, requires_grad=True, device=device)
-    with wp.Tape() as tape:
-        wp.launch_tiled(
-            tile_unary_map,
-            dim=[int(M / TILE_M), int(N / TILE_N)],
-            inputs=[A_wp, B_wp],
-            block_dim=TILE_DIM,
-            device=device,
-        )
+        with wp.Tape() as tape:
+            wp.launch_tiled(
+                kernel,
+                dim=[int(M / TILE_M), int(N / TILE_N)],
+                inputs=[A_wp, B_wp],
+                block_dim=TILE_DIM,
+                device=device,
+            )
-    # verify forward pass
-    assert_np_equal(B_wp.numpy(), B, tol=1.0e-4)
+        tol = 1.0e-6 if dtype == np.float64 else 1.0e-4
-    # verify backward pass
-    B_wp.grad = wp.ones_like(B_wp, device=device)
-    tape.backward()
+        # verify forward pass
+        assert_np_equal(B_wp.numpy(), B, tol=tol)
-    assert_np_equal(A_wp.grad.numpy(), A_grad, tol=1.0e-6)
+        # verify backward pass
+        B_wp.grad = wp.ones_like(B_wp, device=device)
+        tape.backward()
+        assert_np_equal(A_wp.grad.numpy(), A_grad, tol=tol)
+    dtypes = [np.float32, np.float64]
+    for dtype in dtypes:
+        run(tile_unary_map_user_func, dtype)
+        run(tile_unary_map_builtin_func, dtype)
 @wp.func
-def binary_func(x: float, y: float):
-    return wp.sin(x) + y
+def unary_func_mixed_types(x: int) -> float:
+    return wp.sin(float(x))
 @wp.kernel
-def tile_binary_map(
-    input_a: wp.array2d(dtype=float), input_b: wp.array2d(dtype=float), output: wp.array2d(dtype=float)
-):
+def tile_unary_map_mixed_types(input: wp.array2d(dtype=int), output: wp.array2d(dtype=float)):
     # tile index
     i, j = wp.tid()
-    a = wp.tile_load(input_a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
-    b = wp.tile_load(input_b, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
+    a = wp.tile_load(input, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
-    sa = wp.tile_map(binary_func, a, b)
+    sa = wp.tile_map(unary_func_mixed_types, a)
     wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
-def test_tile_binary_map(test, device):
+def test_tile_unary_map_mixed_types(test, device):
     rng = np.random.default_rng(42)
     M = TILE_M * 7
     N = TILE_N * 5
-    A = rng.random((M, N), dtype=np.float32)
-    B = rng.random((M, N), dtype=np.float32)
-    C = np.sin(A) + B
+    A = rng.integers(0, 100, size=(M, N), dtype=np.int32)
+    B = np.sin(A.astype(np.float32))
-    A_grad = np.cos(A)
-    B_grad = np.ones_like(B)
+    A_grad = np.cos(A.astype(np.float32))
     A_wp = wp.array(A, requires_grad=True, device=device)
-    B_wp = wp.array(B, requires_grad=True, device=device)
-    C_wp = wp.zeros_like(A_wp, requires_grad=True, device=device)
+    B_wp = wp.zeros((M, N), dtype=float, requires_grad=True, device=device)
     with wp.Tape() as tape:
         wp.launch_tiled(
-            tile_binary_map,
+            tile_unary_map_mixed_types,
             dim=[int(M / TILE_M), int(N / TILE_N)],
-            inputs=[A_wp, B_wp, C_wp],
+            inputs=[A_wp, B_wp],
             block_dim=TILE_DIM,
             device=device,
         )
     # verify forward pass
-    assert_np_equal(C_wp.numpy(), C, tol=1.0e-6)
+    assert_np_equal(B_wp.numpy(), B, tol=1.0e-4)
     # verify backward pass
-    C_wp.grad = wp.ones_like(C_wp, device=device)
+    B_wp.grad = wp.ones_like(B_wp, device=device)
     tape.backward()
-    assert_np_equal(A_wp.grad.numpy(), A_grad, tol=1.0e-6)
-    assert_np_equal(B_wp.grad.numpy(), B_grad)
-def test_tile_grouped_gemm(test, device):
-    @wp.kernel
-    def tile_grouped_gemm(A: wp.array3d(dtype=float), B: wp.array3d(dtype=float), C: wp.array3d(dtype=float)):
-        # output tile index
-        i = wp.tid()
+    # The gradients of `a` are now stored as ints and can't capture the correct values
+    # assert_np_equal(A_wp.grad.numpy(), A_grad, tol=1.0e-6)
-        a = wp.tile_load(A[i], shape=(TILE_M, TILE_K))
-        b = wp.tile_load(B[i], shape=(TILE_K, TILE_N))
-        sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float32)
-        wp.tile_matmul(a, b, sum)
+@wp.func
+def binary_func(x: wp.float32, y: wp.float32):
+    return x + y
-        wp.tile_store(C[i], sum)
-    batch_count = 56
+@wp.func
+def binary_func(x: wp.float64, y: wp.float64):
+    return x + y
-    M = TILE_M
-    N = TILE_N
-    K = TILE_K
-    rng = np.random.default_rng(42)
-    A = rng.random((batch_count, M, K), dtype=np.float32)
-    B = rng.random((batch_count, K, N), dtype=np.float32)
-    C = A @ B
+@wp.kernel
+def tile_binary_map_user_func(
+    input_a: wp.array2d(dtype=Any), input_b: wp.array2d(dtype=Any), output: wp.array2d(dtype=Any)
+):
+    # tile index
+    i, j = wp.tid()
-    A_wp = wp.array(A, requires_grad=True, device=device)
-    B_wp = wp.array(B, requires_grad=True, device=device)
-    C_wp = wp.zeros((batch_count, TILE_M, TILE_N), requires_grad=True, device=device)
+    a = wp.tile_load(input_a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
+    b = wp.tile_load(input_b, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
-    with wp.Tape() as tape:
-        wp.launch_tiled(
-            tile_grouped_gemm, dim=[batch_count], inputs=[A_wp, B_wp, C_wp], block_dim=TILE_DIM, device=device
-        )
+    sa = wp.tile_map(binary_func, a, b)
-    # TODO: 32 mismatched elements
-    assert_np_equal(C_wp.numpy(), C, 1e-6)
+    wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
-def test_tile_gemm(dtype):
-    def test(test, device):
-        @wp.kernel
-        def tile_gemm(A: wp.array2d(dtype=dtype), B: wp.array2d(dtype=dtype), C: wp.array2d(dtype=dtype)):
-            # output tile index
-            i, j = wp.tid()
+@wp.kernel
+def tile_binary_map_builtin_func(
+    input_a: wp.array2d(dtype=Any), input_b: wp.array2d(dtype=Any), output: wp.array2d(dtype=Any)
+):
+    # tile index
+    i, j = wp.tid()
-            sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=dtype)
+    a = wp.tile_load(input_a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
+    b = wp.tile_load(input_b, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
-            M = A.shape[0]
-            N = B.shape[1]
-            K = A.shape[1]
+    sa = wp.tile_map(wp.add, a, b)
-            count = int(K / TILE_K)
+    wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
-            for k in range(0, count):
-                a = wp.tile_load(A, shape=(TILE_M, TILE_K), offset=(i * TILE_M, k * TILE_K))
-                b = wp.tile_load(B, shape=(TILE_K, TILE_N), offset=(k * TILE_K, j * TILE_N))
-                # sum += a*b
-                wp.tile_matmul(a, b, sum)
+def test_tile_binary_map(test, device):
+    rng = np.random.default_rng(42)
-            wp.tile_store(C, sum, offset=(i * TILE_M, j * TILE_N))
+    M = TILE_M * 7
+    N = TILE_N * 5
-        M = TILE_M * 7
-        K = TILE_K * 6
-        N = TILE_N * 5
+    def run(kernel, dtype):
+        A = rng.random((M, N), dtype=dtype)
+        B = rng.random((M, N), dtype=dtype)
+        C = A + B
-        rng = np.random.default_rng(42)
-        A = rng.random((M, K), dtype=float).astype(wp.dtype_to_numpy(dtype))
-        B = rng.random((K, N), dtype=float).astype(wp.dtype_to_numpy(dtype))
-        C = np.zeros((M, N), dtype=float).astype(wp.dtype_to_numpy(dtype))
+        A_grad = np.ones_like(A)
+        B_grad = np.ones_like(B)
         A_wp = wp.array(A, requires_grad=True, device=device)
         B_wp = wp.array(B, requires_grad=True, device=device)
-        C_wp = wp.array(C, requires_grad=True, device=device)
+        C_wp = wp.zeros_like(A_wp, requires_grad=True, device=device)
         with wp.Tape() as tape:
             wp.launch_tiled(
-                tile_gemm,
-                dim=(int(M / TILE_M), int(N / TILE_N)),
+                kernel,
+                dim=[int(M / TILE_M), int(N / TILE_N)],
                 inputs=[A_wp, B_wp, C_wp],
                 block_dim=TILE_DIM,
                 device=device,
             )
-        assert_np_equal(C_wp.numpy(), A @ B, tol=1.0e-1)
+        tol = 1.0e-6 if dtype == np.float64 else 1.0e-4
-        adj_C = np.ones_like(C)
+        # verify forward pass
+        assert_np_equal(C_wp.numpy(), C, tol=tol)
-        tape.backward(grads={C_wp: wp.array(adj_C, device=device)})
+        # verify backward pass
+        C_wp.grad = wp.ones_like(C_wp, device=device)
+        tape.backward()
+        assert_np_equal(A_wp.grad.numpy(), A_grad, tol=tol)
+        assert_np_equal(B_wp.grad.numpy(), B_grad, tol=tol)
+    dtypes = [np.float32, np.float64]
-        assert_np_equal(A_wp.grad.numpy(), adj_C @ B.T, tol=1.0e-1)
-        assert_np_equal(B_wp.grad.numpy(), A.T @ adj_C, 1.0e-1)
+    for dtype in dtypes:
+        run(tile_binary_map_builtin_func, dtype)
+        run(tile_binary_map_user_func, dtype)
-    return test
+@wp.func
+def binary_func_mixed_types(x: int, y: float) -> float:
+    return wp.sin(float(x)) + y
+@wp.kernel
+def tile_binary_map_mixed_types(
+    input_a: wp.array2d(dtype=int), input_b: wp.array2d(dtype=float), output: wp.array2d(dtype=float)
+):
+    # tile index
+    i, j = wp.tid()
+    a = wp.tile_load(input_a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
+    b = wp.tile_load(input_b, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
+    sa = wp.tile_map(binary_func_mixed_types, a, b)
+    wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
+def test_tile_binary_map_mixed_types(test, device):
+    rng = np.random.default_rng(42)
+    M = TILE_M * 7
+    N = TILE_N * 5
+    A = rng.integers(0, 100, size=(M, N), dtype=np.int32)
+    B = rng.random((M, N), dtype=np.float32)
+    C = np.sin(A.astype(np.float32)) + B
+    A_grad = np.cos(A.astype(np.float32))
+    B_grad = np.ones_like(B)
+    A_wp = wp.array(A, requires_grad=True, device=device)
+    B_wp = wp.array(B, requires_grad=True, device=device)
+    C_wp = wp.zeros_like(B_wp, requires_grad=True, device=device)
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_binary_map_mixed_types,
+            dim=[int(M / TILE_M), int(N / TILE_N)],
+            inputs=[A_wp, B_wp, C_wp],
+            block_dim=TILE_DIM,
+            device=device,
+        )
+    # verify forward pass
+    assert_np_equal(C_wp.numpy(), C, tol=1.0e-6)
+    # verify backward pass
+    C_wp.grad = wp.ones_like(C_wp, device=device)
+    tape.backward()
+    # The gradients of `a` are now stored as ints and can't capture the correct values
+    # assert_np_equal(A_wp.grad.numpy(), A_grad, tol=1.0e-6)
+    assert_np_equal(B_wp.grad.numpy(), B_grad)
 @wp.kernel
@@ -368,6 +435,12 @@ def test_tile_tile_preserve_type_kernel(x: wp.array(dtype=Any), y: wp.array(dtyp
     wp.tile_store(y, t)
+wp.overload(test_tile_tile_preserve_type_kernel, {"x": wp.array(dtype=float), "y": wp.array(dtype=float)})
+wp.overload(test_tile_tile_preserve_type_kernel, {"x": wp.array(dtype=wp.vec3), "y": wp.array(dtype=wp.vec3)})
+wp.overload(test_tile_tile_preserve_type_kernel, {"x": wp.array(dtype=wp.quat), "y": wp.array(dtype=wp.quat)})
+wp.overload(test_tile_tile_preserve_type_kernel, {"x": wp.array(dtype=wp.mat33), "y": wp.array(dtype=wp.mat33)})
 @wp.kernel
 def test_tile_tile_scalar_expansion_kernel(x: wp.array(dtype=float), y: wp.array(dtype=float)):
     a = x[0]
@@ -494,6 +567,12 @@ def test_tile_untile_preserve_type_kernel(x: wp.array(dtype=Any), y: wp.array(dt
     y[i] = b
+wp.overload(test_tile_untile_preserve_type_kernel, {"x": wp.array(dtype=float), "y": wp.array(dtype=float)})
+wp.overload(test_tile_untile_preserve_type_kernel, {"x": wp.array(dtype=wp.vec3), "y": wp.array(dtype=wp.vec3)})
+wp.overload(test_tile_untile_preserve_type_kernel, {"x": wp.array(dtype=wp.quat), "y": wp.array(dtype=wp.quat)})
+wp.overload(test_tile_untile_preserve_type_kernel, {"x": wp.array(dtype=wp.mat33), "y": wp.array(dtype=wp.mat33)})
 @wp.kernel
 def test_tile_untile_kernel(x: wp.array(dtype=Any), y: wp.array(dtype=Any)):
     i = wp.tid()
@@ -503,6 +582,11 @@ def test_tile_untile_kernel(x: wp.array(dtype=Any), y: wp.array(dtype=Any)):
     y[i] = b
+wp.overload(test_tile_untile_kernel, {"x": wp.array(dtype=float), "y": wp.array(dtype=float)})
+wp.overload(test_tile_untile_kernel, {"x": wp.array(dtype=wp.vec3), "y": wp.array(dtype=wp.vec3)})
+wp.overload(test_tile_untile_kernel, {"x": wp.array(dtype=wp.mat33), "y": wp.array(dtype=wp.mat33)})
 def test_tile_untile(test, device):
     def test_func_preserve_type(type: Any):
         x = wp.ones(TILE_DIM, dtype=type, requires_grad=True, device=device)
@@ -644,7 +728,7 @@ def test_tile_sum_launch(test, device):
     assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5)
-@wp.kernel
+@wp.kernel(module="unique")
 def test_tile_extract_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
     i, j, x, y = wp.tid()
@@ -680,7 +764,7 @@ def test_tile_extract(test, device):
     assert_np_equal(a.grad.numpy(), expected_grad)
-@wp.kernel
+@wp.kernel(module="unique")
 def test_tile_extract_repeated_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
     i, j, x, y = wp.tid()
@@ -744,7 +828,7 @@ def test_tile_assign(test, device):
     tape = wp.Tape()
     with tape:
-        wp.launch(test_tile_assign_kernel, dim=[1, TILE_M], inputs=[x], outputs=[y], block_dim=64, device=device)
+        wp.launch(test_tile_assign_kernel, dim=[1, TILE_M], inputs=[x], outputs=[y], block_dim=TILE_DIM, device=device)
     y.grad = wp.ones_like(y)
     tape.backward()
@@ -766,31 +850,11 @@ def test_tile_transpose(test, device):
     input = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32), device=device)
     output = wp.zeros_like(input.transpose(), device=device)
-    wp.launch_tiled(test_tile_transpose_kernel, dim=[1], inputs=[input, output], block_dim=32, device=device)
+    wp.launch_tiled(test_tile_transpose_kernel, dim=[1], inputs=[input, output], block_dim=TILE_DIM, device=device)
     assert_np_equal(output.numpy(), input.numpy().T)
-def test_tile_transpose_matmul(test, device):
-    @wp.kernel
-    def test_tile_transpose_matmul_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
-        x = wp.tile_load(input, shape=(TILE_M, TILE_N))
-        y = wp.tile_transpose(x)
-        z = wp.tile_zeros(dtype=float, shape=(TILE_N, TILE_N))
-        wp.tile_matmul(y, x, z)
-        wp.tile_store(output, z)
-    rng = np.random.default_rng(42)
-    input = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32), device=device)
-    output = wp.zeros((TILE_N, TILE_N), dtype=float, device=device)
-    wp.launch_tiled(test_tile_transpose_matmul_kernel, dim=[1], inputs=[input, output], block_dim=32, device=device)
-    assert_np_equal(output.numpy(), input.numpy().T @ input.numpy())
 @wp.kernel
 def test_tile_broadcast_add_1d_kernel(
     input_a: wp.array(dtype=float), input_b: wp.array(dtype=float), output: wp.array(dtype=float)
@@ -812,7 +876,7 @@ def test_tile_broadcast_add_1d(test, device):
     b = wp.array(np.ones(1, dtype=np.float32), device=device)
     out = wp.zeros((N,), dtype=float, device=device)
-    wp.launch_tiled(test_tile_broadcast_add_1d_kernel, dim=[1], inputs=[a, b, out], block_dim=32, device=device)
+    wp.launch_tiled(test_tile_broadcast_add_1d_kernel, dim=[1], inputs=[a, b, out], block_dim=TILE_DIM, device=device)
     assert_np_equal(out.numpy(), a.numpy() + b.numpy())
@@ -839,7 +903,7 @@ def test_tile_broadcast_add_2d(test, device):
     b = wp.array(np.arange(0, N, dtype=np.float32), device=device)
     out = wp.zeros((M, N), dtype=float, device=device)
-    wp.launch_tiled(test_tile_broadcast_add_2d_kernel, dim=[1], inputs=[a, b, out], block_dim=32, device=device)
+    wp.launch_tiled(test_tile_broadcast_add_2d_kernel, dim=[1], inputs=[a, b, out], block_dim=TILE_DIM, device=device)
     assert_np_equal(out.numpy(), a.numpy() + b.numpy())
@@ -867,7 +931,7 @@ def test_tile_broadcast_add_3d(test, device):
     b = wp.array(np.arange(0, M * N, dtype=np.float32).reshape((M, N, 1)), device=device)
     out = wp.zeros((M, N, O), dtype=float, device=device)
-    wp.launch_tiled(test_tile_broadcast_add_3d_kernel, dim=[1], inputs=[a, b, out], block_dim=32, device=device)
+    wp.launch_tiled(test_tile_broadcast_add_3d_kernel, dim=[1], inputs=[a, b, out], block_dim=TILE_DIM, device=device)
     assert_np_equal(out.numpy(), a.numpy() + b.numpy())
@@ -894,7 +958,7 @@ def test_tile_broadcast_add_4d(test, device):
     b = wp.array(np.arange(0, M * O, dtype=np.float32).reshape((M, 1, O, 1)), device=device)
     out = wp.zeros((M, N, O, P), dtype=float, device=device)
-    wp.launch_tiled(test_tile_broadcast_add_4d_kernel, dim=[1], inputs=[a, b, out], block_dim=32, device=device)
+    wp.launch_tiled(test_tile_broadcast_add_4d_kernel, dim=[1], inputs=[a, b, out], block_dim=TILE_DIM, device=device)
     assert_np_equal(out.numpy(), a.numpy() + b.numpy())
@@ -915,7 +979,7 @@ def test_tile_broadcast_grad(test, device):
     b = wp.array(np.ones((5, 5), dtype=np.float32), requires_grad=True, device=device)
     with wp.Tape() as tape:
-        wp.launch_tiled(test_tile_broadcast_grad_kernel, dim=[1], inputs=[a, b], block_dim=32, device=device)
+        wp.launch_tiled(test_tile_broadcast_grad_kernel, dim=[1], inputs=[a, b], block_dim=TILE_DIM, device=device)
     b.grad = wp.ones_like(b, device=device)
     tape.backward()
@@ -1049,14 +1113,7 @@ def tile_len_kernel(
 def test_tile_len(test, device):
     a = wp.zeros((TILE_M, TILE_N), dtype=float, device=device)
     out = wp.empty(1, dtype=int, device=device)
-    wp.launch_tiled(
-        tile_len_kernel,
-        dim=(1,),
-        inputs=(a,),
-        outputs=(out,),
-        block_dim=32,
-        device=device,
-    )
+    wp.launch_tiled(tile_len_kernel, dim=(1,), inputs=(a,), outputs=(out,), block_dim=TILE_DIM, device=device)
     test.assertEqual(out.numpy()[0], TILE_M)
@@ -1192,13 +1249,10 @@ class TestTile(unittest.TestCase):
 add_function_test(TestTile, "test_tile_copy_1d", test_tile_copy_1d, devices=devices)
 add_function_test(TestTile, "test_tile_copy_2d", test_tile_copy_2d, devices=devices)
 add_function_test(TestTile, "test_tile_unary_map", test_tile_unary_map, devices=devices)
+add_function_test(TestTile, "test_tile_unary_map_mixed_types", test_tile_unary_map_mixed_types, devices=devices)
 add_function_test(TestTile, "test_tile_binary_map", test_tile_binary_map, devices=devices)
-add_function_test(TestTile, "test_tile_grouped_gemm", test_tile_grouped_gemm, devices=devices)
-add_function_test(TestTile, "test_tile_gemm_fp16", test_tile_gemm(wp.float16), devices=devices)
-add_function_test(TestTile, "test_tile_gemm_fp32", test_tile_gemm(wp.float32), devices=devices)
-add_function_test(TestTile, "test_tile_gemm_fp64", test_tile_gemm(wp.float64), devices=devices)
+add_function_test(TestTile, "test_tile_binary_map_mixed_types", test_tile_binary_map_mixed_types, devices=devices)
 add_function_test(TestTile, "test_tile_transpose", test_tile_transpose, devices=devices)
-add_function_test(TestTile, "test_tile_transpose_matmul", test_tile_transpose_matmul, devices=devices)
 add_function_test(TestTile, "test_tile_operators", test_tile_operators, devices=devices)
 add_function_test(TestTile, "test_tile_tile", test_tile_tile, devices=get_cuda_test_devices())
 add_function_test(TestTile, "test_tile_untile", test_tile_untile, devices=devices)
@@ -1215,10 +1269,10 @@ add_function_test(TestTile, "test_tile_broadcast_grad", test_tile_broadcast_grad
 add_function_test(TestTile, "test_tile_squeeze", test_tile_squeeze, devices=devices)
 add_function_test(TestTile, "test_tile_reshape", test_tile_reshape, devices=devices)
 add_function_test(TestTile, "test_tile_len", test_tile_len, devices=devices)
-add_function_test(TestTile, "test_tile_print", test_tile_print, devices=devices, check_output=False)
-add_function_test(TestTile, "test_tile_inplace", test_tile_inplace, devices=devices)
-add_function_test(TestTile, "test_tile_astype", test_tile_astype, devices=devices)
-add_function_test(TestTile, "test_tile_func_return", test_tile_func_return, devices=devices)
+# add_function_test(TestTile, "test_tile_print", test_tile_print, devices=devices, check_output=False)
+# add_function_test(TestTile, "test_tile_inplace", test_tile_inplace, devices=devices)
+# add_function_test(TestTile, "test_tile_astype", test_tile_astype, devices=devices)
+# add_function_test(TestTile, "test_tile_func_return", test_tile_func_return, devices=devices)
 if __name__ == "__main__":