warp-lang 1.4.1-py3-none-manylinux2014_x86_64.whl → 1.5.0-py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang might be problematic.

Files changed (164)
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +21 -2
  6. warp/build_dll.py +23 -6
  7. warp/builtins.py +1920 -111
  8. warp/codegen.py +186 -62
  9. warp/config.py +2 -2
  10. warp/context.py +322 -73
  11. warp/examples/assets/pixel.jpg +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  13. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  14. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  15. warp/examples/benchmarks/benchmark_tile.py +179 -0
  16. warp/examples/core/example_dem.py +2 -1
  17. warp/examples/core/example_mesh_intersect.py +3 -3
  18. warp/examples/fem/example_adaptive_grid.py +37 -10
  19. warp/examples/fem/example_apic_fluid.py +3 -2
  20. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  21. warp/examples/fem/example_deformed_geometry.py +1 -1
  22. warp/examples/fem/example_diffusion_3d.py +47 -4
  23. warp/examples/fem/example_distortion_energy.py +220 -0
  24. warp/examples/fem/example_magnetostatics.py +127 -85
  25. warp/examples/fem/example_nonconforming_contact.py +5 -5
  26. warp/examples/fem/example_stokes.py +3 -1
  27. warp/examples/fem/example_streamlines.py +12 -19
  28. warp/examples/fem/utils.py +38 -15
  29. warp/examples/optim/example_walker.py +2 -2
  30. warp/examples/sim/example_cloth.py +2 -25
  31. warp/examples/sim/example_jacobian_ik.py +6 -2
  32. warp/examples/sim/example_quadruped.py +2 -1
  33. warp/examples/tile/example_tile_convolution.py +58 -0
  34. warp/examples/tile/example_tile_fft.py +47 -0
  35. warp/examples/tile/example_tile_filtering.py +105 -0
  36. warp/examples/tile/example_tile_matmul.py +79 -0
  37. warp/examples/tile/example_tile_mlp.py +375 -0
  38. warp/fem/__init__.py +8 -0
  39. warp/fem/cache.py +16 -12
  40. warp/fem/dirichlet.py +1 -1
  41. warp/fem/domain.py +44 -1
  42. warp/fem/field/__init__.py +1 -2
  43. warp/fem/field/field.py +31 -19
  44. warp/fem/field/nodal_field.py +101 -49
  45. warp/fem/field/virtual.py +794 -0
  46. warp/fem/geometry/__init__.py +2 -2
  47. warp/fem/geometry/deformed_geometry.py +3 -105
  48. warp/fem/geometry/element.py +13 -0
  49. warp/fem/geometry/geometry.py +165 -5
  50. warp/fem/geometry/grid_2d.py +3 -6
  51. warp/fem/geometry/grid_3d.py +31 -28
  52. warp/fem/geometry/hexmesh.py +3 -46
  53. warp/fem/geometry/nanogrid.py +3 -2
  54. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  55. warp/fem/geometry/tetmesh.py +2 -43
  56. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  57. warp/fem/integrate.py +683 -261
  58. warp/fem/linalg.py +404 -0
  59. warp/fem/operator.py +101 -18
  60. warp/fem/polynomial.py +5 -5
  61. warp/fem/quadrature/quadrature.py +45 -21
  62. warp/fem/space/__init__.py +45 -11
  63. warp/fem/space/basis_function_space.py +451 -0
  64. warp/fem/space/basis_space.py +58 -11
  65. warp/fem/space/function_space.py +146 -5
  66. warp/fem/space/grid_2d_function_space.py +80 -66
  67. warp/fem/space/grid_3d_function_space.py +113 -68
  68. warp/fem/space/hexmesh_function_space.py +96 -108
  69. warp/fem/space/nanogrid_function_space.py +62 -110
  70. warp/fem/space/quadmesh_function_space.py +208 -0
  71. warp/fem/space/shape/__init__.py +45 -7
  72. warp/fem/space/shape/cube_shape_function.py +328 -54
  73. warp/fem/space/shape/shape_function.py +10 -1
  74. warp/fem/space/shape/square_shape_function.py +328 -60
  75. warp/fem/space/shape/tet_shape_function.py +269 -19
  76. warp/fem/space/shape/triangle_shape_function.py +238 -19
  77. warp/fem/space/tetmesh_function_space.py +69 -37
  78. warp/fem/space/topology.py +38 -0
  79. warp/fem/space/trimesh_function_space.py +179 -0
  80. warp/fem/utils.py +6 -331
  81. warp/jax_experimental.py +3 -1
  82. warp/native/array.h +55 -40
  83. warp/native/builtin.h +124 -43
  84. warp/native/bvh.h +4 -0
  85. warp/native/coloring.cpp +600 -0
  86. warp/native/cuda_util.cpp +14 -0
  87. warp/native/cuda_util.h +2 -1
  88. warp/native/fabric.h +8 -0
  89. warp/native/hashgrid.h +4 -0
  90. warp/native/marching.cu +8 -0
  91. warp/native/mat.h +14 -3
  92. warp/native/mathdx.cpp +59 -0
  93. warp/native/mesh.h +4 -0
  94. warp/native/range.h +13 -1
  95. warp/native/reduce.cpp +9 -1
  96. warp/native/reduce.cu +7 -0
  97. warp/native/runlength_encode.cpp +9 -1
  98. warp/native/runlength_encode.cu +7 -1
  99. warp/native/scan.cpp +8 -0
  100. warp/native/scan.cu +8 -0
  101. warp/native/scan.h +8 -1
  102. warp/native/sparse.cpp +8 -0
  103. warp/native/sparse.cu +8 -0
  104. warp/native/temp_buffer.h +7 -0
  105. warp/native/tile.h +1857 -0
  106. warp/native/tile_gemm.h +341 -0
  107. warp/native/tile_reduce.h +210 -0
  108. warp/native/volume_builder.cu +8 -0
  109. warp/native/volume_builder.h +8 -0
  110. warp/native/warp.cpp +10 -2
  111. warp/native/warp.cu +369 -15
  112. warp/native/warp.h +12 -2
  113. warp/optim/adam.py +39 -4
  114. warp/paddle.py +29 -12
  115. warp/render/render_opengl.py +137 -65
  116. warp/sim/graph_coloring.py +292 -0
  117. warp/sim/integrator_euler.py +4 -2
  118. warp/sim/integrator_featherstone.py +115 -44
  119. warp/sim/integrator_vbd.py +6 -0
  120. warp/sim/model.py +90 -17
  121. warp/stubs.py +651 -85
  122. warp/tape.py +12 -7
  123. warp/tests/assets/pixel.npy +0 -0
  124. warp/tests/aux_test_instancing_gc.py +18 -0
  125. warp/tests/test_array.py +207 -48
  126. warp/tests/test_closest_point_edge_edge.py +8 -8
  127. warp/tests/test_codegen.py +120 -1
  128. warp/tests/test_codegen_instancing.py +30 -0
  129. warp/tests/test_collision.py +110 -0
  130. warp/tests/test_coloring.py +241 -0
  131. warp/tests/test_context.py +34 -0
  132. warp/tests/test_examples.py +18 -4
  133. warp/tests/test_fabricarray.py +33 -0
  134. warp/tests/test_fem.py +453 -113
  135. warp/tests/test_func.py +48 -1
  136. warp/tests/test_generics.py +52 -0
  137. warp/tests/test_iter.py +68 -0
  138. warp/tests/test_mat_scalar_ops.py +1 -1
  139. warp/tests/test_mesh_query_point.py +5 -4
  140. warp/tests/test_module_hashing.py +23 -0
  141. warp/tests/test_paddle.py +27 -87
  142. warp/tests/test_print.py +191 -1
  143. warp/tests/test_spatial.py +1 -1
  144. warp/tests/test_tile.py +700 -0
  145. warp/tests/test_tile_mathdx.py +144 -0
  146. warp/tests/test_tile_mlp.py +383 -0
  147. warp/tests/test_tile_reduce.py +374 -0
  148. warp/tests/test_tile_shared_memory.py +190 -0
  149. warp/tests/test_vbd.py +12 -20
  150. warp/tests/test_volume.py +43 -0
  151. warp/tests/unittest_suites.py +23 -2
  152. warp/tests/unittest_utils.py +4 -0
  153. warp/types.py +339 -73
  154. warp/utils.py +22 -1
  155. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
  156. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/RECORD +159 -132
  157. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
  158. warp/fem/field/test.py +0 -180
  159. warp/fem/field/trial.py +0 -183
  160. warp/fem/space/collocated_function_space.py +0 -102
  161. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  162. warp/fem/space/trimesh_2d_function_space.py +0 -153
  163. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
  164. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
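
Much of this release is the new tile programming API (warp/native/tile.h, the warp/examples/tile/ examples, and the test_tile*.py files listed above). The sketch below is distilled from the new tests shown later in this diff and only illustrates the basic pattern (wp.tile_load, wp.tile_matmul, wp.tile_store, launched with wp.launch_tiled); the array names and tile shapes here are illustrative and are not part of the package contents.

import numpy as np
import warp as wp

# illustrative tile shapes, mirroring the constants used in the new tests
TILE_M = wp.constant(8)
TILE_N = wp.constant(4)
TILE_K = wp.constant(8)
TILE_DIM = 32  # threads cooperating on each logical tile

@wp.kernel
def gemm_tile(ga: wp.array2d(dtype=wp.float32),
              gb: wp.array2d(dtype=wp.float32),
              gc: wp.array2d(dtype=wp.float32)):
    i, j = wp.tid()
    a = wp.tile_load(ga, i, j, m=TILE_M, n=TILE_K)   # cooperative load of an 8x8 tile of A
    b = wp.tile_load(gb, i, j, m=TILE_K, n=TILE_N)   # cooperative load of an 8x4 tile of B
    c = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
    wp.tile_matmul(a, b, c)                          # block-wide matmul accumulating into c
    wp.tile_store(gc, i, j, c)

# the new tile tests run these kernels on CUDA devices
rng = np.random.default_rng(0)
A = wp.array(rng.random((8, 8), dtype=np.float32))
B = wp.array(rng.random((8, 4), dtype=np.float32))
C = wp.zeros((8, 4), dtype=wp.float32)

# launch_tiled assigns TILE_DIM threads to each logical (i, j) tile index
wp.launch_tiled(gemm_tile, dim=[1, 1], inputs=[A, B, C], block_dim=TILE_DIM)
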
warp/tests/test_tile_mathdx.py (new file)
@@ -0,0 +1,144 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ import functools
+ import unittest
+
+ import numpy as np
+
+ import warp as wp
+ from warp.tests.unittest_utils import *
+
+ wp.init()  # For wp.context.runtime.core.is_mathdx_enabled()
+
+ TILE_M = wp.constant(8)
+ TILE_N = wp.constant(4)
+ TILE_K = wp.constant(8)
+
+ # num threads per-tile
+ TILE_DIM = 32
+ FFT_SIZE_FP32 = 64
+ FFT_SIZE_FP64 = 64
+
+
+ @wp.kernel()
+ def tile_math_matmul_kernel(
+     ga: wp.array2d(dtype=wp.float16), gb: wp.array2d(dtype=wp.float32), gc: wp.array2d(dtype=wp.float64)
+ ):
+     i, j = wp.tid()
+     a = wp.tile_load(ga, i, j, m=TILE_M, n=TILE_K)
+     b = wp.tile_load(gb, i, j, m=TILE_K, n=TILE_N)
+     c = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float64)
+     wp.tile_matmul(a, b, c)
+     wp.tile_store(gc, i, j, c)
+
+
+ def test_tile_math_matmul(test, device):
+     rng = np.random.default_rng(42)
+
+     A = rng.random((TILE_M, TILE_K), dtype=np.float64).astype(np.float16)
+     B = rng.random((TILE_K, TILE_N), dtype=np.float32)
+     C = np.zeros((TILE_M, TILE_N), dtype=np.float64)
+
+     A_wp = wp.array(A, requires_grad=True, device=device)
+     B_wp = wp.array(B, requires_grad=True, device=device)
+     C_wp = wp.array(C, requires_grad=True, device=device)
+
+     with wp.Tape() as tape:
+         wp.launch_tiled(
+             tile_math_matmul_kernel,
+             dim=[1, 1],
+             inputs=[A_wp, B_wp, C_wp],
+             block_dim=TILE_DIM,
+             device=device,
+         )
+
+     # verify forward pass
+     assert_np_equal(C_wp.numpy(), A @ B, tol=1e-2)
+
+     adj_C = np.ones_like(C)
+
+     tape.backward(grads={C_wp: wp.array(adj_C, device=device)})
+
+     assert_np_equal(A_wp.grad.numpy(), adj_C @ B.T, tol=1e-2)
+     assert_np_equal(B_wp.grad.numpy(), A.T @ adj_C, tol=1e-2)
+
+
+ @wp.kernel()
+ def tile_math_fft_kernel_vec2f(gx: wp.array2d(dtype=wp.vec2f), gy: wp.array2d(dtype=wp.vec2f)):
+     i, j = wp.tid()
+     xy = wp.tile_load(gx, i, j, m=FFT_SIZE_FP32, n=FFT_SIZE_FP32)
+     wp.tile_fft(xy)
+     wp.tile_store(gy, i, j, xy)
+
+
+ @wp.kernel()
+ def tile_math_fft_kernel_vec2d(gx: wp.array2d(dtype=wp.vec2d), gy: wp.array2d(dtype=wp.vec2d)):
+     i, j = wp.tid()
+     xy = wp.tile_load(gx, i, j, m=FFT_SIZE_FP64, n=FFT_SIZE_FP64)
+     wp.tile_fft(xy)
+     wp.tile_store(gy, i, j, xy)
+
+
+ def test_tile_math_fft(test, device, wp_dtype):
+     np_real_dtype = {wp.vec2f: np.float32, wp.vec2d: np.float64}[wp_dtype]
+     np_cplx_dtype = {wp.vec2f: np.complex64, wp.vec2d: np.complex128}[wp_dtype]
+     kernel = {wp.vec2d: tile_math_fft_kernel_vec2d, wp.vec2f: tile_math_fft_kernel_vec2f}[wp_dtype]
+     fft_size = {wp.vec2d: FFT_SIZE_FP64, wp.vec2f: FFT_SIZE_FP32}[wp_dtype]
+
+     rng = np.random.default_rng(42)
+
+     # Warp doesn't really have a complex64 type,
+     # so we use 2 float32 to represent a single complex64 number and then convert it to vec2f
+
+     X = rng.random((fft_size, 2 * fft_size), dtype=np_real_dtype)
+     Y = np.zeros_like(X)
+
+     X_wp = wp.array2d(X, requires_grad=True, dtype=wp_dtype, device=device)
+     Y_wp = wp.array2d(Y, requires_grad=True, dtype=wp_dtype, device=device)
+
+     X_c64 = X.view(np_cplx_dtype).reshape(fft_size, fft_size)
+     Y_c64 = np.fft.fft(X_c64, axis=-1)
+
+     with wp.Tape() as tape:
+         wp.launch_tiled(kernel, dim=[1, 1], inputs=[X_wp, Y_wp], block_dim=TILE_DIM, device=device)
+
+     Y_wp_c64 = Y_wp.numpy().view(np_cplx_dtype).reshape(fft_size, fft_size)
+
+     assert_np_equal(Y_wp_c64, Y_c64, tol=1.0e-4)
+
+     # TODO: implement and test backward pass
+
+
+ devices = get_cuda_test_devices()
+
+
+ @unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support")
+ class TestTileMathDx(unittest.TestCase):
+     pass
+
+
+ # check_output=False so we can enable libmathdx's logging without failing the tests
+ add_function_test(TestTileMathDx, "test_tile_math_matmul", test_tile_math_matmul, devices=devices, check_output=False)
+ add_function_test(
+     TestTileMathDx,
+     "test_tile_math_fft_vec2f",
+     functools.partial(test_tile_math_fft, wp_dtype=wp.vec2f),
+     devices=devices,
+     check_output=False,
+ )
+ add_function_test(
+     TestTileMathDx,
+     "test_tile_math_fft_vec2d",
+     functools.partial(test_tile_math_fft, wp_dtype=wp.vec2d),
+     devices=devices,
+     check_output=False,
+ )
+
+ if __name__ == "__main__":
+     wp.clear_kernel_cache()
+     unittest.main(verbosity=2, failfast=True)
warp/tests/test_tile_mlp.py (new file)
@@ -0,0 +1,383 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ import os
+
+ import numpy as np
+
+ import warp as wp
+ import warp.examples
+ import warp.optim
+ from warp.tests.unittest_utils import *
+
+ wp.init()
+
+ # needs to be constant for the whole module
+ NUM_THREADS = 32
+
+
+ def create_layer(rng, dim_in, dim_hid, dtype=float):
+     w = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in))
+     b = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, 1))
+
+     weights = wp.array(w, dtype=dtype, requires_grad=True)
+     bias = wp.array(b, dtype=dtype, requires_grad=True)
+
+     return (weights, bias)
+
+
+ def create_array(rng, dim_in, dim_hid, dtype=float):
+     s = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in))
+     a = wp.array(s, dtype=dtype, requires_grad=True)
+
+     return a
+
+
+ @unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support")
+ def test_multi_layer_nn(test, device):
+     import torch as tc
+
+     NUM_FREQ = wp.constant(8)
+
+     DIM_IN = wp.constant(4 * NUM_FREQ)  # sin,cos for both x,y at each frequency
+     DIM_HID = 32
+     DIM_OUT = 3
+
+     IMG_WIDTH = 256
+     IMG_HEIGHT = 256
+
+     BATCH_SIZE = min(512, int((IMG_WIDTH * IMG_HEIGHT) / 8))
+
+     dtype = wp.float16
+
+     @wp.func
+     def relu(x: dtype):
+         return wp.max(x, dtype(0.0))
+
+     @wp.func
+     def sigmoid(x: dtype):
+         return dtype(1.0 / (1.0 + wp.exp(-float(x))))
+
+     @wp.kernel
+     def zero(loss: wp.array(dtype=float)):
+         loss[0] = 0.0
+
+     @wp.kernel
+     def compute(
+         batches: wp.array(dtype=int),
+         input: wp.array2d(dtype=dtype),
+         weights_0: wp.array2d(dtype=dtype),
+         bias_0: wp.array2d(dtype=dtype),
+         weights_1: wp.array2d(dtype=dtype),
+         bias_1: wp.array2d(dtype=dtype),
+         weights_2: wp.array2d(dtype=dtype),
+         bias_2: wp.array2d(dtype=dtype),
+         weights_3: wp.array2d(dtype=dtype),
+         bias_3: wp.array2d(dtype=dtype),
+         reference: wp.array2d(dtype=float),
+         loss: wp.array1d(dtype=float),
+         out: wp.array2d(dtype=float),
+     ):
+         linear = batches[wp.tid()]
+         row = linear / IMG_WIDTH
+         col = linear % IMG_WIDTH
+
+         # normalize input coordinates to [-1, 1]
+         x = (float(row) / float(IMG_WIDTH) - 0.5) * 2.0
+         y = (float(col) / float(IMG_HEIGHT) - 0.5) * 2.0
+
+         local = wp.vector(dtype=dtype, length=DIM_IN)
+
+         # construct positional encoding
+         for s in range(NUM_FREQ):
+             scale = wp.pow(2.0, float(s)) * wp.pi
+
+             # x-coord
+             local[s * 4 + 0] = dtype(wp.sin(x * scale))
+             local[s * 4 + 1] = dtype(wp.cos(x * scale))
+
+             # y-coord
+             local[s * 4 + 2] = dtype(wp.sin(y * scale))
+             local[s * 4 + 3] = dtype(wp.cos(y * scale))
+
+             # write input back to array so that torch can use it
+             input[s * 4 + 0, linear] = local[s * 4 + 0]
+             input[s * 4 + 1, linear] = local[s * 4 + 1]
+             input[s * 4 + 2, linear] = local[s * 4 + 2]
+             input[s * 4 + 3, linear] = local[s * 4 + 3]
+
+         # tile feature vectors across the block, returns [dim(f), NUM_THREADS]
+         f = wp.tile(local)
+
+         # input layer
+         w0 = wp.tile_load(weights_0, 0, 0, m=DIM_HID, n=DIM_IN)
+         b0 = wp.tile_load(bias_0, 0, 0, m=DIM_HID, n=1)
+         z = wp.tile_map(relu, wp.tile_matmul(w0, f) + wp.tile_broadcast(b0, m=DIM_HID, n=NUM_THREADS))
+
+         # hidden layer
+         w1 = wp.tile_load(weights_1, 0, 0, m=DIM_HID, n=DIM_HID)
+         b1 = wp.tile_load(bias_1, 0, 0, m=DIM_HID, n=1)
+         z = wp.tile_map(relu, wp.tile_matmul(w1, z) + wp.tile_broadcast(b1, m=DIM_HID, n=NUM_THREADS))
+
+         w2 = wp.tile_load(weights_2, 0, 0, m=DIM_HID, n=DIM_HID)
+         b2 = wp.tile_load(bias_2, 0, 0, m=DIM_HID, n=1)
+         z = wp.tile_map(relu, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, m=DIM_HID, n=NUM_THREADS))
+
+         # output layer
+         w3 = wp.tile_load(weights_3, 0, 0, m=DIM_OUT, n=DIM_HID)
+         b3 = wp.tile_load(bias_3, 0, 0, m=DIM_OUT, n=1)
+         o = wp.tile_map(relu, wp.tile_matmul(w3, z) + wp.tile_broadcast(b3, m=DIM_OUT, n=NUM_THREADS))
+
+         # untile back to SIMT
+         output = wp.untile(o)
+
+         # compute error
+         error = wp.vec3(
+             float(output[0]) - reference[0, linear],
+             float(output[1]) - reference[1, linear],
+             float(output[2]) - reference[2, linear],
+         )
+
+         # write MSE loss
+         wp.atomic_add(loss, 0, wp.length_sq(error) / float(3 * BATCH_SIZE))
+
+         # image output
+         for i in range(DIM_OUT):
+             out[i, linear] = float(output[i])
+
+     with wp.ScopedDevice(device):
+         torch_device = wp.device_to_torch(device)
+
+         rng = np.random.default_rng(45)
+
+         weights_0, bias_0 = create_layer(rng, DIM_IN, DIM_HID, dtype=dtype)
+         weights_1, bias_1 = create_layer(rng, DIM_HID, DIM_HID, dtype=dtype)
+         weights_2, bias_2 = create_layer(rng, DIM_HID, DIM_HID, dtype=dtype)
+         weights_3, bias_3 = create_layer(rng, DIM_HID, DIM_OUT, dtype=dtype)
+
+         input = create_array(rng, IMG_WIDTH * IMG_HEIGHT, DIM_IN, dtype=dtype)
+         output = create_array(rng, IMG_WIDTH * IMG_HEIGHT, DIM_OUT)
+
+         reference_np = np.load(os.path.join(os.path.dirname(__file__), "assets/pixel.npy"), allow_pickle=True) / 255.0
+         reference = wp.array(reference_np, dtype=float)
+
+         assert reference.shape[1] == IMG_WIDTH * IMG_HEIGHT
+
+         loss = wp.zeros(1, dtype=float, requires_grad=True)
+
+         params = [weights_0, bias_0, weights_1, bias_1, weights_2, bias_2, weights_3, bias_3]
+
+         optimizer_grads = [p.grad.flatten() for p in params]
+         optimizer_inputs = [p.flatten() for p in params]
+         optimizer = warp.optim.Adam(optimizer_inputs, lr=0.01)
+
+         num_batches = int((IMG_WIDTH * IMG_HEIGHT) / BATCH_SIZE)
+         max_epochs = 30
+
+         # create randomized batch indices
+         batches = np.arange(0, IMG_WIDTH * IMG_HEIGHT, dtype=np.int32)
+         rng.shuffle(batches)
+         batches = wp.array(batches)
+
+         with wp.ScopedTimer("Training", active=False):
+             for epoch in range(max_epochs):
+                 for b in range(0, IMG_WIDTH * IMG_HEIGHT, BATCH_SIZE):
+                     loss.zero_()
+
+                     with wp.Tape() as tape:
+                         wp.launch(
+                             compute,
+                             dim=[BATCH_SIZE],
+                             inputs=[
+                                 batches[b : b + BATCH_SIZE],
+                                 input,
+                                 weights_0,
+                                 bias_0,
+                                 weights_1,
+                                 bias_1,
+                                 weights_2,
+                                 bias_2,
+                                 weights_3,
+                                 bias_3,
+                                 reference,
+                                 loss,
+                                 output,
+                             ],
+                             block_dim=NUM_THREADS,
+                         )
+
+                     tape.backward(loss)
+
+                     # check outputs + grads on the first few epoch only
+                     # since this is a relatively slow operation
+                     verify = True
+                     if verify and epoch < 3:
+                         indices = batches[b : b + BATCH_SIZE].numpy()
+
+                         z_np = np.maximum(weights_0.numpy() @ input.numpy()[:, indices] + bias_0.numpy(), 0.0)
+                         z_np = np.maximum(weights_1.numpy() @ z_np + bias_1.numpy(), 0.0)
+                         z_np = np.maximum(weights_2.numpy() @ z_np + bias_2.numpy(), 0.0)
+                         z_np = np.maximum(weights_3.numpy() @ z_np + bias_3.numpy(), 0.0)
+
+                         # test numpy forward
+                         assert_np_equal(output.numpy()[:, indices], z_np, tol=1.0e-2)
+
+                         # torch
+                         input_tc = tc.tensor(input.numpy()[:, indices], requires_grad=True, device=torch_device)
+
+                         weights_0_tc = tc.tensor(weights_0.numpy(), requires_grad=True, device=torch_device)
+                         bias_0_tc = tc.tensor(bias_0.numpy(), requires_grad=True, device=torch_device)
+
+                         weights_1_tc = tc.tensor(weights_1.numpy(), requires_grad=True, device=torch_device)
+                         bias_1_tc = tc.tensor(bias_1.numpy(), requires_grad=True, device=torch_device)
+
+                         weights_2_tc = tc.tensor(weights_2.numpy(), requires_grad=True, device=torch_device)
+                         bias_2_tc = tc.tensor(bias_2.numpy(), requires_grad=True, device=torch_device)
+
+                         weights_3_tc = tc.tensor(weights_3.numpy(), requires_grad=True, device=torch_device)
+                         bias_3_tc = tc.tensor(bias_3.numpy(), requires_grad=True, device=torch_device)
+
+                         z_tc = tc.clamp(weights_0_tc @ input_tc + bias_0_tc, min=0.0)
+                         z_tc = tc.clamp(weights_1_tc @ z_tc + bias_1_tc, min=0.0)
+                         z_tc = tc.clamp(weights_2_tc @ z_tc + bias_2_tc, min=0.0)
+                         z_tc = tc.clamp(weights_3_tc @ z_tc + bias_3_tc, min=0.0)
+
+                         ref_tc = tc.tensor(reference.numpy()[:, indices], requires_grad=True, device=torch_device)
+
+                         l_tc = tc.mean((z_tc - ref_tc) ** 2)
+                         l_tc.backward()
+
+                         # test torch
+                         assert_np_equal(z_tc.cpu().detach().numpy(), output.numpy()[:, indices], tol=1.0e-2)
+                         assert_np_equal(weights_0.grad.numpy(), weights_0_tc.grad.cpu().detach().numpy(), tol=1.0e-2)
+                         assert_np_equal(bias_0.grad.numpy(), bias_0_tc.grad.cpu().detach().numpy(), tol=1.0e-2)
+                         assert_np_equal(weights_1.grad.numpy(), weights_1_tc.grad.cpu().detach().numpy(), tol=1.0e-2)
+                         assert_np_equal(bias_1.grad.numpy(), bias_1_tc.grad.cpu().detach().numpy(), tol=1.0e-2)
+                         assert_np_equal(weights_2.grad.numpy(), weights_2_tc.grad.cpu().detach().numpy(), tol=1.0e-2)
+                         assert_np_equal(bias_2.grad.numpy(), bias_2_tc.grad.cpu().detach().numpy(), tol=1.0e-2)
+                         assert_np_equal(weights_3.grad.numpy(), weights_3_tc.grad.cpu().detach().numpy(), tol=1.0e-2)
+                         assert_np_equal(bias_3.grad.numpy(), bias_3_tc.grad.cpu().detach().numpy(), tol=1.0e-2)
+
+                     optimizer.step(optimizer_grads)
+                     tape.zero()
+
+         # initial loss is ~0.061
+         test.assertLess(loss.numpy()[0], 0.002)
+
+
+ @unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support")
+ def test_single_layer_nn(test, device):
+     import torch as tc
+
+     DIM_IN = 8
+     DIM_HID = 32
+     DIM_OUT = 16
+
+     NUM_BLOCKS = 56
+
+     @wp.func
+     def relu(x: float):
+         return wp.max(x, 0.0)
+
+     @wp.kernel
+     def compute(
+         input: wp.array2d(dtype=float),
+         weights: wp.array2d(dtype=float),
+         bias: wp.array2d(dtype=float),
+         out: wp.array2d(dtype=float),
+     ):
+         i = wp.tid()
+
+         f = wp.tile_load(input, 0, i, m=DIM_IN, n=NUM_THREADS)
+
+         w = wp.tile_load(weights, 0, 0, DIM_OUT, DIM_IN)
+         b = wp.tile_load(bias, 0, 0, m=DIM_OUT, n=1)
+
+         o = wp.tile_map(relu, wp.tile_matmul(w, f) + wp.tile_broadcast(b, m=DIM_OUT, n=NUM_THREADS))
+
+         wp.tile_store(out, 0, i, o)
+
+     with wp.ScopedDevice(device):
+         rng = np.random.default_rng(45)
+
+         # single layer weights, bias
+         weights, bias = create_layer(rng, DIM_IN, DIM_OUT, dtype=float)
+
+         input = create_array(rng, NUM_THREADS * NUM_BLOCKS, DIM_IN)
+         output = create_array(rng, NUM_THREADS * NUM_BLOCKS, DIM_OUT)
+
+         with wp.Tape() as tape:
+             wp.launch_tiled(compute, dim=[NUM_BLOCKS], inputs=[input, weights, bias, output], block_dim=NUM_THREADS)
+
+         output.grad = wp.ones_like(output)
+         tape.backward()
+
+         # numpy
+         output_np = np.maximum(weights.numpy() @ input.numpy() + bias.numpy(), 0.0)
+
+         # test numpy forward
+         assert_np_equal(output.numpy(), output_np, tol=1.0e-2)
+
+         # torch
+         weights_tc = tc.from_numpy(weights.numpy()).requires_grad_(True)  # use .numpy() to avoid any memory aliasing
+         input_tc = tc.from_numpy(input.numpy()).requires_grad_(True)
+         bias_tc = tc.from_numpy(bias.numpy()).requires_grad_(True)
+
+         output_tc = tc.clamp(weights_tc @ input_tc + bias_tc, min=0.0)
+         output_tc.backward(tc.ones_like(output_tc))
+
+         # test torch
+         assert_np_equal(output_tc.detach().numpy(), output.numpy(), tol=1.0e-2)
+         assert_np_equal(input.grad.numpy(), input_tc.grad.detach().numpy(), tol=1.0e-2)
+
+
+ class TestTileMLP(unittest.TestCase):
+     pass
+
+
+ test_devices = get_test_devices()
+
+ try:
+     import torch
+
+     # check which Warp devices work with Torch
+     # CUDA devices may fail if Torch was not compiled with CUDA support
+     torch_compatible_devices = []
+     torch_compatible_cuda_devices = []
+
+     for d in test_devices:
+         try:
+             t = torch.arange(10, device=wp.device_to_torch(d))
+             t += 1
+             torch_compatible_devices.append(d)
+             if d.is_cuda:
+                 torch_compatible_cuda_devices.append(d)
+         except Exception as e:
+             print(f"Skipping Torch tests on device '{d}' due to exception: {e}")
+
+     add_function_test(
+         TestTileMLP,
+         "test_single_layer_nn",
+         test_single_layer_nn,
+         check_output=False,
+         devices=torch_compatible_cuda_devices,
+     )
+     add_function_test(
+         TestTileMLP,
+         "test_multi_layer_nn",
+         test_multi_layer_nn,
+         check_output=False,
+         devices=torch_compatible_cuda_devices,
+     )
+
+ except Exception as e:
+     print(f"Skipping Torch tests due to exception: {e}")
+
+
+ if __name__ == "__main__":
+     wp.clear_kernel_cache()
+     unittest.main(verbosity=2, failfast=True)