PyPI - warp-lang - Versions diffs - 1.6.0__py3-none-manylinux2014_aarch64.whl → 1.6.1__py3-none-manylinux2014_aarch64.whl - Mend

warp-lang 1.6.0__py3-none-manylinux2014_aarch64.whl → 1.6.1__py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (37)

warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/builtins.py +1 -1
warp/codegen.py +10 -3
warp/config.py +65 -21
warp/context.py +202 -65
warp/examples/core/example_marching_cubes.py +1 -1
warp/examples/core/example_mesh.py +1 -1
warp/examples/core/example_wave.py +1 -1
warp/examples/sim/example_cloth_self_contact.py +81 -27
warp/examples/tile/example_tile_nbody.py +26 -15
warp/native/clang/clang.cpp +1 -1
warp/native/crt.h +1 -0
warp/native/mat.h +16 -3
warp/native/tile.h +12 -8
warp/render/render_opengl.py +23 -15
warp/render/render_usd.py +10 -2
warp/sim/collide.py +29 -16
warp/sim/import_urdf.py +20 -5
warp/sim/integrator_featherstone.py +4 -11
warp/sim/model.py +62 -59
warp/sim/render.py +2 -2
warp/stubs.py +1 -1
warp/tests/test_array.py +26 -0
warp/tests/test_collision.py +6 -6
warp/tests/test_examples.py +7 -1
warp/tests/test_launch.py +77 -26
warp/tests/test_mat.py +75 -1
warp/tests/test_overwrite.py +4 -3
warp/tests/test_tile_load.py +44 -1
warp/thirdparty/unittest_parallel.py +3 -0
warp/types.py +66 -68
{warp_lang-1.6.0.dist-info → warp_lang-1.6.1.dist-info}/METADATA +34 -17
{warp_lang-1.6.0.dist-info → warp_lang-1.6.1.dist-info}/RECORD +37 -37
{warp_lang-1.6.0.dist-info → warp_lang-1.6.1.dist-info}/WHEEL +1 -1
{warp_lang-1.6.0.dist-info → warp_lang-1.6.1.dist-info}/LICENSE.md +0 -0
{warp_lang-1.6.0.dist-info → warp_lang-1.6.1.dist-info}/top_level.txt +0 -0

warp/tests/test_launch.py CHANGED Viewed

@@ -46,6 +46,12 @@ def kernel4d(a: wp.array(dtype=int, ndim=4)):
     wp.expect_eq(a[i, j, k, l], i * dim_y * dim_z * dim_w + j * dim_z * dim_w + k * dim_w + l)
+@wp.kernel
+def square_kernel(input: wp.array(dtype=float), output: wp.array(dtype=float)):
+    i = wp.tid()
+    output[i] = input[i] * input[i]
 def test1d(test, device):
     a = np.arange(0, dim_x).reshape(dim_x)
@@ -98,8 +104,19 @@ def kernel_cmd(params: Params, i: int, f: float, v: wp.vec3, m: wp.mat33, out: w
 def test_launch_cmd(test, device):
+    """Tests recording and executing a kernel launch command.
+    Verifies that:
+    - A kernel can be recorded as a command without immediate execution
+    - The recorded command can be launched later
+    - Parameters are correctly passed to the kernel
+    - Output matches expected results for both immediate and delayed launches
+    Args:
+        test: Test context
+        device: Device to run the test on
+    """
     n = 1
     ref = np.arange(0, n)
     out = wp.zeros(n, dtype=int, device=device)
@@ -274,12 +291,62 @@ def test_launch_cmd_empty(test, device):
     assert_np_equal(out.numpy(), ref)
+def test_launch_cmd_adjoint(test, device):
+    """Test recording an adjoint launch with record_cmd=True."""
+    input_arr = wp.array([1.0, 2.0, 3.0], dtype=float, requires_grad=True, device=device)
+    output_arr = wp.empty_like(input_arr)
+    output_arr.grad.fill_(1.0)
+    cmd = wp.launch(
+        square_kernel,
+        dim=input_arr.size,
+        inputs=[input_arr, output_arr],
+        adj_inputs=[None, None],
+        adjoint=True,
+        device=device,
+        record_cmd=True,
+    )
+    cmd.launch()
+    assert_np_equal(input_arr.grad.numpy(), np.array([2.0, 4.0, 6.0]))
+def test_launch_cmd_adjoint_empty(test, device):
+    """Test constructing a Launch object for an adjoint kernel."""
+    input_arr = wp.array([1.0, 2.0, 3.0], dtype=float, requires_grad=True, device=device)
+    output_arr = wp.empty_like(input_arr)
+    output_arr.grad.fill_(1.0)
+    cmd = wp.Launch(square_kernel, device, adjoint=True)
+    cmd.set_param_by_name("input", input_arr)
+    cmd.set_param_by_name("output", output_arr)
+    cmd.set_dim(input_arr.size)
+    cmd.launch()
+    assert_np_equal(input_arr.grad.numpy(), np.array([2.0, 4.0, 6.0]))
+    # Now update the launch object's parameters with arrays of different sizes and values
+    # and check that the adjoints are correctly computed
+    input_arr_updated = wp.array([4.0, 5.0, 6.0, 7.0], dtype=float, device=device)
+    input_arr_updated_grad = wp.zeros_like(input_arr_updated)
+    output_arr_updated = wp.empty_like(input_arr_updated)
+    output_arr_updated_grad = wp.full_like(output_arr_updated, 1.0)
+    cmd.set_param_by_name("input", input_arr_updated)
+    cmd.set_param_by_name("output", output_arr_updated)
+    cmd.set_param_by_name("input", input_arr_updated_grad, adjoint=True)
+    cmd.set_param_by_name("output", output_arr_updated_grad, adjoint=True)
+    cmd.set_dim(input_arr_updated.size)
+    cmd.launch()
+    assert_np_equal(input_arr_updated_grad.numpy(), np.array([8.0, 10.0, 12.0, 14.0]))
 @wp.kernel
-def kernel_mul(
-    values: wp.array(dtype=int),
-    coeff: int,
-    out: wp.array(dtype=int),
-):
+def kernel_mul(values: wp.array(dtype=int), coeff: int, out: wp.array(dtype=int)):
     tid = wp.tid()
     out[tid] = values[tid] * coeff
@@ -301,28 +368,10 @@ def test_launch_tuple_args(test, device):
     )
     assert_np_equal(out.numpy(), np.array((0, 3, 6, 9)))
-    wp.launch(
-        kernel_mul,
-        dim=len(values),
-        inputs=(
-            values,
-            coeff,
-            out,
-        ),
-        device=device,
-    )
+    wp.launch(kernel_mul, dim=len(values), inputs=(values, coeff, out), device=device)
     assert_np_equal(out.numpy(), np.array((0, 3, 6, 9)))
-    wp.launch(
-        kernel_mul,
-        dim=len(values),
-        outputs=(
-            values,
-            coeff,
-            out,
-        ),
-        device=device,
-    )
+    wp.launch(kernel_mul, dim=len(values), outputs=(values, coeff, out), device=device)
     assert_np_equal(out.numpy(), np.array((0, 3, 6, 9)))
@@ -343,6 +392,8 @@ add_function_test(TestLaunch, "test_launch_cmd_set_param", test_launch_cmd_set_p
 add_function_test(TestLaunch, "test_launch_cmd_set_ctype", test_launch_cmd_set_ctype, devices=devices)
 add_function_test(TestLaunch, "test_launch_cmd_set_dim", test_launch_cmd_set_dim, devices=devices)
 add_function_test(TestLaunch, "test_launch_cmd_empty", test_launch_cmd_empty, devices=devices)
+add_function_test(TestLaunch, "test_launch_cmd_adjoint", test_launch_cmd_adjoint, devices=devices)
+add_function_test(TestLaunch, "test_launch_cmd_adjoint_empty", test_launch_cmd_adjoint_empty, devices=devices)
 add_function_test(TestLaunch, "test_launch_tuple_args", test_launch_tuple_args, devices=devices)

warp/tests/test_mat.py CHANGED Viewed

@@ -384,6 +384,77 @@ def test_negation(test, device, dtype, register_kernels=False):
                     idx = idx + 1
+def test_matmul(test, device, dtype, register_kernels=False):
+    rng = np.random.default_rng(123)
+    tol = {
+        np.float16: 5.0e-3,
+        np.float32: 1.0e-6,
+        np.float64: 1.0e-12,
+    }.get(dtype, 0)
+    wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
+    mat22 = wp.types.matrix(shape=(2, 2), dtype=wptype)
+    mat33 = wp.types.matrix(shape=(3, 3), dtype=wptype)
+    mat23 = wp.types.matrix(shape=(2, 3), dtype=wptype)
+    mat32 = wp.types.matrix(shape=(3, 2), dtype=wptype)
+    mat44 = wp.types.matrix(shape=(4, 4), dtype=wptype)
+    output_select_kernel = get_select_kernel(wptype)
+    def check_mat_mul(
+        i23: wp.array(dtype=mat23),
+        i32: wp.array(dtype=mat32),
+        i44: wp.array(dtype=mat44),
+        o22: wp.array(dtype=mat22),
+        o33: wp.array(dtype=mat33),
+        o44: wp.array(dtype=mat44),
+    ):
+        i = wp.tid()
+        o22[i] = i23[i] @ i32[i]
+        o33[i] = i32[i] @ i23[i]
+        o44[i] = i44[i] @ i44[i]
+    kernel = getkernel(check_mat_mul, suffix=dtype.__name__)
+    if register_kernels:
+        return
+    test_adj = dtype in np_float_types
+    i23 = wp.array(randvals(rng, [1, 2, 3], dtype), dtype=mat23, requires_grad=test_adj, device=device)
+    i32 = wp.array(randvals(rng, [1, 3, 2], dtype), dtype=mat32, requires_grad=test_adj, device=device)
+    i44 = wp.array(randvals(rng, [1, 4, 4], dtype), dtype=mat44, requires_grad=test_adj, device=device)
+    o22 = wp.array(randvals(rng, [1, 2, 2], dtype), dtype=mat22, requires_grad=test_adj, device=device)
+    o33 = wp.array(randvals(rng, [1, 3, 3], dtype), dtype=mat33, requires_grad=test_adj, device=device)
+    o44 = wp.array(randvals(rng, [1, 4, 4], dtype), dtype=mat44, requires_grad=test_adj, device=device)
+    tape = wp.Tape()
+    with tape:
+        wp.launch(
+            kernel,
+            dim=1,
+            inputs=[i23, i32, i44],
+            outputs=[o22, o33, o44],
+            device=device,
+        )
+    assert_np_equal(o22.numpy(), i23.numpy() @ i32.numpy(), tol=tol)
+    assert_np_equal(o33.numpy(), i32.numpy() @ i23.numpy(), tol=tol)
+    assert_np_equal(o44.numpy(), i44.numpy() @ i44.numpy(), tol=tol)
+    if test_adj:
+        o22.grad.assign([np.eye(2)])
+        o33.grad.assign([np.eye(3)])
+        o44.grad.assign([np.eye(4)])
+        tape.backward()
+        assert_np_equal(i23.grad.numpy(), 2.0 * i32.numpy().T, tol=tol)
+        assert_np_equal(i32.grad.numpy(), 2.0 * i23.numpy().T, tol=tol)
+        assert_np_equal(i44.grad.numpy(), 2.0 * i44.numpy().T, tol=tol)
 def test_subtraction(test, device, dtype, register_kernels=False):
     rng = np.random.default_rng(123)
@@ -874,7 +945,7 @@ def test_svd(test, device, dtype, register_kernels=False):
     tol = {
         np.float16: 1.0e-3,
         np.float32: 1.0e-6,
-        np.float64: 1.0e-6,
+        np.float64: 1.0e-12,
     }.get(dtype, 0)
     wptype = wp.types.np_dtype_to_warp_type[np.dtype(dtype)]
@@ -1765,6 +1836,9 @@ for dtype in np_signed_int_types + np_float_types:
     add_function_test_register_kernel(
         TestMat, f"test_subtraction_{dtype.__name__}", test_subtraction, devices=devices, dtype=dtype
     )
+    add_function_test_register_kernel(
+        TestMat, f"test_matmul_{dtype.__name__}", test_matmul, devices=devices, dtype=dtype
+    )
 add_function_test(
     TestMat,

warp/tests/test_overwrite.py CHANGED Viewed

@@ -1,6 +1,7 @@
 import contextlib
 import io
 import unittest
+from typing import Any
 import numpy as np
@@ -164,18 +165,18 @@ def test_kernel_writeread_kernel_write(test, device):
 @wp.func
-def read_func(a: wp.array(dtype=float), idx: int):
+def read_func(a: wp.array(dtype=Any), idx: int):
     x = a[idx]
     return x
 @wp.func
-def read_return_func(b: wp.array(dtype=float), idx: int):
+def read_return_func(b: wp.array(dtype=Any), idx: int):
     return 1.0, b[idx]
 @wp.func
-def write_func(c: wp.array(dtype=float), idx: int):
+def write_func(c: wp.array(dtype=Any), idx: int):
     c[idx] = 1.0

warp/tests/test_tile_load.py CHANGED Viewed

@@ -133,6 +133,49 @@ def test_tile_load(kernel, ndim):
     return test
+@wp.kernel
+def tile_load_unaligned_kernel(
+    input: wp.array2d(dtype=float),
+    output: wp.array2d(dtype=float),
+):
+    t = wp.tile_load(input, shape=(TILE_M, TILE_N), offset=(1, 1), storage="shared")
+    wp.tile_store(output, t, offset=(1, 1))
+def test_tile_load_unaligned(test, device):
+    rng = np.random.default_rng(42)
+    shape = [TILE_M + 1, TILE_N + 1]
+    input = wp.array(rng.random(shape), dtype=float, requires_grad=True, device=device)
+    output = wp.zeros(shape, dtype=float, device=device)
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_load_unaligned_kernel,
+            dim=[1],
+            inputs=[input, output],
+            block_dim=TILE_DIM,
+            device=device,
+        )
+    # first row and column should be zero
+    assert_np_equal(output.numpy()[0, :], np.zeros(TILE_N + 1))
+    assert_np_equal(output.numpy()[:, 0], np.zeros(TILE_M + 1))
+    # check output elements
+    assert_np_equal(output.numpy()[1:, 1:], input.numpy()[1:, 1:])
+    output.grad = wp.ones_like(output)
+    tape.backward()
+    expected_grad = np.ones_like(input.grad.numpy())
+    expected_grad[0, :] = 0.0
+    expected_grad[:, 0] = 0.0
+    assert_np_equal(input.grad.numpy(), expected_grad)
 # ----------------------------------------------------------------------------------------
 TILE_SIZE = 4
@@ -336,7 +379,7 @@ add_function_test(TestTileLoad, "test_tile_load_1d", test_tile_load(tile_load_1d
 add_function_test(TestTileLoad, "test_tile_load_2d", test_tile_load(tile_load_2d_kernel, 2), devices=devices)
 add_function_test(TestTileLoad, "test_tile_load_3d", test_tile_load(tile_load_3d_kernel, 3), devices=devices)
 add_function_test(TestTileLoad, "test_tile_load_4d", test_tile_load(tile_load_4d_kernel, 4), devices=devices)
+add_function_test(TestTileLoad, "test_tile_load_unaligned", test_tile_load_unaligned, devices=devices)
 add_function_test(TestTileLoad, "test_tile_extract_1d", test_tile_extract(tile_extract_1d_kernel, 1), devices=devices)
 add_function_test(TestTileLoad, "test_tile_extract_2d", test_tile_extract(tile_extract_2d_kernel, 2), devices=devices)

warp/thirdparty/unittest_parallel.py CHANGED Viewed

@@ -554,6 +554,9 @@ def initialize_test_process(lock, shared_index, args, temp_dir):
             wp.config.kernel_cache_dir = cache_root_dir
             wp.build.clear_kernel_cache()
+        elif "WARP_CACHE_ROOT" in os.environ:
+            # Using a shared cache for all test processes
+            wp.config.kernel_cache_dir = os.path.join(os.getenv("WARP_CACHE_ROOT"), wp.config.version)
 if __name__ == "__main__":  # pragma: no cover

warp/types.py CHANGED Viewed

@@ -1768,77 +1768,75 @@ class array(Array):
             dtype_ndim = 0
             scalar_dtype = dtype
-        if hasattr(data, "__cuda_array_interface__"):
-            try:
-                # Performance note: try first, ask questions later
-                device = warp.context.runtime.get_device(device)
-            except Exception:
-                # Fallback to using the public API for retrieving the device,
-                # which takes take of initializing Warp if needed.
-                device = warp.context.get_device(device)
-            if device.is_cuda:
-                desc = data.__cuda_array_interface__
-                data_shape = desc.get("shape")
-                data_strides = desc.get("strides")
-                data_dtype = np.dtype(desc.get("typestr"))
-                data_ptr = desc.get("data")[0]
-                if dtype == Any:
-                    dtype = np_dtype_to_warp_type[data_dtype]
-                data_ndim = len(data_shape)
-                # determine whether the input needs reshaping
-                target_npshape = None
-                if shape is not None:
-                    target_npshape = (*shape, *dtype_shape)
-                elif dtype_ndim > 0:
-                    # prune inner dimensions of length 1
-                    while data_ndim > 1 and data_shape[-1] == 1:
-                        data_shape = data_shape[:-1]
-                    # if the inner dims don't match exactly, check if the innermost dim is a multiple of type length
-                    if data_ndim < dtype_ndim or data_shape[-dtype_ndim:] != dtype_shape:
-                        if data_shape[-1] == dtype._length_:
-                            target_npshape = (*data_shape[:-1], *dtype_shape)
-                        elif data_shape[-1] % dtype._length_ == 0:
-                            target_npshape = (*data_shape[:-1], data_shape[-1] // dtype._length_, *dtype_shape)
+        try:
+            # Performance note: try first, ask questions later
+            device = warp.context.runtime.get_device(device)
+        except Exception:
+            # Fallback to using the public API for retrieving the device,
+            # which takes take of initializing Warp if needed.
+            device = warp.context.get_device(device)
+        if device.is_cuda and hasattr(data, "__cuda_array_interface__"):
+            desc = data.__cuda_array_interface__
+            data_shape = desc.get("shape")
+            data_strides = desc.get("strides")
+            data_dtype = np.dtype(desc.get("typestr"))
+            data_ptr = desc.get("data")[0]
+            if dtype == Any:
+                dtype = np_dtype_to_warp_type[data_dtype]
+            if data_strides is None:
+                data_strides = strides_from_shape(data_shape, dtype)
+            data_ndim = len(data_shape)
+            # determine whether the input needs reshaping
+            target_npshape = None
+            if shape is not None:
+                target_npshape = (*shape, *dtype_shape)
+            elif dtype_ndim > 0:
+                # prune inner dimensions of length 1
+                while data_ndim > 1 and data_shape[-1] == 1:
+                    data_shape = data_shape[:-1]
+                # if the inner dims don't match exactly, check if the innermost dim is a multiple of type length
+                if data_ndim < dtype_ndim or data_shape[-dtype_ndim:] != dtype_shape:
+                    if data_shape[-1] == dtype._length_:
+                        target_npshape = (*data_shape[:-1], *dtype_shape)
+                    elif data_shape[-1] % dtype._length_ == 0:
+                        target_npshape = (*data_shape[:-1], data_shape[-1] // dtype._length_, *dtype_shape)
+                    else:
+                        if dtype_ndim == 1:
+                            raise RuntimeError(
+                                f"The inner dimensions of the input data are not compatible with the requested vector type {warp.context.type_str(dtype)}: expected an inner dimension that is a multiple of {dtype._length_}"
+                            )
                         else:
-                            if dtype_ndim == 1:
-                                raise RuntimeError(
-                                    f"The inner dimensions of the input data are not compatible with the requested vector type {warp.context.type_str(dtype)}: expected an inner dimension that is a multiple of {dtype._length_}"
-                                )
-                            else:
-                                raise RuntimeError(
-                                    f"The inner dimensions of the input data are not compatible with the requested matrix type {warp.context.type_str(dtype)}: expected inner dimensions {dtype._shape_} or a multiple of {dtype._length_}"
-                                )
-                if target_npshape is None:
-                    target_npshape = data_shape if shape is None else shape
-                # determine final shape and strides
-                if dtype_ndim > 0:
-                    # make sure the inner dims are contiguous for vector/matrix types
-                    scalar_size = type_size_in_bytes(dtype._wp_scalar_type_)
-                    inner_contiguous = data_strides[-1] == scalar_size
-                    if inner_contiguous and dtype_ndim > 1:
-                        inner_contiguous = data_strides[-2] == scalar_size * dtype_shape[-1]
-                    shape = target_npshape[:-dtype_ndim] or (1,)
-                    strides = data_strides if shape == data_shape else strides_from_shape(shape, dtype)
-                else:
-                    shape = target_npshape or (1,)
-                    strides = data_strides if shape == data_shape else strides_from_shape(shape, dtype)
+                            raise RuntimeError(
+                                f"The inner dimensions of the input data are not compatible with the requested matrix type {warp.context.type_str(dtype)}: expected inner dimensions {dtype._shape_} or a multiple of {dtype._length_}"
+                            )
+            if target_npshape is None:
+                target_npshape = data_shape if shape is None else shape
+            # determine final shape and strides
+            if dtype_ndim > 0:
+                # make sure the inner dims are contiguous for vector/matrix types
+                scalar_size = type_size_in_bytes(dtype._wp_scalar_type_)
+                inner_contiguous = data_strides[-1] == scalar_size
+                if inner_contiguous and dtype_ndim > 1:
+                    inner_contiguous = data_strides[-2] == scalar_size * dtype_shape[-1]
+                shape = target_npshape[:-dtype_ndim] or (1,)
+                strides = data_strides if shape == data_shape else strides_from_shape(shape, dtype)
+            else:
+                shape = target_npshape or (1,)
+                strides = data_strides if shape == data_shape else strides_from_shape(shape, dtype)
-                self._init_from_ptr(data_ptr, dtype, shape, strides, None, device, False, None)
+            self._init_from_ptr(data_ptr, dtype, shape, strides, None, device, False, None)
-                # keep a ref to the source data to keep allocation alive
-                self._ref = data
-                return
-            else:
-                raise RuntimeError(
-                    f"Trying to construct a Warp array from data argument's __cuda_array_interface__ but {device} is not CUDA-capable"
-                )
+            # keep a ref to the source data to keep allocation alive
+            self._ref = data
+            return
         # convert input data to ndarray (handles lists, tuples, etc.) and determine dtype
         if dtype == Any:

{warp_lang-1.6.0.dist-info → warp_lang-1.6.1.dist-info}/METADATA RENAMED Viewed

@@ -1,6 +1,6 @@
 Metadata-Version: 2.2
 Name: warp-lang
-Version: 1.6.0
+Version: 1.6.1
 Summary: A Python framework for high-performance simulation and graphics programming
 Author-email: NVIDIA Corporation <mmacklin@nvidia.com>
 License: NVIDIA Software License
@@ -78,12 +78,24 @@ the `pip install` command, e.g.
 | Platform        | Install Command                                                                                                               |
 | --------------- | ----------------------------------------------------------------------------------------------------------------------------- |
-| Linux aarch64   | `pip install https://github.com/NVIDIA/warp/releases/download/v1.6.0/warp_lang-1.6.0+cu11-py3-none-manylinux2014_aarch64.whl` |
-| Linux x86-64    | `pip install https://github.com/NVIDIA/warp/releases/download/v1.6.0/warp_lang-1.6.0+cu11-py3-none-manylinux2014_x86_64.whl`  |
-| Windows x86-64  | `pip install https://github.com/NVIDIA/warp/releases/download/v1.6.0/warp_lang-1.6.0+cu11-py3-none-win_amd64.whl`             |
+| Linux aarch64   | `pip install https://github.com/NVIDIA/warp/releases/download/v1.6.1/warp_lang-1.6.1+cu11-py3-none-manylinux2014_aarch64.whl` |
+| Linux x86-64    | `pip install https://github.com/NVIDIA/warp/releases/download/v1.6.1/warp_lang-1.6.1+cu11-py3-none-manylinux2014_x86_64.whl`  |
+| Windows x86-64  | `pip install https://github.com/NVIDIA/warp/releases/download/v1.6.1/warp_lang-1.6.1+cu11-py3-none-win_amd64.whl`             |
 The `--force-reinstall` option may need to be used to overwrite a previous installation.
+### Nightly Builds
+Nightly builds of Warp from the `main` branch are available on the [NVIDIA Package Index](https://pypi.nvidia.com/warp-lang/).
+To install the latest nightly build, use the following command:
+```text
+pip install -U --pre warp-lang --extra-index-url=https://pypi.nvidia.com/
+```
+Note that the nightly builds are built with the CUDA 12 runtime and are not published for macOS.
 ### CUDA Requirements
 * Warp packages built with CUDA Toolkit 11.x require NVIDIA driver 470 or newer.
@@ -300,25 +312,13 @@ python -m warp.tests
             <td><a href="https://github.com/NVIDIA/warp/tree/main/warp/examples/optim/example_inverse_kinematics.py"><img src="https://media.githubusercontent.com/media/NVIDIA/warp/refs/heads/main/docs/img/examples/optim_inverse_kinematics.png"></a></td>
             <td><a href="https://github.com/NVIDIA/warp/tree/main/warp/examples/optim/example_spring_cage.py"><img src="https://media.githubusercontent.com/media/NVIDIA/warp/refs/heads/main/docs/img/examples/optim_spring_cage.png"></a></td>
             <td><a href="https://github.com/NVIDIA/warp/tree/main/warp/examples/optim/example_trajectory.py"><img src="https://media.githubusercontent.com/media/NVIDIA/warp/refs/heads/main/docs/img/examples/optim_trajectory.png"></a></td>
-            <td><a href="https://github.com/NVIDIA/warp/tree/main/warp/examples/optim/example_walker.py"><img src="https://media.githubusercontent.com/media/NVIDIA/warp/refs/heads/main/docs/img/examples/optim_walker.png"></a></td>
+            <td><a href="https://github.com/NVIDIA/warp/tree/main/warp/examples/optim/example_softbody_properties.py"><img src="https://media.githubusercontent.com/media/NVIDIA/warp/refs/heads/main/docs/img/examples/optim_softbody_properties.png"></a></td>
         </tr>
         <tr>
             <td align="center">inverse kinematics</td>
             <td align="center">spring cage</td>
             <td align="center">trajectory</td>
-            <td align="center">walker</td>
-        </tr>
-        <tr>
-            <td><a href="https://github.com/NVIDIA/warp/tree/main/warp/examples/optim/example_softbody_properties.py"><img src="https://media.githubusercontent.com/media/NVIDIA/warp/refs/heads/main/docs/img/examples/optim_softbody_properties.png"></a></td>
-            <td></td>
-            <td></td>
-            <td></td>
-        </tr>
-        <tr>
             <td align="center">soft body properties</td>
-            <td align="center"></td>
-            <td align="center"></td>
-            <td align="center"></td>
         </tr>
     </tbody>
 </table>
@@ -378,6 +378,23 @@ python -m warp.tests
     </tbody>
 </table>
+### warp/examples/tile
+<table>
+    <tbody>
+        <tr>
+            <td><a href="https://github.com/NVIDIA/warp/tree/main/warp/examples/tile/example_tile_mlp.py"><img src="https://media.githubusercontent.com/media/NVIDIA/warp/refs/heads/main/docs/img/examples/tile_mlp.png"></a></td>
+            <td><a href="https://github.com/NVIDIA/warp/tree/main/warp/examples/tile/example_tile_nbody.py"><img src="https://media.githubusercontent.com/media/NVIDIA/warp/refs/heads/main/docs/img/examples/tile_nbody.png"></a></td>
+            <td><a href="https://github.com/NVIDIA/warp/tree/main/warp/examples/tile/example_tile_walker.py"><img src="https://media.githubusercontent.com/media/NVIDIA/warp/refs/heads/main/docs/img/examples/tile_walker.png"></a></td>
+        </tr>
+        <tr>
+            <td align="center">mlp</td>
+            <td align="center">nbody</td>
+            <td align="center">walker</td>
+        </tr>
+    </tbody>
+</table>
 ## Building
 For developers who want to build the library themselves, the following tools are required: