PyPI - warp-lang - Versions diffs - 1.8.0__py3-none-macosx_10_13_universal2.whl → 1.9.0__py3-none-macosx_10_13_universal2.whl - Mend

warp-lang 1.8.0__py3-none-macosx_10_13_universal2.whl → 1.9.0__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (153) hide show

warp/__init__.py +282 -103
warp/__init__.pyi +482 -110
warp/bin/libwarp-clang.dylib +0 -0
warp/bin/libwarp.dylib +0 -0
warp/build.py +93 -30
warp/build_dll.py +48 -63
warp/builtins.py +955 -137
warp/codegen.py +327 -209
warp/config.py +1 -1
warp/context.py +1363 -800
warp/examples/core/example_marching_cubes.py +1 -0
warp/examples/core/example_render_opengl.py +100 -3
warp/examples/fem/example_apic_fluid.py +98 -52
warp/examples/fem/example_convection_diffusion_dg.py +25 -4
warp/examples/fem/example_diffusion_mgpu.py +8 -3
warp/examples/fem/utils.py +68 -22
warp/examples/interop/example_jax_callable.py +34 -4
warp/examples/interop/example_jax_kernel.py +27 -1
warp/fabric.py +1 -1
warp/fem/cache.py +27 -19
warp/fem/domain.py +2 -2
warp/fem/field/nodal_field.py +2 -2
warp/fem/field/virtual.py +266 -166
warp/fem/geometry/geometry.py +5 -5
warp/fem/integrate.py +200 -91
warp/fem/space/restriction.py +4 -0
warp/fem/space/shape/tet_shape_function.py +3 -10
warp/jax_experimental/custom_call.py +1 -1
warp/jax_experimental/ffi.py +203 -54
warp/marching_cubes.py +708 -0
warp/native/array.h +103 -8
warp/native/builtin.h +90 -9
warp/native/bvh.cpp +64 -28
warp/native/bvh.cu +58 -58
warp/native/bvh.h +2 -2
warp/native/clang/clang.cpp +7 -7
warp/native/coloring.cpp +13 -3
warp/native/crt.cpp +2 -2
warp/native/crt.h +3 -5
warp/native/cuda_util.cpp +42 -11
warp/native/cuda_util.h +10 -4
warp/native/exports.h +1842 -1908
warp/native/fabric.h +2 -1
warp/native/hashgrid.cpp +37 -37
warp/native/hashgrid.cu +2 -2
warp/native/initializer_array.h +1 -1
warp/native/intersect.h +4 -4
warp/native/mat.h +1913 -119
warp/native/mathdx.cpp +43 -43
warp/native/mesh.cpp +24 -24
warp/native/mesh.cu +26 -26
warp/native/mesh.h +5 -3
warp/native/nanovdb/GridHandle.h +179 -12
warp/native/nanovdb/HostBuffer.h +8 -7
warp/native/nanovdb/NanoVDB.h +517 -895
warp/native/nanovdb/NodeManager.h +323 -0
warp/native/nanovdb/PNanoVDB.h +2 -2
warp/native/quat.h +337 -16
warp/native/rand.h +7 -7
warp/native/range.h +7 -1
warp/native/reduce.cpp +10 -10
warp/native/reduce.cu +13 -14
warp/native/runlength_encode.cpp +2 -2
warp/native/runlength_encode.cu +5 -5
warp/native/scan.cpp +3 -3
warp/native/scan.cu +4 -4
warp/native/sort.cpp +10 -10
warp/native/sort.cu +22 -22
warp/native/sparse.cpp +8 -8
warp/native/sparse.cu +14 -14
warp/native/spatial.h +366 -17
warp/native/svd.h +23 -8
warp/native/temp_buffer.h +2 -2
warp/native/tile.h +303 -70
warp/native/tile_radix_sort.h +5 -1
warp/native/tile_reduce.h +16 -25
warp/native/tuple.h +2 -2
warp/native/vec.h +385 -18
warp/native/volume.cpp +54 -54
warp/native/volume.cu +1 -1
warp/native/volume.h +2 -1
warp/native/volume_builder.cu +30 -37
warp/native/warp.cpp +150 -149
warp/native/warp.cu +337 -193
warp/native/warp.h +227 -226
warp/optim/linear.py +736 -271
warp/render/imgui_manager.py +289 -0
warp/render/render_opengl.py +137 -57
warp/render/render_usd.py +0 -1
warp/sim/collide.py +1 -2
warp/sim/graph_coloring.py +2 -2
warp/sim/integrator_vbd.py +10 -2
warp/sparse.py +559 -176
warp/tape.py +2 -0
warp/tests/aux_test_module_aot.py +7 -0
warp/tests/cuda/test_async.py +3 -3
warp/tests/cuda/test_conditional_captures.py +101 -0
warp/tests/geometry/test_marching_cubes.py +233 -12
warp/tests/sim/test_cloth.py +89 -6
warp/tests/sim/test_coloring.py +82 -7
warp/tests/test_array.py +56 -5
warp/tests/test_assert.py +53 -0
warp/tests/test_atomic_cas.py +127 -114
warp/tests/test_codegen.py +3 -2
warp/tests/test_context.py +8 -15
warp/tests/test_enum.py +136 -0
warp/tests/test_examples.py +2 -2
warp/tests/test_fem.py +45 -2
warp/tests/test_fixedarray.py +229 -0
warp/tests/test_func.py +18 -15
warp/tests/test_future_annotations.py +7 -5
warp/tests/test_linear_solvers.py +30 -0
warp/tests/test_map.py +1 -1
warp/tests/test_mat.py +1540 -378
warp/tests/test_mat_assign_copy.py +178 -0
warp/tests/test_mat_constructors.py +574 -0
warp/tests/test_module_aot.py +287 -0
warp/tests/test_print.py +69 -0
warp/tests/test_quat.py +162 -34
warp/tests/test_quat_assign_copy.py +145 -0
warp/tests/test_reload.py +2 -1
warp/tests/test_sparse.py +103 -0
warp/tests/test_spatial.py +140 -34
warp/tests/test_spatial_assign_copy.py +160 -0
warp/tests/test_static.py +48 -0
warp/tests/test_struct.py +43 -3
warp/tests/test_tape.py +38 -0
warp/tests/test_types.py +0 -20
warp/tests/test_vec.py +216 -441
warp/tests/test_vec_assign_copy.py +143 -0
warp/tests/test_vec_constructors.py +325 -0
warp/tests/tile/test_tile.py +206 -152
warp/tests/tile/test_tile_cholesky.py +605 -0
warp/tests/tile/test_tile_load.py +169 -0
warp/tests/tile/test_tile_mathdx.py +2 -558
warp/tests/tile/test_tile_matmul.py +179 -0
warp/tests/tile/test_tile_mlp.py +1 -1
warp/tests/tile/test_tile_reduce.py +100 -11
warp/tests/tile/test_tile_shared_memory.py +16 -16
warp/tests/tile/test_tile_sort.py +59 -55
warp/tests/unittest_suites.py +16 -0
warp/tests/walkthrough_debug.py +1 -1
warp/thirdparty/unittest_parallel.py +108 -9
warp/types.py +554 -264
warp/utils.py +68 -86
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
warp/native/marching.cpp +0 -19
warp/native/marching.cu +0 -514
warp/native/marching.h +0 -19
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
{warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0

warp/tests/tile/test_tile_cholesky.py ADDED Viewed

@@ -0,0 +1,605 @@
+# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+import warp as wp
+from warp.tests.unittest_utils import *
+wp.init()  # Initialize Warp at import time; needed for the wp.context.runtime.core.wp_is_mathdx_enabled() checks below
+TILE_M = wp.constant(8)  # edge length of the square tiles loaded by the kernels in this file
+TILE_N = wp.constant(4)  # NOTE(review): not referenced anywhere in the visible part of this file
+TILE_K = wp.constant(8)  # NOTE(review): not referenced anywhere in the visible part of this file
+# Number of threads per tile; passed as block_dim to every wp.launch_tiled() call in this file
+TILE_DIM = 32
+@wp.kernel()
+def tile_math_cholesky(
+    gA: wp.array2d(dtype=wp.float64),
+    gD: wp.array1d(dtype=wp.float64),
+    gL: wp.array2d(dtype=wp.float64),
+    gy: wp.array1d(dtype=wp.float64),
+    gx: wp.array1d(dtype=wp.float64),
+):
+    """Factor A^T + diag(D) with tile_cholesky() and solve L L^T x = y.
+
+    Reads gA, gD and gy; writes the Cholesky factor to gL and the solution to gx.
+    """
+    i, j = wp.tid()  # NOTE(review): i and j are not used in this kernel
+    # Load A, D & y
+    a = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
+    d = wp.tile_load(gD, shape=TILE_M, storage="shared")
+    y = wp.tile_load(gy, shape=TILE_M, storage="shared")
+    # Ensure tile_diag_add() and tile_cholesky_solve() work with transposed matrices
+    a_t = wp.tile_transpose(a)
+    # Compute L st LL^T = A^T + diag(D)
+    b = wp.tile_diag_add(a_t, d)
+    l = wp.tile_cholesky(b)
+    # Solve for x in LL^T x = y
+    x = wp.tile_cholesky_solve(l, y)
+    # Store L & x
+    wp.tile_store(gL, l)
+    wp.tile_store(gx, x)
+def test_tile_cholesky_cholesky(test, device):
+    A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
+    D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
+    L_h = np.zeros_like(A_h)
+    Y_h = np.arange(TILE_M, dtype=np.float64)
+    X_h = np.zeros_like(Y_h)
+    A_np = A_h.T + np.diag(D_h)
+    L_np = np.linalg.cholesky(A_np)
+    X_np = np.linalg.solve(A_np, Y_h)
+    A_wp = wp.array(A_h, requires_grad=True, dtype=wp.float64, device=device)
+    D_wp = wp.array(D_h, requires_grad=True, dtype=wp.float64, device=device)
+    L_wp = wp.array(L_h, requires_grad=True, dtype=wp.float64, device=device)
+    Y_wp = wp.array(Y_h, requires_grad=True, dtype=wp.float64, device=device)
+    X_wp = wp.array(X_h, requires_grad=True, dtype=wp.float64, device=device)
+    wp.launch_tiled(
+        tile_math_cholesky, dim=[1, 1], inputs=[A_wp, D_wp, L_wp, Y_wp, X_wp], block_dim=TILE_DIM, device=device
+    )
+    wp.synchronize_device(device)
+    np.testing.assert_allclose(X_wp.numpy(), X_np)
+    np.testing.assert_allclose(L_wp.numpy(), L_np)
+    # TODO: implement and test backward pass
+@wp.kernel()
+def tile_math_cholesky_multiple_rhs(
+    gA: wp.array2d(dtype=wp.float64),
+    gD: wp.array1d(dtype=wp.float64),
+    gL: wp.array2d(dtype=wp.float64),
+    gy: wp.array2d(dtype=wp.float64),
+    gx: wp.array2d(dtype=wp.float64),
+    gz: wp.array2d(dtype=wp.float64),
+):
+    """Factor A^T + diag(D), solve L L^T x = y^T for a matrix right-hand side, and form x @ x.
+
+    Reads gA, gD and gy; writes the Cholesky factor to gL, the solution to gx
+    and the product of the solution with itself to gz.
+    """
+    i, j = wp.tid()  # NOTE(review): i and j are not used in this kernel
+    # Load A, D & y
+    a = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
+    d = wp.tile_load(gD, shape=TILE_M, storage="shared")
+    y = wp.tile_load(gy, shape=(TILE_M, TILE_M), storage="shared")
+    # Ensure tile_diag_add() and tile_cholesky_solve() work with transposed matrices
+    a_t = wp.tile_transpose(a)
+    # Compute L st LL^T = A.T + diag(D)
+    b = wp.tile_diag_add(a_t, d)
+    l = wp.tile_cholesky(b)
+    # Solve for x in LL^T x = y.T
+    y_t = wp.tile_transpose(y)
+    x = wp.tile_cholesky_solve(l, y_t)
+    # Ensure matmul receives correct layout information
+    z = wp.tile_matmul(x, x)
+    # Store L, x & z
+    wp.tile_store(gL, l)
+    wp.tile_store(gx, x)
+    wp.tile_store(gz, z)
+def test_tile_cholesky_cholesky_multiple_rhs(test, device):
+    A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
+    D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
+    L_h = np.zeros_like(A_h)
+    Y_h = np.arange(TILE_M * TILE_M, dtype=np.float64).reshape((TILE_M, TILE_M))
+    X_h = np.zeros_like(Y_h)
+    Z_h = np.zeros_like(Y_h)
+    A_np = A_h.T + np.diag(D_h)
+    L_np = np.linalg.cholesky(A_np)
+    X_np = np.linalg.solve(A_np, Y_h.T)
+    Z_np = X_np @ X_np
+    A_wp = wp.array(A_h, requires_grad=True, dtype=wp.float64, device=device)
+    D_wp = wp.array(D_h, requires_grad=True, dtype=wp.float64, device=device)
+    L_wp = wp.array(L_h, requires_grad=True, dtype=wp.float64, device=device)
+    Y_wp = wp.array(Y_h, requires_grad=True, dtype=wp.float64, device=device)
+    X_wp = wp.array(X_h, requires_grad=True, dtype=wp.float64, device=device)
+    Z_wp = wp.array(Z_h, requires_grad=True, dtype=wp.float64, device=device)
+    wp.launch_tiled(
+        tile_math_cholesky_multiple_rhs,
+        dim=[1, 1],
+        inputs=[A_wp, D_wp, L_wp, Y_wp, X_wp, Z_wp],
+        block_dim=TILE_DIM,
+        device=device,
+    )
+    wp.synchronize_device(device)
+    np.testing.assert_allclose(L_wp.numpy(), L_np)
+    np.testing.assert_allclose(X_wp.numpy(), X_np)
+    np.testing.assert_allclose(Z_wp.numpy(), Z_np)
+    # TODO: implement and test backward pass
+@wp.kernel
+def tile_math_forward_substitution(
+    gL: wp.array2d(dtype=wp.float64), gx: wp.array1d(dtype=wp.float64), gz: wp.array1d(dtype=wp.float64)
+):
+    """Solve with tile_lower_solve() on the transpose of the matrix in gL and store the result in gz."""
+    i, j = wp.tid()  # NOTE(review): i and j are not used in this kernel
+    # Load L & x
+    L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
+    x = wp.tile_load(gx, shape=TILE_M, storage="shared")
+    # Solve for z in L^T z = x (L^T is the lower triangular operand)
+    # Transpose because we loaded an upper triangular matrix
+    z = wp.tile_lower_solve(wp.tile_transpose(L), x)
+    # Store z
+    wp.tile_store(gz, z)
+def test_tile_cholesky_forward_substitution(test, device):
+    # Create test data
+    rng = np.random.default_rng(42)
+    L_h = np.triu(rng.random((TILE_M, TILE_M)))  # Upper triangular matrix
+    x_h = rng.random(TILE_M)
+    z_h = np.zeros_like(x_h)
+    # Compute reference solution using numpy
+    z_np = np.linalg.solve(L_h.T, x_h)
+    # Create Warp arrays
+    L_wp = wp.array(L_h, requires_grad=True, dtype=wp.float64, device=device)
+    x_wp = wp.array(x_h, requires_grad=True, dtype=wp.float64, device=device)
+    z_wp = wp.array(z_h, requires_grad=True, dtype=wp.float64, device=device)
+    # Run kernel
+    wp.launch_tiled(
+        tile_math_forward_substitution, dim=[1, 1], inputs=[L_wp, x_wp, z_wp], block_dim=TILE_DIM, device=device
+    )
+    wp.synchronize_device(device)
+    # Verify results
+    np.testing.assert_allclose(z_wp.numpy(), z_np)
+    # TODO: implement and test backward pass
+@wp.kernel
+def tile_math_back_substitution(
+    gL: wp.array2d(dtype=wp.float64), gx: wp.array1d(dtype=wp.float64), gz: wp.array1d(dtype=wp.float64)
+):
+    """Solve with tile_upper_solve() on the transpose of the matrix in gL and store the result in gz."""
+    i, j = wp.tid()  # NOTE(review): i and j are not used in this kernel
+    # Load L & x
+    L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
+    x = wp.tile_load(gx, shape=TILE_M, storage="shared")
+    # Solve for z in L^T z = x
+    # Transpose because we loaded a lower triangular matrix
+    z = wp.tile_upper_solve(wp.tile_transpose(L), x)
+    # Store z
+    wp.tile_store(gz, z)
+def test_tile_cholesky_back_substitution(test, device):
+    # Create test data
+    rng = np.random.default_rng(42)
+    L_h = np.tril(rng.random((TILE_M, TILE_M)))  # Lower triangular matrix
+    x_h = rng.random(TILE_M)
+    z_h = np.zeros_like(x_h)
+    # Compute reference solution using numpy
+    z_np = np.linalg.solve(L_h.T, x_h)
+    # Create Warp arrays
+    L_wp = wp.array(L_h, requires_grad=True, dtype=wp.float64, device=device)
+    x_wp = wp.array(x_h, requires_grad=True, dtype=wp.float64, device=device)
+    z_wp = wp.array(z_h, requires_grad=True, dtype=wp.float64, device=device)
+    # Run kernel
+    wp.launch_tiled(
+        tile_math_back_substitution, dim=[1, 1], inputs=[L_wp, x_wp, z_wp], block_dim=TILE_DIM, device=device
+    )
+    wp.synchronize_device(device)
+    # Verify results
+    np.testing.assert_allclose(z_wp.numpy(), z_np)
+    # TODO: implement and test backward pass
+@wp.kernel
+def tile_math_forward_substitution_multiple_rhs(
+    gL: wp.array2d(dtype=wp.float64),
+    gx: wp.array2d(dtype=wp.float64),
+    gz: wp.array2d(dtype=wp.float64),
+    gc: wp.array2d(dtype=wp.float64),
+):
+    """Solve L z = x^T with tile_lower_solve() for a matrix right-hand side and form z @ z.
+
+    Reads gL and gx; writes the solution to gz and the product to gc.
+    """
+    i, j = wp.tid()  # NOTE(review): i and j are not used in this kernel
+    # Load L & x
+    L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
+    x = wp.tile_load(gx, shape=(TILE_M, TILE_M), storage="shared")
+    # Solve for z in Lz = x.T
+    x_t = wp.tile_transpose(x)
+    z = wp.tile_lower_solve(L, x_t)
+    # Ensure matmul receives correct layout information
+    c = wp.tile_matmul(z, z)
+    # Store z and c
+    wp.tile_store(gz, z)
+    wp.tile_store(gc, c)
+def test_tile_cholesky_forward_substitution_multiple_rhs(test, device):
+    # Create test data
+    rng = np.random.default_rng(42)
+    L_h = np.tril(rng.random((TILE_M, TILE_M)))  # Lower triangular matrix
+    x_h = rng.random((TILE_M, TILE_M))  # Multiple right-hand sides
+    z_h = np.zeros_like(x_h)
+    c_h = np.zeros_like(x_h)
+    # Compute reference solution using numpy
+    z_np = np.linalg.solve(L_h, x_h.T)
+    c_np = z_np @ z_np
+    # Create Warp arrays
+    L_wp = wp.array(L_h, requires_grad=True, dtype=wp.float64, device=device)
+    x_wp = wp.array(x_h, requires_grad=True, dtype=wp.float64, device=device)
+    z_wp = wp.array(z_h, requires_grad=True, dtype=wp.float64, device=device)
+    c_wp = wp.array(c_h, requires_grad=True, dtype=wp.float64, device=device)
+    # Run kernel
+    wp.launch_tiled(
+        tile_math_forward_substitution_multiple_rhs,
+        dim=[1, 1],
+        inputs=[L_wp, x_wp, z_wp, c_wp],
+        block_dim=TILE_DIM,
+        device=device,
+    )
+    wp.synchronize_device(device)
+    # Verify results
+    test.assertTrue(np.allclose(z_wp.numpy(), z_np))
+    test.assertTrue(np.allclose(c_wp.numpy(), c_np))
+    # TODO: implement and test backward pass
+@wp.kernel
+def tile_math_back_substitution_multiple_rhs(
+    gL: wp.array2d(dtype=wp.float64),
+    gx: wp.array2d(dtype=wp.float64),
+    gz: wp.array2d(dtype=wp.float64),
+    gc: wp.array2d(dtype=wp.float64),
+):
+    """Solve L^T z = x^T with tile_upper_solve() for a matrix right-hand side and form z @ z.
+
+    Reads gL and gx; writes the solution to gz and the product to gc.
+    """
+    i, j = wp.tid()  # NOTE(review): i and j are not used in this kernel
+    # Load L & x
+    L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
+    x = wp.tile_load(gx, shape=(TILE_M, TILE_M), storage="shared")
+    # Solve for z in L^T z = x.T
+    x_t = wp.tile_transpose(x)
+    z = wp.tile_upper_solve(wp.tile_transpose(L), x_t)
+    # Ensure matmul receives correct layout information
+    c = wp.tile_matmul(z, z)
+    # Store z and c
+    wp.tile_store(gz, z)
+    wp.tile_store(gc, c)
+def test_tile_cholesky_back_substitution_multiple_rhs(test, device):
+    # Create test data
+    rng = np.random.default_rng(42)
+    L_h = np.tril(rng.random((TILE_M, TILE_M)))  # Lower triangular matrix
+    x_h = rng.random((TILE_M, TILE_M))  # Multiple right-hand sides
+    z_h = np.zeros_like(x_h)
+    c_h = np.zeros_like(x_h)
+    # Compute reference solution using numpy
+    z_np = np.linalg.solve(L_h.T, x_h.T)
+    c_np = z_np @ z_np
+    # Create Warp arrays
+    L_wp = wp.array(L_h, requires_grad=True, dtype=wp.float64, device=device)
+    x_wp = wp.array(x_h, requires_grad=True, dtype=wp.float64, device=device)
+    z_wp = wp.array(z_h, requires_grad=True, dtype=wp.float64, device=device)
+    c_wp = wp.array(c_h, requires_grad=True, dtype=wp.float64, device=device)
+    # Run kernel
+    wp.launch_tiled(
+        tile_math_back_substitution_multiple_rhs,
+        dim=[1, 1],
+        inputs=[L_wp, x_wp, z_wp, c_wp],
+        block_dim=TILE_DIM,
+        device=device,
+    )
+    wp.synchronize_device(device)
+    # Verify results
+    test.assertTrue(np.allclose(z_wp.numpy(), z_np))
+    test.assertTrue(np.allclose(c_wp.numpy(), c_np))
+    # TODO: implement and test backward pass
+# tests a complex composition of most libmathdx calls
+def test_tile_cholesky_block_cholesky(test, device):
+    """Run a blocked Cholesky factorization and a blocked solve, comparing both with NumPy.
+
+    Two kernels are defined locally: one builds the factor L block by block, the
+    other solves with that factor using blocked forward and backward substitution.
+    """
+    BLOCK_SIZE = wp.constant(TILE_M // 2)  # each block is half of TILE_M per side
+    @wp.kernel(module="unique")
+    def block_cholesky_kernel(
+        A: wp.array2d(dtype=float),
+        L: wp.array2d(dtype=float),
+    ):
+        """
+        Computes the Cholesky factorization of a symmetric positive definite matrix A in blocks.
+        It returns a lower-triangular matrix L such that A = L L^T.
+        """
+        # Process the matrix in blocks along its leading dimension.
+        for k in range(0, TILE_M, BLOCK_SIZE):
+            end = k + BLOCK_SIZE
+            # Load current diagonal block A[k:end, k:end]
+            # and update with contributions from previously computed blocks.
+            A_kk_tile = wp.tile_load(A, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, k), storage="shared")
+            for j in range(0, k, BLOCK_SIZE):
+                L_block = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, j))
+                L_block_T = wp.tile_transpose(L_block)
+                L_L_T_block = wp.tile_matmul(L_block, L_block_T)
+                A_kk_tile -= L_L_T_block
+            # Compute the Cholesky factorization for the block
+            # print(A_kk_tile)
+            L_kk_tile = wp.tile_cholesky(A_kk_tile)
+            wp.tile_store(L, L_kk_tile, offset=(k, k))
+            # Process the blocks below the current block
+            for i in range(end, TILE_M, BLOCK_SIZE):
+                A_ik_tile = wp.tile_load(A, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, k), storage="shared")
+                for j in range(0, k, BLOCK_SIZE):
+                    L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, j))
+                    L_2_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, j))
+                    L_T_tile = wp.tile_transpose(L_2_tile)
+                    L_L_T_tile = wp.tile_matmul(L_tile, L_T_tile)
+                    A_ik_tile -= L_L_T_tile
+                # Solve L_kk * sol^T = A_ik^T, then transpose back before storing block (i, k)
+                A_ik_T_tile = wp.tile_transpose(A_ik_tile)
+                sol_T_tile = wp.tile_lower_solve(L_kk_tile, A_ik_T_tile)
+                sol_tile = wp.tile_transpose(sol_T_tile)
+                wp.tile_store(L, sol_tile, offset=(i, k))
+    @wp.kernel(module="unique")
+    def block_cholesky_solve_kernel(
+        L: wp.array2d(dtype=float),
+        b: wp.array2d(dtype=float),
+        scratch: wp.array2d(dtype=float),
+        x: wp.array2d(dtype=float),
+    ):
+        """
+        Solves A x = b given the Cholesky factor L (A = L L^T) using
+        blocked forward and backward substitution.
+        """
+        # Forward substitution: solve L y = b
+        for i in range(0, TILE_M, BLOCK_SIZE):
+            i_end = i + BLOCK_SIZE  # NOTE(review): i_end is not used in this forward loop
+            rhs_tile = wp.tile_load(b, shape=(BLOCK_SIZE, 1), offset=(i, 0))
+            for j in range(0, i, BLOCK_SIZE):
+                L_block = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, j))
+                y_block = wp.tile_load(scratch, shape=(BLOCK_SIZE, 1), offset=(j, 0))
+                Ly_block = wp.tile_matmul(L_block, y_block)
+                rhs_tile -= Ly_block
+            L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, i))
+            y_tile = wp.tile_lower_solve(L_tile, rhs_tile)
+            # The intermediate solution y is kept in scratch for the backward pass
+            wp.tile_store(scratch, y_tile, offset=(i, 0))
+        # Backward substitution: solve L^T x = y
+        for i in range(TILE_M - BLOCK_SIZE, -1, -BLOCK_SIZE):
+            i_start = i
+            i_end = i_start + BLOCK_SIZE
+            rhs_tile = wp.tile_load(scratch, shape=(BLOCK_SIZE, 1), offset=(i_start, 0))
+            for j in range(i_end, TILE_M, BLOCK_SIZE):
+                L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(j, i_start))
+                L_T_tile = wp.tile_transpose(L_tile)
+                x_tile = wp.tile_load(x, shape=(BLOCK_SIZE, 1), offset=(j, 0))
+                L_T_x_tile = wp.tile_matmul(L_T_tile, x_tile)
+                rhs_tile -= L_T_x_tile
+            L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i_start, i_start))
+            x_tile = wp.tile_upper_solve(wp.tile_transpose(L_tile), rhs_tile)
+            wp.tile_store(x, x_tile, offset=(i_start, 0))
+    # check block cholesky decomposition
+    rng = np.random.default_rng(42)
+    M = np.array(rng.random((TILE_M, TILE_M)), dtype=float)
+    A_np = M.T @ M + np.eye(TILE_M, TILE_M)  # M^T M + I as the matrix to factor
+    L_np = np.linalg.cholesky(A_np)
+    A_wp = wp.array(A_np, dtype=float, device=device)
+    L_wp = wp.zeros_like(A_wp)
+    wp.launch_tiled(block_cholesky_kernel, dim=1, inputs=[A_wp], outputs=[L_wp], block_dim=TILE_DIM, device=device)
+    # compare the computed factor with NumPy, then check block cholesky solve
+    assert_np_equal(L_wp.numpy(), L_np, tol=1e-6)
+    b_np = np.array(rng.random((TILE_M, 1)), dtype=float)
+    b_wp = wp.array(b_np, dtype=float, device=device)
+    scratch = wp.zeros_like(b_wp)
+    # Reference solution via two triangular solves with the NumPy factor
+    x_np = np.linalg.solve(L_np.T, np.linalg.solve(L_np, b_np))
+    x_wp = wp.zeros_like(b_wp)
+    wp.launch_tiled(
+        block_cholesky_solve_kernel,
+        dim=1,
+        inputs=[L_wp, b_wp, scratch],
+        outputs=[x_wp],
+        block_dim=TILE_DIM,
+        device=device,
+    )
+    assert_np_equal(x_wp.numpy(), x_np, tol=1e-6)
+@wp.kernel
+def test_tile_lower_solve(L: wp.array2d(dtype=float), y: wp.array(dtype=float), x: wp.array(dtype=float)):
+    """Run tile_lower_solve() with L and the current contents of x, then overwrite x with the result."""
+    L_tile = wp.tile_load(L, shape=(TILE_M, TILE_M))
+    # NOTE(review): the right-hand side is loaded from x; the y argument is never read here.
+    # Confirm whether that is intended before changing it -- the singular-matrix test asserts on NaNs.
+    y_tile = wp.tile_load(x, shape=(TILE_M,))
+    sol = wp.tile_lower_solve(L_tile, y_tile)
+    wp.tile_store(x, sol)
+@wp.kernel
+def test_tile_upper_solve(L: wp.array2d(dtype=float), y: wp.array(dtype=float), x: wp.array(dtype=float)):
+    """Run tile_upper_solve() with L and the current contents of x, then overwrite x with the result."""
+    L_tile = wp.tile_load(L, shape=(TILE_M, TILE_M))
+    # NOTE(review): the right-hand side is loaded from x; the y argument is never read here.
+    # Confirm whether that is intended before changing it -- the singular-matrix test asserts on NaNs.
+    y_tile = wp.tile_load(x, shape=(TILE_M,))
+    sol = wp.tile_upper_solve(L_tile, y_tile)
+    wp.tile_store(x, sol)
+def test_tile_cholesky_singular_matrices(test, device):
+    if not wp.context.runtime.core.wp_is_mathdx_enabled():
+        test.skipTest("MathDx is not enabled")
+    rng = np.random.default_rng(42)
+    L_np = np.tril(rng.random((TILE_M, TILE_M)))  # Lower triangular matrix
+    L_np[-1, -1] = 0.0  # Make it singular
+    y_np = rng.random(TILE_M)
+    L_wp = wp.array2d(L_np, dtype=float, device=device)
+    y_wp = wp.array(y_np, dtype=float, device=device)
+    x_wp = wp.zeros_like(y_wp)
+    wp.launch_tiled(
+        test_tile_lower_solve, dim=1, inputs=[L_wp, y_wp], outputs=[x_wp], block_dim=TILE_DIM, device=device
+    )
+    test.assertTrue(np.isnan(x_wp.numpy()).any())
+    L_np = np.triu(rng.random((TILE_M, TILE_M)))  # Upper triangular matrix
+    L_np[-1, -1] = 0.0  # Make it singular
+    L_wp = wp.array2d(L_np, dtype=float, device=device)
+    y_wp = wp.array(y_np, dtype=float, device=device)
+    x_wp = wp.zeros_like(y_wp)
+    wp.launch_tiled(
+        test_tile_upper_solve, dim=1, inputs=[L_wp, y_wp], outputs=[x_wp], block_dim=TILE_DIM, device=device
+    )
+    test.assertTrue(np.isnan(x_wp.numpy()).any())
+all_devices = get_test_devices()  # used below for the two tile_cholesky factorization tests
+cuda_devices = get_cuda_test_devices()  # used below for the substitution, block and singular-matrix tests
+@unittest.skipUnless(
+    not wp.context.runtime.core.wp_is_mathdx_enabled()
+    or (wp.context.runtime.core.wp_is_mathdx_enabled() and wp.context.runtime.core.wp_cuda_toolkit_version() >= 12060),
+    "MathDx is not enabled or is enabled but CUDA toolkit version is less than 12.6",
+)
+class TestTileCholesky(unittest.TestCase):
+    pass
+add_function_test(
+    TestTileCholesky,
+    "test_tile_cholesky_cholesky",
+    test_tile_cholesky_cholesky,
+    devices=all_devices,
+    check_output=False,
+)
+add_function_test(
+    TestTileCholesky,
+    "test_tile_cholesky_cholesky_multiple_rhs",
+    test_tile_cholesky_cholesky_multiple_rhs,
+    devices=all_devices,
+    check_output=False,
+)
+add_function_test(
+    TestTileCholesky,
+    "test_tile_cholesky_forward_substitution",
+    test_tile_cholesky_forward_substitution,
+    devices=cuda_devices,
+    check_output=False,
+)
+add_function_test(
+    TestTileCholesky,
+    "test_tile_cholesky_back_substitution",
+    test_tile_cholesky_back_substitution,
+    devices=cuda_devices,
+    check_output=False,
+)
+add_function_test(
+    TestTileCholesky,
+    "test_tile_cholesky_forward_substitution_multiple_rhs",
+    test_tile_cholesky_forward_substitution_multiple_rhs,
+    devices=cuda_devices,
+    check_output=False,
+)
+add_function_test(
+    TestTileCholesky,
+    "test_tile_cholesky_back_substitution_multiple_rhs",
+    test_tile_cholesky_back_substitution_multiple_rhs,
+    devices=cuda_devices,
+    check_output=False,
+)
+add_function_test(
+    TestTileCholesky,
+    "test_tile_cholesky_block_cholesky",
+    test_tile_cholesky_block_cholesky,
+    devices=cuda_devices,
+    check_output=False,
+)
+add_function_test(
+    TestTileCholesky,
+    "test_tile_cholesky_singular_matrices",
+    test_tile_cholesky_singular_matrices,
+    devices=cuda_devices,
+    check_output=False,
+)
+# Script entry point: clear Warp's kernel cache, then run the tests verbosely and stop at the first failure
+if __name__ == "__main__":
+    wp.clear_kernel_cache()
+    unittest.main(verbosity=2, failfast=True)