warp-lang 1.7.2-py3-none-manylinux_2_34_aarch64.whl → 1.8.0-py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/__init__.py +3 -1
- warp/__init__.pyi +3489 -1
- warp/autograd.py +45 -122
- warp/bin/warp.so +0 -0
- warp/build.py +241 -252
- warp/build_dll.py +125 -26
- warp/builtins.py +1907 -384
- warp/codegen.py +257 -101
- warp/config.py +12 -1
- warp/constants.py +1 -1
- warp/context.py +657 -223
- warp/dlpack.py +1 -1
- warp/examples/benchmarks/benchmark_cloth.py +2 -2
- warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
- warp/examples/core/example_sample_mesh.py +1 -1
- warp/examples/core/example_spin_lock.py +93 -0
- warp/examples/core/example_work_queue.py +118 -0
- warp/examples/fem/example_adaptive_grid.py +5 -5
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +1 -1
- warp/examples/fem/example_convection_diffusion.py +9 -6
- warp/examples/fem/example_darcy_ls_optimization.py +489 -0
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion.py +2 -2
- warp/examples/fem/example_diffusion_3d.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_elastic_shape_optimization.py +387 -0
- warp/examples/fem/example_magnetostatics.py +5 -3
- warp/examples/fem/example_mixed_elasticity.py +5 -3
- warp/examples/fem/example_navier_stokes.py +11 -9
- warp/examples/fem/example_nonconforming_contact.py +5 -3
- warp/examples/fem/example_streamlines.py +8 -3
- warp/examples/fem/utils.py +9 -8
- warp/examples/interop/example_jax_ffi_callback.py +2 -2
- warp/examples/optim/example_drone.py +1 -1
- warp/examples/sim/example_cloth.py +1 -1
- warp/examples/sim/example_cloth_self_contact.py +48 -54
- warp/examples/tile/example_tile_block_cholesky.py +502 -0
- warp/examples/tile/example_tile_cholesky.py +2 -1
- warp/examples/tile/example_tile_convolution.py +1 -1
- warp/examples/tile/example_tile_filtering.py +1 -1
- warp/examples/tile/example_tile_matmul.py +1 -1
- warp/examples/tile/example_tile_mlp.py +2 -0
- warp/fabric.py +7 -7
- warp/fem/__init__.py +5 -0
- warp/fem/adaptivity.py +1 -1
- warp/fem/cache.py +152 -63
- warp/fem/dirichlet.py +2 -2
- warp/fem/domain.py +136 -6
- warp/fem/field/field.py +141 -99
- warp/fem/field/nodal_field.py +85 -39
- warp/fem/field/virtual.py +97 -52
- warp/fem/geometry/adaptive_nanogrid.py +91 -86
- warp/fem/geometry/closest_point.py +13 -0
- warp/fem/geometry/deformed_geometry.py +102 -40
- warp/fem/geometry/element.py +56 -2
- warp/fem/geometry/geometry.py +323 -22
- warp/fem/geometry/grid_2d.py +157 -62
- warp/fem/geometry/grid_3d.py +116 -20
- warp/fem/geometry/hexmesh.py +86 -20
- warp/fem/geometry/nanogrid.py +166 -86
- warp/fem/geometry/partition.py +59 -25
- warp/fem/geometry/quadmesh.py +86 -135
- warp/fem/geometry/tetmesh.py +47 -119
- warp/fem/geometry/trimesh.py +77 -270
- warp/fem/integrate.py +107 -52
- warp/fem/linalg.py +25 -58
- warp/fem/operator.py +124 -27
- warp/fem/quadrature/pic_quadrature.py +36 -14
- warp/fem/quadrature/quadrature.py +40 -16
- warp/fem/space/__init__.py +1 -1
- warp/fem/space/basis_function_space.py +66 -46
- warp/fem/space/basis_space.py +17 -4
- warp/fem/space/dof_mapper.py +1 -1
- warp/fem/space/function_space.py +2 -2
- warp/fem/space/grid_2d_function_space.py +4 -1
- warp/fem/space/hexmesh_function_space.py +4 -2
- warp/fem/space/nanogrid_function_space.py +3 -1
- warp/fem/space/partition.py +11 -2
- warp/fem/space/quadmesh_function_space.py +4 -1
- warp/fem/space/restriction.py +5 -2
- warp/fem/space/shape/__init__.py +10 -8
- warp/fem/space/tetmesh_function_space.py +4 -1
- warp/fem/space/topology.py +52 -21
- warp/fem/space/trimesh_function_space.py +4 -1
- warp/fem/utils.py +53 -8
- warp/jax.py +1 -2
- warp/jax_experimental/ffi.py +12 -17
- warp/jax_experimental/xla_ffi.py +37 -24
- warp/math.py +171 -1
- warp/native/array.h +99 -0
- warp/native/builtin.h +174 -31
- warp/native/coloring.cpp +1 -1
- warp/native/exports.h +118 -63
- warp/native/intersect.h +3 -3
- warp/native/mat.h +5 -10
- warp/native/mathdx.cpp +11 -5
- warp/native/matnn.h +1 -123
- warp/native/quat.h +28 -4
- warp/native/sparse.cpp +121 -258
- warp/native/sparse.cu +181 -274
- warp/native/spatial.h +305 -17
- warp/native/tile.h +583 -72
- warp/native/tile_radix_sort.h +1108 -0
- warp/native/tile_reduce.h +237 -2
- warp/native/tile_scan.h +240 -0
- warp/native/tuple.h +189 -0
- warp/native/vec.h +6 -16
- warp/native/warp.cpp +36 -4
- warp/native/warp.cu +574 -51
- warp/native/warp.h +47 -74
- warp/optim/linear.py +5 -1
- warp/paddle.py +7 -8
- warp/py.typed +0 -0
- warp/render/render_opengl.py +58 -29
- warp/render/render_usd.py +124 -61
- warp/sim/__init__.py +9 -0
- warp/sim/collide.py +252 -78
- warp/sim/graph_coloring.py +8 -1
- warp/sim/import_mjcf.py +4 -3
- warp/sim/import_usd.py +11 -7
- warp/sim/integrator.py +5 -2
- warp/sim/integrator_euler.py +1 -1
- warp/sim/integrator_featherstone.py +1 -1
- warp/sim/integrator_vbd.py +751 -320
- warp/sim/integrator_xpbd.py +1 -1
- warp/sim/model.py +265 -260
- warp/sim/utils.py +10 -7
- warp/sparse.py +303 -166
- warp/tape.py +52 -51
- warp/tests/cuda/test_conditional_captures.py +1046 -0
- warp/tests/cuda/test_streams.py +1 -1
- warp/tests/geometry/test_volume.py +2 -2
- warp/tests/interop/test_dlpack.py +9 -9
- warp/tests/interop/test_jax.py +0 -1
- warp/tests/run_coverage_serial.py +1 -1
- warp/tests/sim/disabled_kinematics.py +2 -2
- warp/tests/sim/{test_vbd.py → test_cloth.py} +296 -113
- warp/tests/sim/test_collision.py +159 -51
- warp/tests/sim/test_coloring.py +15 -1
- warp/tests/test_array.py +254 -2
- warp/tests/test_array_reduce.py +2 -2
- warp/tests/test_atomic_cas.py +299 -0
- warp/tests/test_codegen.py +142 -19
- warp/tests/test_conditional.py +47 -1
- warp/tests/test_ctypes.py +0 -20
- warp/tests/test_devices.py +8 -0
- warp/tests/test_fabricarray.py +4 -2
- warp/tests/test_fem.py +58 -25
- warp/tests/test_func.py +42 -1
- warp/tests/test_grad.py +1 -1
- warp/tests/test_lerp.py +1 -3
- warp/tests/test_map.py +481 -0
- warp/tests/test_mat.py +1 -24
- warp/tests/test_quat.py +6 -15
- warp/tests/test_rounding.py +10 -38
- warp/tests/test_runlength_encode.py +7 -7
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +51 -2
- warp/tests/test_spatial.py +507 -1
- warp/tests/test_struct.py +2 -2
- warp/tests/test_tuple.py +265 -0
- warp/tests/test_types.py +2 -2
- warp/tests/test_utils.py +24 -18
- warp/tests/tile/test_tile.py +420 -1
- warp/tests/tile/test_tile_mathdx.py +518 -14
- warp/tests/tile/test_tile_reduce.py +213 -0
- warp/tests/tile/test_tile_shared_memory.py +130 -1
- warp/tests/tile/test_tile_sort.py +117 -0
- warp/tests/unittest_suites.py +4 -6
- warp/types.py +462 -308
- warp/utils.py +647 -86
- {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/METADATA +20 -6
- {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/RECORD +177 -165
- warp/stubs.py +0 -3381
- warp/tests/sim/test_xpbd.py +0 -399
- warp/tests/test_mlp.py +0 -282
- {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/WHEEL +0 -0
- {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.7.2.dist-info → warp_lang-1.8.0.dist-info}/top_level.txt +0 -0
warp/examples/sim/example_cloth_self_contact.py
@@ -128,6 +128,10 @@ class Example:
         self.num_substeps = 10
         self.iterations = 4
         self.dt = self.frame_dt / self.num_substeps
+        # the BVH used by VBDIntegrator will be rebuilt every self.bvh_rebuild_frames
+        # When the simulated object deforms significantly, simply refitting the BVH can lead to deterioration of the BVH's
+        # quality, in this case we need to completely rebuild the tree to achieve better query efficiency.
+        self.bvh_rebuild_frames = 10

         self.num_frames = num_frames
         self.sim_time = 0.0
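Note: the comments introduced above distinguish refitting a BVH from rebuilding it. A minimal sketch of that trade-off using Warp's generic wp.Bvh object (the bound arrays below are placeholders; VBDIntegrator manages its own BVHs internally and exposes rebuild_bvh() for the full rebuild, as used further down in this diff):

import warp as wp

# Axis-aligned bounds of the primitives (placeholder data).
lowers = wp.zeros(1024, dtype=wp.vec3, device="cuda")
uppers = wp.zeros(1024, dtype=wp.vec3, device="cuda")

bvh = wp.Bvh(lowers, uppers)  # full build: chooses a fresh, well-balanced tree topology

# Small motion: update node bounds in place, keeping the existing topology (cheap).
bvh.refit()

# Large deformation: refitted nodes overlap heavily and queries slow down,
# so construct a new tree instead.
bvh = wp.Bvh(lowers, uppers)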
@@ -227,69 +231,62 @@ class Example:
         self.cuda_graph = None
         if self.use_cuda_graph:
             with wp.ScopedCapture() as capture:
-
-                wp.launch(
-                    kernel=apply_rotation,
-                    dim=self.rot_point_indices.shape[0],
-                    inputs=[
-                        self.rot_point_indices,
-                        self.rot_axes,
-                        self.roots,
-                        self.roots_to_ps,
-                        self.t,
-                        self.rot_angular_velocity,
-                        self.dt,
-                        self.rot_end_time,
-                    ],
-                    outputs=[
-                        self.state0.particle_q,
-                        self.state1.particle_q,
-                    ],
-                )
-
-                self.integrator.simulate(self.model, self.state0, self.state1, self.dt, None)
-                (self.state0, self.state1) = (self.state1, self.state0)
+                self.integrate_frame_substeps()

             self.cuda_graph = capture.graph

-    def
+    def integrate_frame_substeps(self):
+        for _ in range(self.num_substeps):
+            wp.launch(
+                kernel=apply_rotation,
+                dim=self.rot_point_indices.shape[0],
+                inputs=[
+                    self.rot_point_indices,
+                    self.rot_axes,
+                    self.roots,
+                    self.roots_to_ps,
+                    self.t,
+                    self.rot_angular_velocity,
+                    self.dt,
+                    self.rot_end_time,
+                ],
+                outputs=[
+                    self.state0.particle_q,
+                    self.state1.particle_q,
+                ],
+            )
+
+            self.integrator.simulate(self.model, self.state0, self.state1, self.dt, None)
+            (self.state0, self.state1) = (self.state1, self.state0)
+
+    def advance_frame(self):
         with wp.ScopedTimer("step", print=False, dict=self.profiler):
             if self.use_cuda_graph:
                 wp.capture_launch(self.cuda_graph)
             else:
-
-                wp.launch(
-                    kernel=apply_rotation,
-                    dim=self.rot_point_indices.shape[0],
-                    inputs=[
-                        self.rot_point_indices,
-                        self.rot_axes,
-                        self.roots,
-                        self.roots_to_ps,
-                        self.t,
-                        self.rot_angular_velocity,
-                        self.dt,
-                        self.rot_end_time,
-                    ],
-                    outputs=[
-                        self.state0.particle_q,
-                        self.state1.particle_q,
-                    ],
-                )
-                self.integrator.simulate(self.model, self.state0, self.state1, self.dt)
-
-                (self.state0, self.state1) = (self.state1, self.state0)
+                self.integrate_frame_substeps()

         self.sim_time += self.dt

+    def run(self):
+        for i in range(self.num_frames):
+            self.advance_frame()
+            self.render()
+            print(f"[{i:4d}/{self.num_frames}]")
+
+            if i != 0 and not i % self.bvh_rebuild_frames and self.use_cuda_graph:
+                self.integrator.rebuild_bvh(self.state0)
+                with wp.ScopedCapture() as capture:
+                    self.integrate_frame_substeps()
+                self.cuda_graph = capture.graph
+
     def render(self):
         if self.renderer is None:
             return

-
-
-
-        self.renderer.end_frame()
+        self.renderer.begin_frame(self.sim_time)
+        self.renderer.render(self.state0)
+        self.renderer.end_frame()


 if __name__ == "__main__":
@@ -310,13 +307,10 @@ if __name__ == "__main__":
     with wp.ScopedDevice(args.device):
         example = Example(stage_path=args.stage_path, num_frames=args.num_frames)

-
-            example.step()
-            example.render()
-            print(f"[{i:4d}/{example.num_frames}]")
+        example.run()

         frame_times = example.profiler["step"]
-        print("\nAverage frame sim time: {
+        print(f"\nAverage frame sim time: {sum(frame_times) / len(frame_times):.2f} ms")

         if example.renderer:
             example.renderer.save()
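Note: both the capture path and the eager path now go through integrate_frame_substeps(), and the CUDA graph is simply captured again after each BVH rebuild. For readers unfamiliar with the capture/replay API the example relies on, here is a minimal self-contained sketch (illustrative kernel and names only, assuming a CUDA device is available):

import warp as wp

@wp.kernel
def scale(x: wp.array(dtype=float), s: float):
    i = wp.tid()
    x[i] = x[i] * s

x = wp.ones(1024, dtype=float, device="cuda")

# Record the kernel launches once...
with wp.ScopedCapture() as capture:
    wp.launch(scale, dim=x.shape[0], inputs=[x, 2.0])

# ...then replay the recorded graph each frame without re-issuing launches from Python.
for _ in range(10):
    wp.capture_launch(capture.graph)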
warp/examples/tile/example_tile_block_cholesky.py (new file)
@@ -0,0 +1,502 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+###########################################################################
+# Example Tile Block Cholesky
+#
+# Shows how to write a kernel computing a blocked Cholesky factorization
+# of a symmetric positive definite matrix using Warp Tile APIs.
+#
+###########################################################################
+
+from functools import lru_cache
+
+import numpy as np
+
+import warp as wp
+
+wp.set_module_options({"enable_backward": False})
+
+
+@lru_cache(maxsize=None)
+def create_blocked_cholesky_kernel(block_size: int):
+    @wp.kernel
+    def blocked_cholesky_kernel(
+        A: wp.array(dtype=float, ndim=2),
+        L: wp.array(dtype=float, ndim=2),
+        active_matrix_size_arr: wp.array(dtype=int, ndim=1),
+    ):
+        """
+        Computes the Cholesky factorization of a symmetric positive definite matrix A in blocks.
+        It returns a lower-triangular matrix L such that A = L L^T.
+
+        A is assumed to support block reading.
+        """
+        tid, tid_block = wp.tid()
+        num_threads_per_block = wp.block_dim()
+
+        active_matrix_size = active_matrix_size_arr[0]
+
+        # Round up active_matrix_size to next multiple of block_size
+        n = ((active_matrix_size + block_size - 1) // block_size) * block_size
+
+        # Process the matrix in blocks along its leading dimension.
+        for k in range(0, n, block_size):
+            end = k + block_size
+
+            # Load current diagonal block A[k:end, k:end]
+            # and update with contributions from previously computed blocks.
+            A_kk_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(k, k), storage="shared")
+            # The following if pads the matrix if it is not divisible by block_size
+            if k + block_size > active_matrix_size:
+                num_tile_elements = block_size * block_size
+                num_iterations = (num_tile_elements + num_threads_per_block - 1) // num_threads_per_block
+
+                for i in range(num_iterations):
+                    linear_index = tid_block + i * num_threads_per_block
+                    linear_index = linear_index % num_tile_elements
+                    row = linear_index // block_size
+                    col = linear_index % block_size
+                    value = A_kk_tile[row, col]
+                    if k + row >= active_matrix_size or k + col >= active_matrix_size:
+                        value = wp.where(row == col, float(1), float(0))
+                    A_kk_tile[row, col] = value
+
+            if k > 0:
+                for j in range(0, k, block_size):
+                    L_block = wp.tile_load(L, shape=(block_size, block_size), offset=(k, j))
+                    L_block_T = wp.tile_transpose(L_block)
+                    L_L_T_block = wp.tile_matmul(L_block, L_block_T)
+                    A_kk_tile -= L_L_T_block
+
+            # Compute the Cholesky factorization for the block
+            L_kk_tile = wp.tile_cholesky(A_kk_tile)
+            wp.tile_store(L, L_kk_tile, offset=(k, k))
+
+            # Process the blocks below the current block
+            for i in range(end, n, block_size):
+                A_ik_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(i, k), storage="shared")
+                # The following if pads the matrix if it is not divisible by block_size
+                if i + block_size > active_matrix_size or k + block_size > active_matrix_size:
+                    num_tile_elements = block_size * block_size
+                    num_iterations = (num_tile_elements + num_threads_per_block - 1) // num_threads_per_block
+
+                    for ii in range(num_iterations):
+                        linear_index = tid_block + ii * num_threads_per_block
+                        linear_index = linear_index % num_tile_elements
+                        row = linear_index // block_size
+                        col = linear_index % block_size
+                        value = A_ik_tile[row, col]
+                        if i + row >= active_matrix_size or k + col >= active_matrix_size:
+                            value = wp.where(i + row == k + col, float(1), float(0))
+                        A_ik_tile[row, col] = value
+
+                if k > 0:
+                    for j in range(0, k, block_size):
+                        L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, j))
+                        L_2_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(k, j))
+                        L_T_tile = wp.tile_transpose(L_2_tile)
+                        L_L_T_tile = wp.tile_matmul(L_tile, L_T_tile)
+                        A_ik_tile -= L_L_T_tile
+
+                t = wp.tile_transpose(A_ik_tile)
+                tmp = wp.tile_lower_solve(L_kk_tile, t)
+                sol_tile = wp.tile_transpose(tmp)
+
+                wp.tile_store(L, sol_tile, offset=(i, k))
+
+    return blocked_cholesky_kernel
+
+
+@lru_cache(maxsize=None)
+def create_blocked_cholesky_solve_kernel(block_size: int):
+    @wp.kernel
+    def blocked_cholesky_solve_kernel(
+        L: wp.array(dtype=float, ndim=2),
+        b: wp.array(dtype=float, ndim=2),
+        x: wp.array(dtype=float, ndim=2),
+        y: wp.array(dtype=float, ndim=2),
+        active_matrix_size_arr: wp.array(dtype=int, ndim=1),
+    ):
+        """
+        Solves A x = b given the Cholesky factor L (A = L L^T) using
+        blocked forward and backward substitution.
+
+        b can be a vector or 2-D array with multiple right-hand sides.
+        """
+
+        active_matrix_size = active_matrix_size_arr[0]
+
+        # Round up active_matrix_size to next multiple of block_size
+        n = ((active_matrix_size + block_size - 1) // block_size) * block_size
+
+        # Forward substitution: solve L y = b
+        for i in range(0, n, block_size):
+            i_end = i + block_size
+            rhs_tile = wp.tile_load(b, shape=(block_size, 1), offset=(i, 0))
+            if i > 0:
+                for j in range(0, i, block_size):
+                    L_block = wp.tile_load(L, shape=(block_size, block_size), offset=(i, j))
+                    y_block = wp.tile_load(y, shape=(block_size, 1), offset=(j, 0))
+                    Ly_block = wp.tile_matmul(L_block, y_block)
+                    rhs_tile -= Ly_block
+            L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, i))
+            y_tile = wp.tile_lower_solve(L_tile, rhs_tile)
+            wp.tile_store(y, y_tile, offset=(i, 0))
+
+        # Backward substitution: solve L^T x = y
+        for i in range(n - block_size, -1, -block_size):
+            i_start = i
+            i_end = i_start + block_size
+            rhs_tile = wp.tile_load(y, shape=(block_size, 1), offset=(i_start, 0))
+            if i_end < n:
+                for j in range(i_end, n, block_size):
+                    L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(j, i_start))
+                    L_T_tile = wp.tile_transpose(L_tile)
+                    x_tile = wp.tile_load(x, shape=(block_size, 1), offset=(j, 0))
+                    L_T_x_tile = wp.tile_matmul(L_T_tile, x_tile)
+                    rhs_tile -= L_T_x_tile
+            L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i_start, i_start))
+            x_tile = wp.tile_upper_solve(wp.tile_transpose(L_tile), rhs_tile)
+            wp.tile_store(x, x_tile, offset=(i_start, 0))
+
+    return blocked_cholesky_solve_kernel
+
+
+# TODO: Add batching support to solve multiple equation systems at once (one per thread block)
+class BlockCholeskySolver:
+    """
+    A class for solving linear systems using the Cholesky factorization.
+    """
+
+    def __init__(self, max_num_equations: int, block_size=16, device="cuda"):
+        # Round up max_num_equations to next multiple of block_size
+        max_num_equations = ((max_num_equations + block_size - 1) // block_size) * block_size
+
+        self.max_num_equations = max_num_equations
+        self.device = device
+
+        self.num_threads_per_block_factorize = 128
+        self.num_threads_per_block_solve = 64
+        self.active_matrix_size_int = -1
+
+        self.block_size = block_size
+        self.cholesky_kernel = create_blocked_cholesky_kernel(block_size)
+        self.solve_kernel = create_blocked_cholesky_solve_kernel(block_size)
+
+        # Allocate workspace arrays for factorization and solve
+        self.L = wp.zeros(shape=(self.max_num_equations, self.max_num_equations), dtype=float, device=self.device)
+        self.y = wp.zeros(shape=(self.max_num_equations, 1), dtype=float, device=self.device)  # temp memory
+        self.active_matrix_size = wp.zeros(
+            shape=(1,), dtype=int, device=self.device
+        )  # array to hold active matrix size
+        self.active_matrix_size_external = None
+
+    def factorize(self, A: wp.array(dtype=float, ndim=2), num_active_equations: int):
+        """
+        Computes the Cholesky factorization of a symmetric positive definite matrix A in blocks.
+        It returns a lower-triangular matrix L such that A = L L^T.
+        """
+
+        assert num_active_equations <= self.max_num_equations, (
+            f"Number of active equations ({num_active_equations}) exceeds maximum allowed ({self.max_num_equations})"
+        )
+
+        padded_n = ((num_active_equations + self.block_size - 1) // self.block_size) * self.block_size
+
+        # Verify input dimensions
+        assert A.shape[0] == A.shape[1], "Matrix A must be square"
+        assert A.shape[0] >= padded_n, f"Matrix A must be at least {padded_n}x{padded_n} to accommodate padding"
+
+        self.active_matrix_size.zero_()
+        wp.copy(self.active_matrix_size, wp.array([num_active_equations], dtype=int, device=self.device))
+
+        self.factorize_dynamic(A, self.active_matrix_size)
+
+        self.active_matrix_size_external = None
+        self.active_matrix_size_int = num_active_equations
+
+    def factorize_dynamic(self, A: wp.array(dtype=float, ndim=2), num_active_equations: wp.array(dtype=int, ndim=1)):
+        """
+        Computes the Cholesky factorization of a symmetric positive definite matrix A in blocks.
+        It returns a lower-triangular matrix L such that A = L L^T.
+        """
+
+        self.active_matrix_size_external = num_active_equations
+        self.active_matrix_size_int = -1
+
+        wp.launch_tiled(
+            self.cholesky_kernel,
+            dim=1,
+            inputs=[A, self.L, num_active_equations],
+            block_dim=self.num_threads_per_block_factorize,
+            device=self.device,
+        )
+
+    def solve(self, rhs: wp.array(dtype=float, ndim=2), result: wp.array(dtype=float, ndim=2)):
+        """
+        Solves A x = b given the Cholesky factor L (A = L L^T) using
+        blocked forward and backward substitution.
+
+        b can be a vector or 2-D array with multiple right-hand sides.
+        """
+
+        # Do safety checks but they can only be done if the matrix size is known on the host
+        if self.active_matrix_size_int > 0:
+            n = self.active_matrix_size_int
+            padded_n = ((n + self.block_size - 1) // self.block_size) * self.block_size
+
+            # Verify input dimensions
+            assert rhs.shape[1] == 1, "Matrix b must be a column vector"
+            assert rhs.shape[0] >= padded_n, f"Matrix b must be at least {padded_n}x1 to accommodate padding"
+
+            assert result.shape[1] == 1, "Matrix result must be a column vector"
+            assert result.shape[0] >= padded_n, f"Matrix result must be at least {padded_n}x1 to accommodate padding"
+
+        if self.active_matrix_size_external is not None:
+            matrix_size = self.active_matrix_size_external
+        else:
+            matrix_size = self.active_matrix_size
+
+        # Then solve the system using blocked_cholesky_solve kernel
+        wp.launch_tiled(
+            self.solve_kernel,
+            dim=1,
+            inputs=[self.L, rhs, result, self.y, matrix_size],
+            block_dim=self.num_threads_per_block_solve,
+            device=self.device,
+        )
+
+
+class CholeskySolverNumPy:
+    """
+    A class for solving linear systems using the Cholesky factorization.
+    """
+
+    def __init__(self, max_num_equations: int):
+        self.max_num_equations = max_num_equations
+        self.num_active_equations = 0
+
+        # Allocate workspace arrays for factorization and solve
+        self.L = np.zeros((self.max_num_equations, self.max_num_equations))
+        self.y = np.zeros((self.max_num_equations, 1))  # temp memory
+
+    def factorize(self, A: np.ndarray, num_active_equations: int):
+        """
+        Computes the Cholesky factorization of a symmetric positive definite matrix A.
+        It returns a lower-triangular matrix L such that A = L L^T.
+        """
+        assert num_active_equations <= self.max_num_equations, (
+            f"Number of active equations ({num_active_equations}) exceeds maximum allowed ({self.max_num_equations})"
+        )
+
+        self.num_active_equations = num_active_equations
+
+        # Verify input dimensions
+        assert A.shape[0] == A.shape[1], "Matrix A must be square"
+        assert A.shape[0] >= num_active_equations, (
+            f"Matrix A must be at least {num_active_equations}x{num_active_equations}"
+        )
+
+        # Compute Cholesky factorization
+        self.L[:num_active_equations, :num_active_equations] = np.linalg.cholesky(
+            A[:num_active_equations, :num_active_equations]
+        )
+
+    def solve(self, rhs: np.ndarray, result: np.ndarray):
+        """
+        Solves A x = b given the Cholesky factor L (A = L L^T) using
+        forward and backward substitution.
+
+        b can be a vector or 2-D array with multiple right-hand sides.
+        """
+        assert self.num_active_equations <= self.max_num_equations, (
+            f"Number of active equations ({self.num_active_equations}) exceeds maximum allowed ({self.max_num_equations})"
+        )
+
+        n = self.num_active_equations
+
+        # Verify input dimensions
+        assert rhs.shape[1] == 1, "Matrix b must be a column vector"
+        assert rhs.shape[0] >= n, f"Matrix b must be at least {n}x1"
+
+        assert result.shape[1] == 1, "Matrix result must be a column vector"
+        assert result.shape[0] >= n, f"Matrix result must be at least {n}x1"
+
+        # Forward substitution: L y = b
+        self.y[:n] = np.linalg.solve(self.L[:n, :n], rhs[:n])
+
+        # Backward substitution: L^T x = y
+        result[:n] = np.linalg.solve(self.L[:n, :n].T, self.y[:n])
+
+
+def test_cholesky_solver(n, warp_solver: BlockCholeskySolver, device: str = "cuda"):
+    # Create a symmetric positive definite matrix
+    rng = np.random.default_rng(0)
+    A_full = rng.standard_normal((n, n))
+    A_full = A_full @ A_full.T + n * np.eye(n)  # ensure SPD
+    block_size = warp_solver.block_size
+
+    # Pad matrix to make it divisible by block_size
+    padded_n = ((n + block_size - 1) // block_size) * block_size
+    padding = padded_n - n
+
+    if padding > 0:
+        # Pad the original matrix with random values while maintaining SPD
+        A_padded = rng.standard_normal((padded_n, padded_n))
+        A_padded[:n, :n] = A_full
+        padding_block = rng.standard_normal((padding, padding))
+        padding_block = padding_block @ padding_block.T + padding * np.eye(padding)
+        A_padded[n:, n:] = padding_block
+        A_padded[n:, :n] = rng.standard_normal((padding, n))
+        A_padded[:n, n:] = A_padded[n:, :n].T  # Maintain symmetry
+    else:
+        A_padded = A_full
+
+    # Create random RHS vector and pad
+    b = rng.standard_normal(n)
+    b_padded = rng.standard_normal(padded_n)
+    b_padded[:n] = b
+
+    print("\nSolving with NumPy:")
+    # NumPy reference solution
+    x = np.linalg.solve(A_full, b)
+    L_full = np.linalg.cholesky(A_full)
+
+    # Verify NumPy solution
+    err = np.linalg.norm(A_full - L_full @ L_full.T)
+    res_norm = np.linalg.norm(b - A_full @ x)
+    print(f"Cholesky factorization error: {err:.3e}")
+    print(f"Solution residual norm: {res_norm:.3e}")
+
+    print("\nSolving with Warp kernels:")
+    # Initialize Warp arrays
+    A_wp = wp.array(A_padded, dtype=wp.float32, device=device)
+    b_wp = wp.array(b_padded, dtype=wp.float32, device=device).reshape((padded_n, 1))
+    x_wp = wp.zeros_like(b_wp)
+
+    # Create and use the Cholesky solver
+    warp_solver.factorize(A_wp, n)
+    warp_solver.solve(b_wp, x_wp)
+    wp.synchronize()
+
+    # Get result back to CPU and verify
+    x_warp = x_wp.numpy()[:n].squeeze()
+    L_warp = warp_solver.L.numpy()
+
+    # Verify Warp solution
+    err_warp = np.linalg.norm(A_full - L_warp[:n, :n] @ L_warp[:n, :n].T)
+    res_norm_warp = np.linalg.norm(b - A_full @ x_warp)
+    diff_norm = np.linalg.norm(x - x_warp)
+
+    print(f"Warp Cholesky factorization error: {err_warp:.3e}")
+    print(f"Warp solution residual norm: {res_norm_warp:.3e}")
+    print(f"Difference between CPU and GPU solutions: {diff_norm:.3e}")
+
+
+@wp.kernel
+def assign_int_kernel(arr: wp.array(dtype=int, ndim=1), value: int):
+    """Assigns an integer value into the first element of an array"""
+    arr[0] = value
+
+
+def test_cholesky_solver_graph_capture():
+    wp.clear_kernel_cache()
+
+    max_equations = 1000
+
+    # Create random SPD matrix A and random RHS b
+    rng = np.random.default_rng(42)
+    A_np = rng.standard_normal((max_equations, max_equations))
+    A_np = A_np @ A_np.T + np.eye(max_equations) * max_equations  # Make SPD
+    b_np = rng.standard_normal((max_equations, 1))
+
+    device = "cuda"
+
+    with wp.ScopedDevice(device):
+        warp_solver = BlockCholeskySolver(max_equations, block_size=32)
+
+        # Create Warp arrays
+        # Round up dimensions to next multiple of block size
+        block_size = warp_solver.block_size
+        padded_n = ((max_equations + block_size - 1) // block_size) * block_size
+
+        # Create padded arrays initialized with zeros
+        A_padded = np.zeros((padded_n, padded_n), dtype=np.float32)
+        b_padded = np.zeros((padded_n, 1), dtype=np.float32)
+
+        # Copy original data into padded arrays
+        A_padded[:max_equations, :max_equations] = A_np
+        b_padded[:max_equations, :] = b_np
+
+        # Create Warp arrays from padded numpy arrays
+        A_wp = wp.array(A_padded, dtype=wp.float32, ndim=2)
+        b_wp = wp.array(b_padded, dtype=wp.float32, ndim=2)
+
+        # Create result array
+        x_wp = wp.zeros_like(b_wp)
+        # Create array for equation system size
+        n_wp = wp.array([1], dtype=wp.int32)
+
+        # Create a stream for graph capture
+        stream = wp.Stream(device)
+
+        with wp.ScopedStream(stream):
+            # Begin graph capture
+            wp.capture_begin()
+            try:
+                # Loop through different system sizes
+                for n in range(1, max_equations + 1):
+                    # Update system size
+                    wp.launch(assign_int_kernel, dim=1, inputs=[n_wp, n])
+
+                    # Factorize A
+                    warp_solver.factorize_dynamic(A_wp, n_wp)
+
+                    # Solve system
+                    warp_solver.solve(b_wp, x_wp)
+
+            finally:
+                # End graph capture
+                graph = wp.capture_end()
+
+        # Run the captured graph
+        with wp.ScopedTimer("Launch graph", cuda_filter=wp.TIMING_GRAPH):
+            wp.capture_launch(graph, stream=stream)
+
+        wp.synchronize()
+        print("Finished!")
+
+
+if __name__ == "__main__":
+    wp.clear_kernel_cache()
+
+    test_graph_capture = False
+
+    if test_graph_capture:
+        test_cholesky_solver_graph_capture()
+
+    else:
+        device = "cpu"
+
+        # Test equation sys sizes
+        sizes = [32, 70, 128, 192, 257, 320, 401, 1000]
+
+        # Initialize solver once with max size
+        warp_solver = BlockCholeskySolver(max(sizes), block_size=16, device=device)
+
+        for n in sizes:
+            print(f"\nTesting system size n = {n}")
+            test_cholesky_solver(n, warp_solver, device)
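Note: the block recurrence that blocked_cholesky_kernel implements can be stated compactly. A NumPy reference sketch is given below for illustration only; it assumes the matrix size is already a multiple of the block size and ignores the padding and shared-memory tiling the Warp kernel handles.

import numpy as np

def blocked_cholesky_reference(A, block_size):
    # Left-looking blocked Cholesky: returns lower-triangular L with A = L @ L.T
    n = A.shape[0]
    assert n % block_size == 0, "this sketch assumes n is a multiple of block_size"
    L = np.zeros_like(A)
    for k in range(0, n, block_size):
        kk = slice(k, k + block_size)
        # Diagonal block: L[k,k] = chol(A[k,k] - sum_j L[k,j] @ L[k,j].T)
        S = A[kk, kk] - L[kk, :k] @ L[kk, :k].T
        L[kk, kk] = np.linalg.cholesky(S)
        # Blocks below the diagonal: L[i,k] = (A[i,k] - sum_j L[i,j] @ L[k,j].T) @ inv(L[k,k]).T,
        # computed here as a triangular solve rather than an explicit inverse.
        for i in range(k + block_size, n, block_size):
            ii = slice(i, i + block_size)
            S = A[ii, kk] - L[ii, :k] @ L[kk, :k].T
            L[ii, kk] = np.linalg.solve(L[kk, kk], S.T).T
    return L

The Warp kernel additionally pads the matrix to a multiple of block_size and writes an identity block into the padded region, so the factor of the padded matrix agrees with the factor of the active sub-matrix; the companion solve kernel then applies the same blocked structure to forward substitution (L y = b) and backward substitution (L^T x = y).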
warp/examples/tile/example_tile_cholesky.py
@@ -82,6 +82,7 @@ if __name__ == "__main__":
     print("A\\n (Warp):\n", Y_wp.numpy())
     print("A\\x (Numpy):\n", Y_np)

-
+    np.testing.assert_allclose(Y_wp.numpy(), Y_np)
+    np.testing.assert_allclose(L_wp.numpy(), L_np)

     print("Example Tile Cholesky passed")