PyPI - warp-lang - Versions diffs - 0.11.0__py3-none-manylinux2014_x86_64.whl → 1.0.0__py3-none-manylinux2014_x86_64.whl - Mend

warp-lang 0.11.0__py3-none-manylinux2014_x86_64.whl → 1.0.0__py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (170) hide show

warp/__init__.py +8 -0
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +7 -6
warp/build_dll.py +70 -79
warp/builtins.py +10 -6
warp/codegen.py +51 -19
warp/config.py +7 -8
warp/constants.py +3 -0
warp/context.py +948 -245
warp/dlpack.py +198 -113
warp/examples/assets/bunny.usd +0 -0
warp/examples/assets/cartpole.urdf +110 -0
warp/examples/assets/crazyflie.usd +0 -0
warp/examples/assets/cube.usda +42 -0
warp/examples/assets/nv_ant.xml +92 -0
warp/examples/assets/nv_humanoid.xml +183 -0
warp/examples/assets/quadruped.urdf +268 -0
warp/examples/assets/rocks.nvdb +0 -0
warp/examples/assets/rocks.usd +0 -0
warp/examples/assets/sphere.usda +56 -0
warp/examples/assets/torus.usda +105 -0
warp/examples/benchmarks/benchmark_api.py +383 -0
warp/examples/benchmarks/benchmark_cloth.py +279 -0
warp/examples/benchmarks/benchmark_cloth_cupy.py +88 -0
warp/examples/benchmarks/benchmark_cloth_jax.py +100 -0
warp/examples/benchmarks/benchmark_cloth_numba.py +142 -0
warp/examples/benchmarks/benchmark_cloth_numpy.py +77 -0
warp/examples/benchmarks/benchmark_cloth_pytorch.py +86 -0
warp/examples/benchmarks/benchmark_cloth_taichi.py +112 -0
warp/examples/benchmarks/benchmark_cloth_warp.py +146 -0
warp/examples/benchmarks/benchmark_launches.py +295 -0
warp/examples/core/example_dem.py +221 -0
warp/examples/core/example_fluid.py +267 -0
warp/examples/core/example_graph_capture.py +129 -0
warp/examples/core/example_marching_cubes.py +177 -0
warp/examples/core/example_mesh.py +154 -0
warp/examples/core/example_mesh_intersect.py +193 -0
warp/examples/core/example_nvdb.py +169 -0
warp/examples/core/example_raycast.py +89 -0
warp/examples/core/example_raymarch.py +178 -0
warp/examples/core/example_render_opengl.py +141 -0
warp/examples/core/example_sph.py +389 -0
warp/examples/core/example_torch.py +181 -0
warp/examples/core/example_wave.py +249 -0
warp/examples/fem/bsr_utils.py +380 -0
warp/examples/fem/example_apic_fluid.py +391 -0
warp/examples/fem/example_convection_diffusion.py +168 -0
warp/examples/fem/example_convection_diffusion_dg.py +209 -0
warp/examples/fem/example_convection_diffusion_dg0.py +194 -0
warp/examples/fem/example_deformed_geometry.py +159 -0
warp/examples/fem/example_diffusion.py +173 -0
warp/examples/fem/example_diffusion_3d.py +152 -0
warp/examples/fem/example_diffusion_mgpu.py +214 -0
warp/examples/fem/example_mixed_elasticity.py +222 -0
warp/examples/fem/example_navier_stokes.py +243 -0
warp/examples/fem/example_stokes.py +192 -0
warp/examples/fem/example_stokes_transfer.py +249 -0
warp/examples/fem/mesh_utils.py +109 -0
warp/examples/fem/plot_utils.py +287 -0
warp/examples/optim/example_bounce.py +248 -0
warp/examples/optim/example_cloth_throw.py +210 -0
warp/examples/optim/example_diffray.py +535 -0
warp/examples/optim/example_drone.py +850 -0
warp/examples/optim/example_inverse_kinematics.py +169 -0
warp/examples/optim/example_inverse_kinematics_torch.py +170 -0
warp/examples/optim/example_spring_cage.py +234 -0
warp/examples/optim/example_trajectory.py +201 -0
warp/examples/sim/example_cartpole.py +128 -0
warp/examples/sim/example_cloth.py +184 -0
warp/examples/sim/example_granular.py +113 -0
warp/examples/sim/example_granular_collision_sdf.py +185 -0
warp/examples/sim/example_jacobian_ik.py +213 -0
warp/examples/sim/example_particle_chain.py +106 -0
warp/examples/sim/example_quadruped.py +179 -0
warp/examples/sim/example_rigid_chain.py +191 -0
warp/examples/sim/example_rigid_contact.py +176 -0
warp/examples/sim/example_rigid_force.py +126 -0
warp/examples/sim/example_rigid_gyroscopic.py +97 -0
warp/examples/sim/example_rigid_soft_contact.py +124 -0
warp/examples/sim/example_soft_body.py +178 -0
warp/fabric.py +29 -20
warp/fem/cache.py +0 -1
warp/fem/dirichlet.py +0 -2
warp/fem/integrate.py +0 -1
warp/jax.py +45 -0
warp/jax_experimental.py +339 -0
warp/native/builtin.h +12 -0
warp/native/bvh.cu +18 -18
warp/native/clang/clang.cpp +8 -3
warp/native/cuda_util.cpp +94 -5
warp/native/cuda_util.h +35 -6
warp/native/cutlass_gemm.cpp +1 -1
warp/native/cutlass_gemm.cu +4 -1
warp/native/error.cpp +66 -0
warp/native/error.h +27 -0
warp/native/mesh.cu +2 -2
warp/native/reduce.cu +4 -4
warp/native/runlength_encode.cu +2 -2
warp/native/scan.cu +2 -2
warp/native/sparse.cu +0 -1
warp/native/temp_buffer.h +2 -2
warp/native/warp.cpp +95 -60
warp/native/warp.cu +1053 -218
warp/native/warp.h +49 -32
warp/optim/linear.py +33 -16
warp/render/render_opengl.py +202 -101
warp/render/render_usd.py +82 -40
warp/sim/__init__.py +13 -4
warp/sim/articulation.py +4 -5
warp/sim/collide.py +320 -175
warp/sim/import_mjcf.py +25 -30
warp/sim/import_urdf.py +94 -63
warp/sim/import_usd.py +51 -36
warp/sim/inertia.py +3 -2
warp/sim/integrator.py +233 -0
warp/sim/integrator_euler.py +447 -469
warp/sim/integrator_featherstone.py +1991 -0
warp/sim/integrator_xpbd.py +1420 -640
warp/sim/model.py +765 -487
warp/sim/particles.py +2 -1
warp/sim/render.py +35 -13
warp/sim/utils.py +222 -11
warp/stubs.py +8 -0
warp/tape.py +16 -1
warp/tests/aux_test_grad_customs.py +23 -0
warp/tests/test_array.py +190 -1
warp/tests/test_async.py +656 -0
warp/tests/test_bool.py +50 -0
warp/tests/test_dlpack.py +164 -11
warp/tests/test_examples.py +166 -74
warp/tests/test_fem.py +8 -1
warp/tests/test_generics.py +15 -5
warp/tests/test_grad.py +1 -1
warp/tests/test_grad_customs.py +172 -12
warp/tests/test_jax.py +254 -0
warp/tests/test_large.py +29 -6
warp/tests/test_launch.py +25 -0
warp/tests/test_linear_solvers.py +20 -3
warp/tests/test_matmul.py +61 -16
warp/tests/test_matmul_lite.py +13 -13
warp/tests/test_mempool.py +186 -0
warp/tests/test_multigpu.py +3 -0
warp/tests/test_options.py +16 -2
warp/tests/test_peer.py +137 -0
warp/tests/test_print.py +3 -1
warp/tests/test_quat.py +23 -0
warp/tests/test_sim_kinematics.py +97 -0
warp/tests/test_snippet.py +126 -3
warp/tests/test_streams.py +108 -79
warp/tests/test_torch.py +16 -8
warp/tests/test_utils.py +32 -27
warp/tests/test_verify_fp.py +65 -0
warp/tests/test_volume.py +1 -1
warp/tests/unittest_serial.py +2 -0
warp/tests/unittest_suites.py +12 -0
warp/tests/unittest_utils.py +14 -7
warp/thirdparty/unittest_parallel.py +15 -3
warp/torch.py +10 -8
warp/types.py +363 -246
warp/utils.py +143 -19
warp_lang-1.0.0.dist-info/LICENSE.md +126 -0
warp_lang-1.0.0.dist-info/METADATA +394 -0
{warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/RECORD +167 -86
warp/sim/optimizer.py +0 -138
warp_lang-0.11.0.dist-info/LICENSE.md +0 -36
warp_lang-0.11.0.dist-info/METADATA +0 -238
/warp/tests/{walkthough_debug.py → walkthrough_debug.py} +0 -0
{warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/WHEEL +0 -0
{warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/top_level.txt +0 -0

warp/examples/fem/example_diffusion.py ADDED Viewed

@@ -0,0 +1,173 @@
+# Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+###########################################################################
+# Example Diffusion
+#
+# This example solves a 2d diffusion problem:
+#
+# -nu Div(Grad u) = 1
+#
+# with Dirichlet boundary conditions on vertical edges and
+# homogeneous Neumann on horizontal edges.
+###########################################################################
+import argparse
+import warp as wp
+import warp.fem as fem
+from warp.sparse import bsr_axpy
+from warp.fem.utils import array_axpy
+# Import example utilities
+# Make sure that works both when imported as module and run as standalone file
+try:
+    from .bsr_utils import bsr_cg
+    from .mesh_utils import gen_trimesh, gen_quadmesh
+    from .plot_utils import Plot
+except ImportError:
+    from bsr_utils import bsr_cg
+    from mesh_utils import gen_trimesh, gen_quadmesh
+    from plot_utils import Plot
+wp.init()
+@fem.integrand
+def linear_form(
+    s: fem.Sample,
+    v: fem.Field,
+):
+    """Linear form with constant slope 1 -- forcing term of our problem"""
+    return v(s)
+@fem.integrand
+def diffusion_form(s: fem.Sample, u: fem.Field, v: fem.Field, nu: float):
+    """Diffusion bilinear form with constant coefficient ``nu``"""
+    return nu * wp.dot(
+        fem.grad(u, s),
+        fem.grad(v, s),
+    )
+@fem.integrand
+def y_boundary_value_form(s: fem.Sample, domain: fem.Domain, v: fem.Field, val: float):
+    """Linear form with coefficient val on vertical edges, zero elsewhere"""
+    nor = fem.normal(domain, s)
+    return val * v(s) * wp.abs(nor[0])
+@fem.integrand
+def y_boundary_projector_form(
+    s: fem.Sample,
+    domain: fem.Domain,
+    u: fem.Field,
+    v: fem.Field,
+):
+    """
+    Bilinear boundary condition projector form, non-zero on vertical edges only.
+    """
+    # Reuse the above linear form implementation by evaluating one of the participating field and passing it as a normal scalar argument.
+    return y_boundary_value_form(s, domain, v, u(s))
+# 2D diffusion example: builds the geometry and a scalar function space,
+# assembles and solves the linear system in step(), and hands the resulting
+# field to a Plot renderer in render().
+class Example:
+    # Command-line options; also the source of default values when the example
+    # is constructed from keyword arguments instead of a parsed namespace.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--resolution", type=int, default=50)
+    parser.add_argument("--degree", type=int, default=2)
+    parser.add_argument("--serendipity", action="store_true", default=False)
+    parser.add_argument("--viscosity", type=float, default=2.0)
+    parser.add_argument("--boundary_value", type=float, default=5.0)
+    parser.add_argument("--boundary_compliance", type=float, default=0, help="Dirichlet boundary condition compliance")
+    parser.add_argument("--mesh", choices=("grid", "tri", "quad"), default="grid", help="Mesh type")
+    # stage:  passed straight to Plot()
+    # quiet:  passed to bsr_cg() in step()
+    # args:   parsed argparse namespace; when None, one is built from **kwargs
+    #         and completed with the parser defaults
+    def __init__(self, stage=None, quiet=False, args=None, **kwargs):
+        if args is None:
+            # Read args from kwargs, add default arg values from parser
+            args = argparse.Namespace(**kwargs)
+            args = Example.parser.parse_args(args=[], namespace=args)
+        self._args = args
+        self._quiet = quiet
+        # Grid, triangle mesh or quadrilateral mesh geometry, selected by --mesh
+        if args.mesh == "tri":
+            positions, tri_vidx = gen_trimesh(res=wp.vec2i(args.resolution))
+            self._geo = fem.Trimesh2D(tri_vertex_indices=tri_vidx, positions=positions)
+        elif args.mesh == "quad":
+            positions, quad_vidx = gen_quadmesh(res=wp.vec2i(args.resolution))
+            self._geo = fem.Quadmesh2D(quad_vertex_indices=quad_vidx, positions=positions)
+        else:
+            self._geo = fem.Grid2D(res=wp.vec2i(args.resolution))
+        # Scalar function space
+        # (element_basis=None leaves the basis choice to make_polynomial_space)
+        element_basis = fem.ElementBasis.SERENDIPITY if args.serendipity else None
+        self._scalar_space = fem.make_polynomial_space(self._geo, degree=args.degree, element_basis=element_basis)
+        # Scalar field over our function space
+        self._scalar_field = self._scalar_space.make_field()
+        self.renderer = Plot(stage)
+    # Assembles the diffusion system with its boundary conditions, solves it
+    # with Conjugate Gradient and stores the solution in the scalar field.
+    def step(self):
+        args = self._args
+        geo = self._geo
+        domain = fem.Cells(geometry=geo)
+        # Right-hand-side (forcing term)
+        test = fem.make_test(space=self._scalar_space, domain=domain)
+        rhs = fem.integrate(linear_form, fields={"v": test})
+        # Diffusion form
+        trial = fem.make_trial(space=self._scalar_space, domain=domain)
+        matrix = fem.integrate(diffusion_form, fields={"u": trial, "v": test}, values={"nu": args.viscosity})
+        # Boundary conditions on Y sides
+        # Use nodal integration so that boundary conditions are specified on each node independently
+        boundary = fem.BoundarySides(geo)
+        bd_test = fem.make_test(space=self._scalar_space, domain=boundary)
+        bd_trial = fem.make_trial(space=self._scalar_space, domain=boundary)
+        bd_matrix = fem.integrate(y_boundary_projector_form, fields={"u": bd_trial, "v": bd_test}, nodal=True)
+        bd_rhs = fem.integrate(
+            y_boundary_value_form, fields={"v": bd_test}, values={"val": args.boundary_value}, nodal=True
+        )
+        # Assemble linear system
+        if args.boundary_compliance == 0.0:
+            # Hard BC: project linear system
+            # NOTE(review): the return value is not captured, so matrix and rhs
+            # are presumably modified in place -- confirm against warp.fem.
+            fem.project_linear_system(matrix, rhs, bd_matrix, bd_rhs)
+        else:
+            # Weak BC: add together diffusion and boundary condition matrices,
+            # scaling the boundary terms by the inverse of the compliance
+            boundary_strength = 1.0 / args.boundary_compliance
+            bsr_axpy(x=bd_matrix, y=matrix, alpha=boundary_strength, beta=1)
+            array_axpy(x=bd_rhs, y=rhs, alpha=boundary_strength, beta=1)
+        # Solve linear system using Conjugate Gradient
+        x = wp.zeros_like(rhs)
+        bsr_cg(matrix, b=rhs, x=x, quiet=self._quiet)
+        # Assign system result to our discrete field
+        self._scalar_field.dof_values = x
+    # Registers the current solution field with the renderer as a surface
+    # named "solution".
+    def render(self):
+        self.renderer.add_surface("solution", self._scalar_field)
+if __name__ == "__main__":
+    wp.set_module_options({"enable_backward": False})
+    args = Example.parser.parse_args()
+    example = Example(args=args)
+    example.step()
+    example.render()
+    example.renderer.plot()

warp/examples/fem/example_diffusion_3d.py ADDED Viewed

@@ -0,0 +1,152 @@
+# Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+###########################################################################
+# Example Diffusion 3D
+#
+# This example solves a 3d diffusion problem:
+#
+# -nu Div(Grad u) = 1
+#
+# with homogeneous Neumann conditions on horizontal sides
+# and homogeneous Dirichlet boundary conditions on the other sides.
+###########################################################################
+import argparse
+import warp as wp
+import warp.fem as fem
+from warp.sparse import bsr_axpy
+# Import example utilities
+# Make sure that works both when imported as module and run as standalone file
+try:
+    from .example_diffusion import diffusion_form, linear_form
+    from .bsr_utils import bsr_cg
+    from .mesh_utils import gen_tetmesh
+    from .plot_utils import Plot
+except ImportError:
+    from example_diffusion import diffusion_form, linear_form
+    from bsr_utils import bsr_cg
+    from mesh_utils import gen_tetmesh, gen_hexmesh
+    from plot_utils import Plot
+wp.init()
+@fem.integrand
+def vert_boundary_projector_form(
+    s: fem.Sample,
+    domain: fem.Domain,
+    u: fem.Field,
+    v: fem.Field,
+):
+    # Non-zero mass on vertical sides only
+    w = 1.0 - wp.abs(fem.normal(domain, s)[1])
+    return w * u(s) * v(s)
+# 3D diffusion example: builds a grid, tetrahedral or hexahedral geometry and
+# a scalar function space, assembles and solves the linear system in step(),
+# and hands the resulting field to a Plot renderer in render().
+class Example:
+    # Command-line options; also the source of default values when the example
+    # is constructed from keyword arguments instead of a parsed namespace.
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--resolution", type=int, default=10)
+    parser.add_argument("--degree", type=int, default=2)
+    parser.add_argument("--serendipity", action="store_true", default=False)
+    parser.add_argument("--viscosity", type=float, default=2.0)
+    parser.add_argument("--boundary_compliance", type=float, default=0, help="Dirichlet boundary condition compliance")
+    parser.add_argument("--mesh", choices=("grid", "tet", "hex"), default="grid", help="Mesh type")
+    # stage:  passed straight to Plot()
+    # quiet:  passed to bsr_cg() in step()
+    # args:   parsed argparse namespace; when None, one is built from **kwargs
+    #         and completed with the parser defaults
+    def __init__(self, stage=None, quiet=False, args=None, **kwargs):
+        if args is None:
+            # Read args from kwargs, add default arg values from parser
+            args = argparse.Namespace(**kwargs)
+            args = Example.parser.parse_args(args=[], namespace=args)
+        self._args = args
+        self._quiet = quiet
+        # Per-axis resolution (1 : 1/2 : 2), in the same proportions as the
+        # (1.0, 0.5, 2.0) bounds used for every geometry below
+        res = wp.vec3i(args.resolution, args.resolution // 2, args.resolution * 2)
+        if args.mesh == "tet":
+            pos, tet_vtx_indices = gen_tetmesh(
+                res=res,
+                bounds_lo=wp.vec3(0.0, 0.0, 0.0),
+                bounds_hi=wp.vec3(1.0, 0.5, 2.0),
+            )
+            self._geo = fem.Tetmesh(tet_vtx_indices, pos)
+        elif args.mesh == "hex":
+            pos, hex_vtx_indices = gen_hexmesh(
+                res=res,
+                bounds_lo=wp.vec3(0.0, 0.0, 0.0),
+                bounds_hi=wp.vec3(1.0, 0.5, 2.0),
+            )
+            self._geo = fem.Hexmesh(hex_vtx_indices, pos)
+        else:
+            self._geo = fem.Grid3D(
+                res=res,
+                bounds_lo=wp.vec3(0.0, 0.0, 0.0),
+                bounds_hi=wp.vec3(1.0, 0.5, 2.0),
+            )
+        # Domain and function spaces
+        # (element_basis=None leaves the basis choice to make_polynomial_space)
+        element_basis = fem.ElementBasis.SERENDIPITY if args.serendipity else None
+        self._scalar_space = fem.make_polynomial_space(self._geo, degree=args.degree, element_basis=element_basis)
+        # Scalar field over our function space
+        self._scalar_field: fem.DiscreteField = self._scalar_space.make_field()
+        self.renderer = Plot(stage)
+    # Assembles the diffusion system with its boundary conditions, solves it
+    # with Conjugate Gradient and stores the solution in the scalar field.
+    def step(self):
+        args = self._args
+        geo = self._geo
+        domain = fem.Cells(geometry=geo)
+        # Right-hand-side
+        test = fem.make_test(space=self._scalar_space, domain=domain)
+        rhs = fem.integrate(linear_form, fields={"v": test})
+        # Boundary condition matrix, weighted by 1 - |n_y| (see vert_boundary_projector_form)
+        with wp.ScopedTimer("Integrate"):
+            boundary = fem.BoundarySides(geo)
+            bd_test = fem.make_test(space=self._scalar_space, domain=boundary)
+            bd_trial = fem.make_trial(space=self._scalar_space, domain=boundary)
+            bd_matrix = fem.integrate(vert_boundary_projector_form, fields={"u": bd_trial, "v": bd_test}, nodal=True)
+            # Diffusion form
+            trial = fem.make_trial(space=self._scalar_space, domain=domain)
+            matrix = fem.integrate(diffusion_form, fields={"u": trial, "v": test}, values={"nu": args.viscosity})
+        if args.boundary_compliance == 0.0:
+            # Hard BC: project linear system
+            # The prescribed boundary value is zero, hence the all-zero bd_rhs.
+            # NOTE(review): the return value is not captured, so matrix and rhs
+            # are presumably modified in place -- confirm against warp.fem.
+            bd_rhs = wp.zeros_like(rhs)
+            fem.project_linear_system(matrix, rhs, bd_matrix, bd_rhs)
+        else:
+            # Weak BC: add together diffusion and boundary condition matrices,
+            # scaling the boundary matrix by the inverse of the compliance.
+            # No boundary term is added to rhs in this branch.
+            boundary_strength = 1.0 / args.boundary_compliance
+            bsr_axpy(x=bd_matrix, y=matrix, alpha=boundary_strength, beta=1)
+        with wp.ScopedTimer("CG solve"):
+            x = wp.zeros_like(rhs)
+            bsr_cg(matrix, b=rhs, x=x, quiet=self._quiet)
+            self._scalar_field.dof_values = x
+    # Registers the current solution field with the renderer as a volume
+    # named "solution".
+    def render(self):
+        self.renderer.add_volume("solution", self._scalar_field)
+if __name__ == "__main__":
+    wp.set_module_options({"enable_backward": False})
+    args = Example.parser.parse_args()
+    example = Example(args=args)
+    example.step()
+    example.render()
+    example.renderer.plot()

warp/examples/fem/example_diffusion_mgpu.py ADDED Viewed

@@ -0,0 +1,214 @@
+# Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+###########################################################################
+# Example Diffusion MGPU
+#
+# This example illustrates using domain decomposition to
+# solve a diffusion PDE over multiple devices
+###########################################################################
+from typing import Tuple
+import warp as wp
+import warp.fem as fem
+from warp.sparse import bsr_axpy, bsr_mv
+from warp.utils import array_cast
+# Import example utilities
+# Make sure that works both when imported as module and run as standalone file
+try:
+    from .bsr_utils import bsr_cg
+    from .example_diffusion import diffusion_form, linear_form
+    from .plot_utils import Plot
+except ImportError:
+    from bsr_utils import bsr_cg
+    from example_diffusion import diffusion_form, linear_form
+    from plot_utils import Plot
+wp.init()
+@fem.integrand
+def mass_form(
+    s: fem.Sample,
+    u: fem.Field,
+    v: fem.Field,
+):
+    return u(s) * v(s)
+@wp.kernel
+def scal_kernel(a: wp.array(dtype=wp.float64), res: wp.array(dtype=wp.float64), alpha: wp.float64):
+    res[wp.tid()] = a[wp.tid()] * alpha
+@wp.kernel
+def sum_kernel(a: wp.indexedarray(dtype=wp.float64), b: wp.array(dtype=wp.float64)):
+    a[wp.tid()] = a[wp.tid()] + b[wp.tid()]
+def sum_vecs(vecs, indices, sum: wp.array, tmp: wp.array):
+    for v, idx in zip(vecs, indices):
+        wp.copy(dest=tmp, src=v)
+        idx_sum = wp.indexedarray(sum, idx)
+        wp.launch(kernel=sum_kernel, dim=idx.shape, device=sum.device, inputs=[idx_sum, tmp])
+    return sum
+class DistributedSystem:
+    device = None
+    scalar_type: type
+    tmp_buf: wp.array
+    nrow: int
+    shape = Tuple[int, int]
+    rank_data = None
+    def mv_routine(self, x: wp.array, y: wp.array, z: wp.array, alpha=1.0, beta=0.0):
+        """Distributed matrix-vector multiplication routine, for example purposes"""
+        tmp = self.tmp_buf
+        wp.launch(kernel=scal_kernel, dim=y.shape, device=y.device, inputs=[y, z, wp.float64(beta)])
+        stream = wp.get_stream()
+        for mat_i, x_i, y_i, idx in zip(*self.rank_data):
+            # WAR copy with indexed array requiring matching shape
+            tmp_i = wp.array(
+                ptr=tmp.ptr, device=tmp.device, capacity=tmp.capacity, dtype=tmp.dtype, shape=idx.shape
+            )
+            # Compress rhs on rank 0
+            x_idx = wp.indexedarray(x, idx)
+            wp.copy(dest=tmp_i, src=x_idx, count=idx.size, stream=stream)
+            # Send to rank i
+            wp.copy(dest=x_i, src=tmp_i, count=idx.size, stream=stream)
+            with wp.ScopedDevice(x_i.device):
+                wp.wait_stream(stream)
+                bsr_mv(A=mat_i, x=x_i, y=y_i, alpha=alpha, beta=0.0)
+            wp.wait_stream(wp.get_stream(x_i.device))
+            # Back to rank 0 for sum
+            wp.copy(dest=tmp_i, src=y_i, count=idx.size, stream=stream)
+            z_idx = wp.indexedarray(z, idx)
+            wp.launch(kernel=sum_kernel, dim=idx.shape, device=z_idx.device, inputs=[z_idx, tmp_i], stream=stream)
+        wp.wait_stream(stream)
+# Multi-device diffusion example: assembles one local system per CUDA device,
+# sums the local right-hand sides on the main device and solves the global
+# system with a Conjugate Gradient driven by DistributedSystem.mv_routine.
+class Example:
+    # stage:  passed straight to Plot()
+    # quiet:  passed to bsr_cg() in step()
+    def __init__(self, stage=None, quiet=False):
+        # Multiplier applied to the boundary mass matrix in _assemble_local_system()
+        self._bd_weight = 100.0
+        self._quiet = quiet
+        self._geo = fem.Grid2D(res=wp.vec2i(25))
+        # The function space and the solution field are created on this device
+        self._main_device = wp.get_device("cuda")
+        with wp.ScopedDevice(self._main_device):
+            self._scalar_space = fem.make_polynomial_space(self._geo, degree=3)
+            self._scalar_field = self._scalar_space.make_field()
+        self.renderer = Plot(stage)
+    # Builds the per-device systems, runs the distributed CG solve and copies
+    # the result into the scalar field.
+    def step(self):
+        devices = wp.get_cuda_devices()
+        main_device = self._main_device
+        rhs_vecs = []
+        res_vecs = []
+        matrices = []
+        indices = []
+        # Build local system for each device
+        for k, device in enumerate(devices):
+            with wp.ScopedDevice(device):
+                # Construct the partition corresponding to the k'th device
+                geo_partition = fem.LinearGeometryPartition(self._geo, k, len(devices))
+                matrix, rhs, partition_node_indices = self._assemble_local_system(geo_partition)
+                rhs_vecs.append(rhs)
+                res_vecs.append(wp.empty_like(rhs))
+                matrices.append(matrix)
+                # Node indices are kept on the main device, where they are used
+                # to index the global vectors
+                indices.append(partition_node_indices.to(main_device))
+        # Global rhs as sum of all local rhs
+        glob_rhs = wp.zeros(n=self._scalar_space.node_count(), dtype=wp.float64, device=main_device)
+        # This temporary buffer will be used for peer-to-peer copying during graph capture,
+        # so we allocate it using the default CUDA allocator.  This ensures that the copying
+        # will succeed without enabling mempool access between devices, which is not supported
+        # on all systems.
+        with wp.ScopedMempool(main_device, False):
+            tmp = wp.empty_like(glob_rhs)
+        sum_vecs(rhs_vecs, indices, glob_rhs, tmp)
+        # Distributed CG
+        global_res = wp.zeros_like(glob_rhs)
+        A = DistributedSystem()
+        A.device = main_device
+        A.dtype = glob_rhs.dtype
+        A.nrow = self._scalar_space.node_count()
+        A.shape = (A.nrow, A.nrow)
+        A.tmp_buf = tmp
+        # The local rhs vectors are passed as the per-rank input buffers of
+        # mv_routine, which overwrites them
+        A.rank_data = (matrices, rhs_vecs, res_vecs, indices)
+        with wp.ScopedDevice(main_device):
+            bsr_cg(
+                A,
+                x=global_res,
+                b=glob_rhs,
+                use_diag_precond=False,
+                quiet=self._quiet,
+                mv_routine=A.mv_routine
+            )
+        # The solve runs in float64; cast into the field's own dof array
+        array_cast(in_array=global_res, out_array=self._scalar_field.dof_values)
+    # Registers the current solution field with the renderer as a surface
+    # named "solution".
+    def render(self):
+        self.renderer.add_surface("solution", self._scalar_field)
+    # Assembles the system restricted to one geometry partition.
+    # Returns (matrix, rhs, node indices of the space partition).
+    def _assemble_local_system(self, geo_partition: fem.GeometryPartition):
+        scalar_space = self._scalar_space
+        space_partition = fem.make_space_partition(scalar_space, geo_partition)
+        domain = fem.Cells(geometry=geo_partition)
+        # Right-hand-side
+        test = fem.make_test(space=scalar_space, space_partition=space_partition, domain=domain)
+        rhs = fem.integrate(linear_form, fields={"v": test})
+        # Weakly-imposed boundary conditions on all sides
+        boundary = fem.BoundarySides(geometry=geo_partition)
+        bd_test = fem.make_test(space=scalar_space, space_partition=space_partition, domain=boundary)
+        bd_trial = fem.make_trial(space=scalar_space, space_partition=space_partition, domain=boundary)
+        bd_matrix = fem.integrate(mass_form, fields={"u": bd_trial, "v": bd_test})
+        # Diffusion form
+        trial = fem.make_trial(space=scalar_space, space_partition=space_partition, domain=domain)
+        matrix = fem.integrate(diffusion_form, fields={"u": trial, "v": test}, values={"nu": 1.0})
+        # Add the boundary mass matrix, scaled by the boundary weight, to the diffusion matrix
+        bsr_axpy(y=matrix, x=bd_matrix, alpha=self._bd_weight)
+        return matrix, rhs, space_partition.space_node_indices()
+if __name__ == "__main__":
+    wp.set_module_options({"enable_backward": False})
+    example = Example()
+    example.step()
+    example.render()
+    example.renderer.plot()