PyPI - warp-lang - Versions diffs - 0.11.0__py3-none-manylinux2014_x86_64.whl → 1.0.0__py3-none-manylinux2014_x86_64.whl - Mend

warp-lang 0.11.0__py3-none-manylinux2014_x86_64.whl → 1.0.0__py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (170) hide show

warp/__init__.py +8 -0
warp/bin/warp-clang.so +0 -0
warp/bin/warp.so +0 -0
warp/build.py +7 -6
warp/build_dll.py +70 -79
warp/builtins.py +10 -6
warp/codegen.py +51 -19
warp/config.py +7 -8
warp/constants.py +3 -0
warp/context.py +948 -245
warp/dlpack.py +198 -113
warp/examples/assets/bunny.usd +0 -0
warp/examples/assets/cartpole.urdf +110 -0
warp/examples/assets/crazyflie.usd +0 -0
warp/examples/assets/cube.usda +42 -0
warp/examples/assets/nv_ant.xml +92 -0
warp/examples/assets/nv_humanoid.xml +183 -0
warp/examples/assets/quadruped.urdf +268 -0
warp/examples/assets/rocks.nvdb +0 -0
warp/examples/assets/rocks.usd +0 -0
warp/examples/assets/sphere.usda +56 -0
warp/examples/assets/torus.usda +105 -0
warp/examples/benchmarks/benchmark_api.py +383 -0
warp/examples/benchmarks/benchmark_cloth.py +279 -0
warp/examples/benchmarks/benchmark_cloth_cupy.py +88 -0
warp/examples/benchmarks/benchmark_cloth_jax.py +100 -0
warp/examples/benchmarks/benchmark_cloth_numba.py +142 -0
warp/examples/benchmarks/benchmark_cloth_numpy.py +77 -0
warp/examples/benchmarks/benchmark_cloth_pytorch.py +86 -0
warp/examples/benchmarks/benchmark_cloth_taichi.py +112 -0
warp/examples/benchmarks/benchmark_cloth_warp.py +146 -0
warp/examples/benchmarks/benchmark_launches.py +295 -0
warp/examples/core/example_dem.py +221 -0
warp/examples/core/example_fluid.py +267 -0
warp/examples/core/example_graph_capture.py +129 -0
warp/examples/core/example_marching_cubes.py +177 -0
warp/examples/core/example_mesh.py +154 -0
warp/examples/core/example_mesh_intersect.py +193 -0
warp/examples/core/example_nvdb.py +169 -0
warp/examples/core/example_raycast.py +89 -0
warp/examples/core/example_raymarch.py +178 -0
warp/examples/core/example_render_opengl.py +141 -0
warp/examples/core/example_sph.py +389 -0
warp/examples/core/example_torch.py +181 -0
warp/examples/core/example_wave.py +249 -0
warp/examples/fem/bsr_utils.py +380 -0
warp/examples/fem/example_apic_fluid.py +391 -0
warp/examples/fem/example_convection_diffusion.py +168 -0
warp/examples/fem/example_convection_diffusion_dg.py +209 -0
warp/examples/fem/example_convection_diffusion_dg0.py +194 -0
warp/examples/fem/example_deformed_geometry.py +159 -0
warp/examples/fem/example_diffusion.py +173 -0
warp/examples/fem/example_diffusion_3d.py +152 -0
warp/examples/fem/example_diffusion_mgpu.py +214 -0
warp/examples/fem/example_mixed_elasticity.py +222 -0
warp/examples/fem/example_navier_stokes.py +243 -0
warp/examples/fem/example_stokes.py +192 -0
warp/examples/fem/example_stokes_transfer.py +249 -0
warp/examples/fem/mesh_utils.py +109 -0
warp/examples/fem/plot_utils.py +287 -0
warp/examples/optim/example_bounce.py +248 -0
warp/examples/optim/example_cloth_throw.py +210 -0
warp/examples/optim/example_diffray.py +535 -0
warp/examples/optim/example_drone.py +850 -0
warp/examples/optim/example_inverse_kinematics.py +169 -0
warp/examples/optim/example_inverse_kinematics_torch.py +170 -0
warp/examples/optim/example_spring_cage.py +234 -0
warp/examples/optim/example_trajectory.py +201 -0
warp/examples/sim/example_cartpole.py +128 -0
warp/examples/sim/example_cloth.py +184 -0
warp/examples/sim/example_granular.py +113 -0
warp/examples/sim/example_granular_collision_sdf.py +185 -0
warp/examples/sim/example_jacobian_ik.py +213 -0
warp/examples/sim/example_particle_chain.py +106 -0
warp/examples/sim/example_quadruped.py +179 -0
warp/examples/sim/example_rigid_chain.py +191 -0
warp/examples/sim/example_rigid_contact.py +176 -0
warp/examples/sim/example_rigid_force.py +126 -0
warp/examples/sim/example_rigid_gyroscopic.py +97 -0
warp/examples/sim/example_rigid_soft_contact.py +124 -0
warp/examples/sim/example_soft_body.py +178 -0
warp/fabric.py +29 -20
warp/fem/cache.py +0 -1
warp/fem/dirichlet.py +0 -2
warp/fem/integrate.py +0 -1
warp/jax.py +45 -0
warp/jax_experimental.py +339 -0
warp/native/builtin.h +12 -0
warp/native/bvh.cu +18 -18
warp/native/clang/clang.cpp +8 -3
warp/native/cuda_util.cpp +94 -5
warp/native/cuda_util.h +35 -6
warp/native/cutlass_gemm.cpp +1 -1
warp/native/cutlass_gemm.cu +4 -1
warp/native/error.cpp +66 -0
warp/native/error.h +27 -0
warp/native/mesh.cu +2 -2
warp/native/reduce.cu +4 -4
warp/native/runlength_encode.cu +2 -2
warp/native/scan.cu +2 -2
warp/native/sparse.cu +0 -1
warp/native/temp_buffer.h +2 -2
warp/native/warp.cpp +95 -60
warp/native/warp.cu +1053 -218
warp/native/warp.h +49 -32
warp/optim/linear.py +33 -16
warp/render/render_opengl.py +202 -101
warp/render/render_usd.py +82 -40
warp/sim/__init__.py +13 -4
warp/sim/articulation.py +4 -5
warp/sim/collide.py +320 -175
warp/sim/import_mjcf.py +25 -30
warp/sim/import_urdf.py +94 -63
warp/sim/import_usd.py +51 -36
warp/sim/inertia.py +3 -2
warp/sim/integrator.py +233 -0
warp/sim/integrator_euler.py +447 -469
warp/sim/integrator_featherstone.py +1991 -0
warp/sim/integrator_xpbd.py +1420 -640
warp/sim/model.py +765 -487
warp/sim/particles.py +2 -1
warp/sim/render.py +35 -13
warp/sim/utils.py +222 -11
warp/stubs.py +8 -0
warp/tape.py +16 -1
warp/tests/aux_test_grad_customs.py +23 -0
warp/tests/test_array.py +190 -1
warp/tests/test_async.py +656 -0
warp/tests/test_bool.py +50 -0
warp/tests/test_dlpack.py +164 -11
warp/tests/test_examples.py +166 -74
warp/tests/test_fem.py +8 -1
warp/tests/test_generics.py +15 -5
warp/tests/test_grad.py +1 -1
warp/tests/test_grad_customs.py +172 -12
warp/tests/test_jax.py +254 -0
warp/tests/test_large.py +29 -6
warp/tests/test_launch.py +25 -0
warp/tests/test_linear_solvers.py +20 -3
warp/tests/test_matmul.py +61 -16
warp/tests/test_matmul_lite.py +13 -13
warp/tests/test_mempool.py +186 -0
warp/tests/test_multigpu.py +3 -0
warp/tests/test_options.py +16 -2
warp/tests/test_peer.py +137 -0
warp/tests/test_print.py +3 -1
warp/tests/test_quat.py +23 -0
warp/tests/test_sim_kinematics.py +97 -0
warp/tests/test_snippet.py +126 -3
warp/tests/test_streams.py +108 -79
warp/tests/test_torch.py +16 -8
warp/tests/test_utils.py +32 -27
warp/tests/test_verify_fp.py +65 -0
warp/tests/test_volume.py +1 -1
warp/tests/unittest_serial.py +2 -0
warp/tests/unittest_suites.py +12 -0
warp/tests/unittest_utils.py +14 -7
warp/thirdparty/unittest_parallel.py +15 -3
warp/torch.py +10 -8
warp/types.py +363 -246
warp/utils.py +143 -19
warp_lang-1.0.0.dist-info/LICENSE.md +126 -0
warp_lang-1.0.0.dist-info/METADATA +394 -0
{warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/RECORD +167 -86
warp/sim/optimizer.py +0 -138
warp_lang-0.11.0.dist-info/LICENSE.md +0 -36
warp_lang-0.11.0.dist-info/METADATA +0 -238
/warp/tests/{walkthough_debug.py → walkthrough_debug.py} +0 -0
{warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/WHEEL +0 -0
{warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/top_level.txt +0 -0

warp/examples/benchmarks/benchmark_api.py ADDED Viewed

@@ -0,0 +1,383 @@
+# Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+import gc
+import statistics as stats
+import warp as wp
+# --- device memory switches, applied to every CUDA device in the configuration loop below ---
+ENABLE_MEMPOOLS = False
+ENABLE_PEER_ACCESS = False
+ENABLE_MEMPOOL_ACCESS = False
+ENABLE_MEMPOOL_RELEASE_THRESHOLD = False
+# only applied when ENABLE_MEMPOOL_RELEASE_THRESHOLD is True
+MEMPOOL_RELEASE_THRESHOLD = 1024**3
+# --- measurement options ---
+# when True, every test synchronizes its device inside the timed region
+DO_SYNC = False
+# forwarded to wp.ScopedTimer as print= and use_nvtx=
+VERBOSE = False
+USE_NVTX = False
+# --- benchmark sizes ---
+# number of float elements in every benchmark array
+num_elems = 10000
+# number of repetitions of each test
+num_runs = 10000
+# default number of sorted samples print_stat() discards at each end
+trim_runs = 2500
+@wp.kernel
+def inc_kernel(a: wp.array(dtype=float)):
+    # Warp kernel used by the launch and graph benchmarks:
+    # each thread adds 1.0 to the element of `a` at its own thread index.
+    tid = wp.tid()
+    a[tid] = a[tid] + 1.0
+wp.init()
+# configure devices
+# Apply the ENABLE_* switches to every CUDA device. Each call has its own try/except so that
+# a failing setting is printed and the remaining settings are still attempted.
+for target_device in wp.get_cuda_devices():
+    try:
+        wp.set_mempool_enabled(target_device, ENABLE_MEMPOOLS)
+        if ENABLE_MEMPOOL_RELEASE_THRESHOLD:
+            wp.set_mempool_release_threshold(target_device, MEMPOOL_RELEASE_THRESHOLD)
+    except Exception as e:
+        print(f"Error: {e}")
+    # the inner loop visits every (target, peer) pair, including target == peer
+    for peer_device in wp.get_cuda_devices():
+        try:
+            wp.set_peer_access_enabled(target_device, peer_device, ENABLE_PEER_ACCESS)
+        except Exception as e:
+            print(f"Error: {e}")
+        try:
+            wp.set_mempool_access_enabled(target_device, peer_device, ENABLE_MEMPOOL_ACCESS)
+        except Exception as e:
+            print(f"Error: {e}")
+# the device count decides below whether the second-device (p2p) setup and tests run
+cuda_device_count = wp.get_cuda_device_count()
+# NOTE(review): "cuda:0" is requested unconditionally - presumably the script requires at least one CUDA device; confirm
+cuda0 = wp.get_device("cuda:0")
+# preallocate some arrays
+# host buffers (pageable and pinned) and cuda0 buffers that the copy and launch tests reuse on every run
+arr_host = wp.zeros(num_elems, dtype=float, device="cpu", pinned=False)
+arr_host_pinned = wp.zeros(num_elems, dtype=float, device="cpu", pinned=True)
+arr_cuda0 = wp.zeros(num_elems, dtype=float, device=cuda0)
+arr_cuda0_src = wp.zeros(num_elems, dtype=float, device=cuda0)
+arr_cuda0_dst = wp.zeros(num_elems, dtype=float, device=cuda0)
+# mgpu support
+# cuda1 and arr_cuda1 are only defined when a second CUDA device exists; the p2p tests depend on them
+if cuda_device_count > 1:
+    cuda1 = wp.get_device("cuda:1")
+    arr_cuda1 = wp.zeros(num_elems, dtype=float, device=cuda1)
+# explicit stream on cuda0 used by the "... stream" test variants
+stream0 = wp.Stream(cuda0)
+# preload module
+wp.force_load(cuda0)
+if cuda_device_count > 1:
+    wp.force_load(cuda1)
+# capture graph
+# record a single inc_kernel launch over arr_cuda0 into graph0; test_graph / test_graph_stream replay it
+with wp.ScopedDevice(cuda0):
+    wp.capture_begin()
+    wp.launch(inc_kernel, dim=arr_cuda0.size, inputs=[arr_cuda0])
+    graph0 = wp.capture_end()
+# holds the arrays created by test_alloc / test_zeros so they stay referenced until explicitly cleared
+g_allocs = [None] * num_runs
+def test_alloc(num_elems, device, idx):
+    """Time one wp.empty() allocation of num_elems floats on device.
+    The new array is stored in g_allocs[idx] so it outlives the call.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    # global synchronize happens before the timer is entered
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("alloc", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        g_allocs[idx] = wp.empty(num_elems, dtype=float, device=device)
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return measured.elapsed
+def test_free(device, idx):
+    """Time dropping the reference held in g_allocs[idx].
+    device is only used for the optional synchronization when DO_SYNC is set.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("free", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        # clearing the slot releases this script's reference to the array
+        g_allocs[idx] = None
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return measured.elapsed
+def test_zeros(num_elems, device, idx):
+    """Time one wp.zeros() allocation of num_elems floats on device.
+    The new array is stored in g_allocs[idx] so it outlives the call.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("zeros", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        g_allocs[idx] = wp.zeros(num_elems, dtype=float, device=device)
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return measured.elapsed
+def test_h2d(num_elems, device):
+    """Time a wp.copy() from the pageable host array arr_host into arr_cuda0.
+    num_elems is not used: the preallocated module-level arrays are copied.
+    device is only used for the optional synchronization when DO_SYNC is set.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("h2d", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        wp.copy(arr_cuda0, arr_host)
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return measured.elapsed
+def test_d2h(num_elems, device):
+    """Time a wp.copy() from arr_cuda0 into the pageable host array arr_host.
+    num_elems is not used: the preallocated module-level arrays are copied.
+    device is only used for the optional synchronization when DO_SYNC is set.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("d2h", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        wp.copy(arr_host, arr_cuda0)
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return measured.elapsed
+def test_h2d_pinned(num_elems, device):
+    """Time a wp.copy() from the pinned host array arr_host_pinned into arr_cuda0.
+    num_elems is not used: the preallocated module-level arrays are copied.
+    device is only used for the optional synchronization when DO_SYNC is set.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("h2d pinned", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        wp.copy(arr_cuda0, arr_host_pinned)
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return measured.elapsed
+def test_d2h_pinned(num_elems, device):
+    """Time a wp.copy() from arr_cuda0 into the pinned host array arr_host_pinned.
+    num_elems is not used: the preallocated module-level arrays are copied.
+    device is only used for the optional synchronization when DO_SYNC is set.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("d2h pinned", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        wp.copy(arr_host_pinned, arr_cuda0)
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return measured.elapsed
+def test_d2d(num_elems, device):
+    """Time a wp.copy() between two arrays on cuda0 (arr_cuda0_src into arr_cuda0_dst).
+    num_elems is not used: the preallocated module-level arrays are copied.
+    device is only used for the optional synchronization when DO_SYNC is set.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("d2d", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        wp.copy(arr_cuda0_dst, arr_cuda0_src)
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return measured.elapsed
+def test_p2p(num_elems, src_device, dst_device):
+    """Time a wp.copy() from arr_cuda1 into arr_cuda0.
+    Requires arr_cuda1, which only exists when more than one CUDA device is present.
+    num_elems is not used; src_device and dst_device are only used for the optional
+    synchronization when DO_SYNC is set.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("p2p", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        wp.copy(arr_cuda0, arr_cuda1)
+        if DO_SYNC:
+            # source first, then destination
+            for dev in (src_device, dst_device):
+                wp.synchronize_device(dev)
+    return measured.elapsed
+def test_p2p_stream(num_elems, src_device, dst_device):
+    """Time a wp.copy() from arr_cuda1 into arr_cuda0 issued with stream=stream0.
+    Requires arr_cuda1, which only exists when more than one CUDA device is present.
+    num_elems is not used; src_device and dst_device are only used for the optional
+    synchronization when DO_SYNC is set.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    # bind the stream before synchronizing, outside the timed region
+    copy_stream = stream0
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("p2p stream", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        wp.copy(arr_cuda0, arr_cuda1, stream=copy_stream)
+        if DO_SYNC:
+            # source first, then destination
+            for dev in (src_device, dst_device):
+                wp.synchronize_device(dev)
+    return measured.elapsed
+def test_launch(num_elems, device):
+    """Time one wp.launch() of inc_kernel over arr_cuda0 with an explicit device argument.
+    num_elems is not used: the launch dimension is the size of arr_cuda0.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    # bind the array before synchronizing, outside the timed region
+    target = arr_cuda0
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("launch", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        wp.launch(inc_kernel, dim=target.size, inputs=[target], device=device)
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return measured.elapsed
+def test_launch_stream(num_elems, device):
+    """Time one wp.launch() of inc_kernel over arr_cuda0 issued with stream=stream0.
+    No device argument is passed to the launch; device is only used for the optional
+    synchronization when DO_SYNC is set. num_elems is not used.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    # bind the array and stream before synchronizing, outside the timed region
+    target = arr_cuda0
+    launch_stream = stream0
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("launch stream", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        wp.launch(inc_kernel, dim=target.size, inputs=[target], stream=launch_stream)
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return measured.elapsed
+def test_graph(num_elems, device):
+    """Time one wp.capture_launch() of the pre-captured graph0.
+    num_elems is not used; device is only used for the optional synchronization
+    when DO_SYNC is set.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    wp.synchronize()
+    stopwatch = wp.ScopedTimer("graph", print=VERBOSE, use_nvtx=USE_NVTX)
+    with stopwatch as measured:
+        wp.capture_launch(graph0)
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return measured.elapsed
+def test_graph_stream(num_elems, device):
+    """Time one wp.capture_launch() of the pre-captured graph0 issued with stream=stream0.
+    num_elems is not used; device is only used for the optional synchronization
+    when DO_SYNC is set.
+    Returns the elapsed value reported by the wp.ScopedTimer.
+    """
+    wp.synchronize()
+    # label is "graph stream" so VERBOSE / NVTX output can be told apart from test_graph,
+    # matching the "launch stream" and "p2p stream" labels of the other stream variants
+    with wp.ScopedTimer("graph stream", print=VERBOSE, use_nvtx=USE_NVTX) as timer:
+        wp.capture_launch(graph0, stream=stream0)
+        if DO_SYNC:
+            wp.synchronize_device(device)
+    return timer.elapsed
+# One independent list of num_runs zeroed slots per benchmark. The p2p lists keep their
+# zeros when fewer than two CUDA devices exist because the p2p loop is skipped.
+(
+    alloc_times,
+    free_times,
+    zeros_times,
+    d2h_times,
+    h2d_times,
+    d2h_pinned_times,
+    h2d_pinned_times,
+    d2d_times,
+    p2p_times,
+    p2p_stream_times,
+    launch_times,
+    launch_stream_times,
+    graph_times,
+    graph_stream_times,
+) = ([0] * num_runs for _ in range(14))
+# make cuda0 the current device for the benchmark loops that follow
+wp.set_device(cuda0)
+# Each loop below runs one benchmark num_runs times and stores the per-run timings.
+# Garbage collection is switched off around every measured call and back on afterwards.
+# alloc
+# fills g_allocs so that the "free" loop has one array per slot to release
+for i in range(num_runs):
+    gc.disable()
+    alloc_times[i] = test_alloc(num_elems, cuda0, i)
+    gc.enable()
+# free
+for i in range(num_runs):
+    gc.disable()
+    free_times[i] = test_free(cuda0, i)
+    gc.enable()
+# zeros
+for i in range(num_runs):
+    gc.disable()
+    zeros_times[i] = test_zeros(num_elems, cuda0, i)
+    gc.enable()
+# free zeros
+# untimed cleanup of the arrays created by the "zeros" loop
+for i in range(num_runs):
+    g_allocs[i] = None
+# h2d, d2h pageable copy
+for i in range(num_runs):
+    gc.disable()
+    h2d_times[i] = test_h2d(num_elems, cuda0)
+    d2h_times[i] = test_d2h(num_elems, cuda0)
+    gc.enable()
+# h2d, d2h pinned copy
+for i in range(num_runs):
+    gc.disable()
+    h2d_pinned_times[i] = test_h2d_pinned(num_elems, cuda0)
+    d2h_pinned_times[i] = test_d2h_pinned(num_elems, cuda0)
+    gc.enable()
+# d2d copy
+for i in range(num_runs):
+    gc.disable()
+    d2d_times[i] = test_d2d(num_elems, cuda0)
+    gc.enable()
+# p2p copy
+# only runs with a second CUDA device; otherwise p2p_times / p2p_stream_times stay all zero
+if cuda_device_count > 1:
+    for i in range(num_runs):
+        gc.disable()
+        p2p_times[i] = test_p2p(num_elems, cuda1, cuda0)
+        p2p_stream_times[i] = test_p2p_stream(num_elems, cuda1, cuda0)
+        gc.enable()
+# launch
+for i in range(num_runs):
+    gc.disable()
+    launch_times[i] = test_launch(num_elems, cuda0)
+    launch_stream_times[i] = test_launch_stream(num_elems, cuda0)
+    gc.enable()
+# graph
+for i in range(num_runs):
+    gc.disable()
+    graph_times[i] = test_graph(num_elems, cuda0)
+    graph_stream_times[i] = test_graph_stream(num_elems, cuda0)
+    gc.enable()
+def print_stat(name, data, trim=trim_runs):
+    """Print name followed by the trimmed mean of data, scaled by 1000000.
+    With trim > 0 the samples are sorted and the trim smallest and trim largest
+    are discarded before averaging. At least one sample must remain.
+    """
+    assert len(data) - 2 * trim > 0
+    samples = sorted(data)[trim:-trim] if trim > 0 else data
+    scaled_mean = 1000000 * stats.mean(samples)
+    print(f"{name:15s} {scaled_mean:.0f}")
+# Report the trimmed mean of every benchmark that actually ran.
+print("=========================")
+print_stat("Alloc", alloc_times)
+print_stat("Free", free_times)
+print_stat("Zeros", zeros_times)
+print_stat("H2D", h2d_times)
+print_stat("D2H", d2h_times)
+print_stat("H2D pinned", h2d_pinned_times)
+print_stat("D2H pinned", d2h_pinned_times)
+print_stat("D2D", d2d_times)
+# The p2p benchmark is skipped with fewer than two CUDA devices, which leaves its timing
+# lists all zero; printing them would report a measurement that was never taken.
+if cuda_device_count > 1:
+    print_stat("P2P", p2p_times)
+    print_stat("P2P stream", p2p_stream_times)
+print_stat("Launch", launch_times)
+print_stat("Launch stream", launch_stream_times)
+print_stat("Graph", graph_times)
+print_stat("Graph stream", graph_stream_times)
+# ========= profiling ==========
+# from pyinstrument import Profiler
+# profiler = Profiler()
+# profiler.start()
+# for i in range(10):
+#     # test_alloc(num_elems, cuda0)
+#     # test_h2d(num_elems, cuda0)
+#     test_p2p(num_elems, cuda0, cuda1)
+# profiler.stop()
+# print(profiler.output_text(show_all=True))

warp/examples/benchmarks/benchmark_cloth.py ADDED Viewed

@@ -0,0 +1,279 @@
+# Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+# include parent path
+import os
+import sys, getopt
+import numpy as np
+import math
+import ctypes
+sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))
+from pxr import Usd, UsdGeom, Gf, Sdf
+import warp as wp
+class Cloth:
+    """Rectangular mass-spring cloth patch on a dx-by-dy grid of particles.
+    After construction the particle and spring data are NumPy arrays
+    (float32, spring_indices int32); triangles stays a flat Python list of
+    vertex indices, three per triangle.
+    """
+    def __init__(
+        self, lower, dx, dy, radius, stretch_stiffness, bend_stiffness, shear_stiffness, mass, fix_corners=True
+    ):
+        """Build particles, triangles and springs.
+        lower is the position of grid point (0, 0); neighbouring particles are radius
+        apart along the first and third coordinate axes. Bend and shear springs are only
+        created for a stiffness greater than zero. With fix_corners the two end particles
+        of the first row get an inverse mass of 0.0; all others get 1.0 / mass.
+        """
+        points = []
+        velocities = []
+        inv_masses = []
+        triangles = []
+        spring_indices = []
+        spring_lengths = []
+        spring_stiffness = []
+        spring_damping = []
+        origin = np.array(lower)
+        def node(col, row):
+            # flattened row-major index of grid point (col, row)
+            return row * dx + col
+        def add_spring(a, b, stiffness, damp=10.0):
+            # the rest length is the current distance between the two particles
+            rest = np.linalg.norm(np.array(points[a]) - np.array(points[b]))
+            spring_indices.extend((a, b))
+            spring_lengths.append(rest)
+            spring_stiffness.append(stiffness)
+            spring_damping.append(damp)
+        # particles, plus two triangles for every grid cell closed by the current point
+        for row in range(dy):
+            for col in range(dx):
+                points.append(origin + radius * np.array((float(col), float(0.0), float(row))))
+                velocities.append(np.zeros(3))
+                if col > 0 and row > 0:
+                    far = node(col - 1, row - 1)
+                    here = node(col, row)
+                    triangles.extend((far, node(col, row - 1), here, far, here, node(col - 1, row)))
+                pinned = fix_corners and row == 0 and col in (0, dx - 1)
+                inv_masses.append(0.0 if pinned else 1.0 / mass)
+        # springs along each row, plus both diagonals towards the previous row
+        for row in range(dy):
+            for col in range(dx):
+                here = node(col, row)
+                if col > 0:
+                    add_spring(here, node(col - 1, row), stretch_stiffness)
+                if col > 1 and bend_stiffness > 0.0:
+                    add_spring(here, node(col - 2, row), bend_stiffness)
+                if row > 0 and shear_stiffness > 0.0:
+                    if col < dx - 1:
+                        add_spring(here, node(col + 1, row - 1), shear_stiffness)
+                    if col > 0:
+                        add_spring(here, node(col - 1, row - 1), shear_stiffness)
+        # springs along each column
+        for col in range(dx):
+            for row in range(dy):
+                here = node(col, row)
+                if row > 0:
+                    add_spring(here, node(col, row - 1), stretch_stiffness)
+                if row > 1 and bend_stiffness > 0.0:
+                    add_spring(here, node(col, row - 2), bend_stiffness)
+        # freeze the accumulated lists into typed arrays
+        self.triangles = triangles
+        self.positions = np.array(points, dtype=np.float32)
+        self.velocities = np.array(velocities, dtype=np.float32)
+        self.inv_masses = np.array(inv_masses, dtype=np.float32)
+        self.spring_lengths = np.array(spring_lengths, dtype=np.float32)
+        self.spring_indices = np.array(spring_indices, dtype=np.int32)
+        self.spring_stiffness = np.array(spring_stiffness, dtype=np.float32)
+        self.spring_damping = np.array(spring_damping, dtype=np.float32)
+        self.num_particles = len(self.positions)
+        self.num_springs = len(self.spring_lengths)
+        self.num_tris = len(self.triangles) // 3
+def run_benchmark(mode, dim, timers, render=False):
+    """Simulate a dim x dim cloth with the backend named by mode and record timings.
+    mode selects the integrator ("warp_cpu", "warp_gpu", "taichi_cpu", "taichi_gpu",
+    "numpy", "cupy", "numba", "torch_cpu", "torch_gpu", "jax_cpu" or "jax_gpu").
+    timers is the dict handed to wp.ScopedTimer(dict=...): backend construction plus one
+    warm-up step are recorded under "Initialization", every simulated frame under
+    "Dim (<dim>^2)". With render=True the particle positions of each frame are written
+    to outputs/benchmark.usd next to this file.
+    Raises RuntimeError for an unknown mode.
+    """
+    # params
+    sim_width = dim
+    sim_height = dim
+    sim_fps = 60.0
+    sim_substeps = 16
+    sim_duration = 1.0
+    sim_frames = int(sim_duration * sim_fps)
+    sim_dt = 1.0 / sim_fps
+    sim_time = 0.0
+    # spring stiffness constants
+    k_stretch = 1000.0
+    k_shear = 1000.0
+    k_bend = 1000.0
+    cloth = Cloth(
+        lower=(0.0, 0.0, 0.0),
+        dx=sim_width,
+        dy=sim_height,
+        radius=0.1,
+        stretch_stiffness=k_stretch,
+        bend_stiffness=k_bend,
+        shear_stiffness=k_shear,
+        mass=0.1,
+        fix_corners=True,
+    )
+    if render:
+        # set up grid for visualization
+        stage = Usd.Stage.CreateNew(os.path.join(os.path.dirname(__file__), "outputs/benchmark.usd"))
+        stage.SetStartTimeCode(0.0)
+        stage.SetEndTimeCode(sim_duration * sim_fps)
+        stage.SetTimeCodesPerSecond(sim_fps)
+        grid = UsdGeom.Mesh.Define(stage, "/root")
+        grid.GetPointsAttr().Set(cloth.positions, 0.0)
+        grid.GetFaceVertexIndicesAttr().Set(cloth.triangles, 0.0)
+        grid.GetFaceVertexCountsAttr().Set([3] * cloth.num_tris, 0.0)
+    with wp.ScopedTimer("Initialization", dict=timers):
+        # Backend modules are imported lazily so only the selected framework has to be installed.
+        # NOTE(review): these imports need a package named `examples` that contains the
+        # benchmark_cloth_* modules to be importable - confirm the sys.path setup at the top
+        # of this file still matches where those modules live.
+        if mode == "warp_cpu":
+            import examples.benchmark_cloth_warp
+            integrator = examples.benchmark_cloth_warp.WpIntegrator(cloth, "cpu")
+        elif mode == "warp_gpu":
+            import examples.benchmark_cloth_warp
+            integrator = examples.benchmark_cloth_warp.WpIntegrator(cloth, "cuda")
+        elif mode == "taichi_cpu":
+            import examples.benchmark_cloth_taichi
+            integrator = examples.benchmark_cloth_taichi.TiIntegrator(cloth, "cpu")
+        elif mode == "taichi_gpu":
+            import examples.benchmark_cloth_taichi
+            integrator = examples.benchmark_cloth_taichi.TiIntegrator(cloth, "cuda")
+        elif mode == "numpy":
+            import examples.benchmark_cloth_numpy
+            integrator = examples.benchmark_cloth_numpy.NpIntegrator(cloth)
+        elif mode == "cupy":
+            import examples.benchmark_cloth_cupy
+            integrator = examples.benchmark_cloth_cupy.CpIntegrator(cloth)
+        elif mode == "numba":
+            import examples.benchmark_cloth_numba
+            integrator = examples.benchmark_cloth_numba.NbIntegrator(cloth)
+        elif mode == "torch_cpu":
+            import examples.benchmark_cloth_pytorch
+            integrator = examples.benchmark_cloth_pytorch.TrIntegrator(cloth, "cpu")
+        elif mode == "torch_gpu":
+            import examples.benchmark_cloth_pytorch
+            integrator = examples.benchmark_cloth_pytorch.TrIntegrator(cloth, "cuda")
+        elif mode == "jax_cpu":
+            # the platform is selected through the environment before the JAX backend is imported
+            os.environ["JAX_PLATFORM_NAME"] = "cpu"
+            import examples.benchmark_cloth_jax
+            integrator = examples.benchmark_cloth_jax.JxIntegrator(cloth)
+        elif mode == "jax_gpu":
+            os.environ["JAX_PLATFORM_NAME"] = "gpu"
+            import examples.benchmark_cloth_jax
+            integrator = examples.benchmark_cloth_jax.JxIntegrator(cloth)
+        else:
+            raise RuntimeError("Unknown simulation backend")
+        # run one warm-up iteration to accurately measure initialization time (some engines do lazy init)
+        # This must sit at the level of the with-block: placed after the raise it was unreachable,
+        # so lazy backend initialization leaked into the first timed frame instead.
+        positions = integrator.simulate(sim_dt, sim_substeps)
+    label = "Dim ({}^2)".format(dim)
+    # run simulation
+    for i in range(sim_frames):
+        # simulate
+        with wp.ScopedTimer(label, dict=timers):
+            positions = integrator.simulate(sim_dt, sim_substeps)
+        if render:
+            grid.GetPointsAttr().Set(positions, sim_time * sim_fps)
+        sim_time += sim_dt
+    if render:
+        stage.Save()
+# record profiling information
+timers = {}
+# the backend can be given as the first command-line argument; without one, "warp_gpu" is used
+mode = sys.argv[1] if len(sys.argv) > 1 else "warp_gpu"
+# the three grid sizes match the columns written to the CSV report
+for grid_dim in (32, 64, 128):
+    run_benchmark(mode, grid_dim, timers, render=False)
+# write results
+import csv
+for k, v in timers.items():
+    print("{:16} min: {:8.2f} max: {:8.2f} avg: {:8.2f}".format(k, np.min(v), np.max(v), np.mean(v)))
+# One row per invocation is appended to outputs/benchmark.csv next to this script.
+report_dir = os.path.join(os.path.dirname(__file__), "outputs")
+# Create the directory up front so the results are not lost after the benchmark has already run.
+os.makedirs(report_dir, exist_ok=True)
+# newline="" is what the csv module asks for; the with-block closes the file even when
+# one of the timer keys below is missing.
+with open(os.path.join(report_dir, "benchmark.csv"), "a", newline="") as report:
+    writer = csv.writer(report, delimiter=",")
+    # position 0 right after opening in append mode means the file is empty: write the header first
+    if report.tell() == 0:
+        writer.writerow(["Name", "Init", "Dim (32^2)", "Dim (64^2)", "Dim (128^2)"])
+    writer.writerow(
+        [
+            mode,
+            np.max(timers["Initialization"]),
+            np.mean(timers["Dim (32^2)"]),
+            np.mean(timers["Dim (64^2)"]),
+            np.mean(timers["Dim (128^2)"]),
+        ]
+    )