PyPI - warp-lang - Versions diffs - 1.4.2__py3-none-macosx_10_13_universal2.whl → 1.5.1__py3-none-macosx_10_13_universal2.whl - Mend

warp-lang 1.4.2__py3-none-macosx_10_13_universal2.whl → 1.5.1__py3-none-macosx_10_13_universal2.whl

This diff compares the contents of two publicly available versions of a package, as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.

Potentially problematic release.

This version of warp-lang might be problematic. Click here for more details.

Files changed (165) hide show

warp/__init__.py +4 -0
warp/autograd.py +43 -8
warp/bin/libwarp.dylib +0 -0
warp/build.py +21 -2
warp/build_dll.py +23 -6
warp/builtins.py +1819 -7
warp/codegen.py +197 -61
warp/config.py +2 -2
warp/context.py +379 -107
warp/examples/assets/pixel.jpg +0 -0
warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
warp/examples/benchmarks/benchmark_gemm.py +121 -0
warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
warp/examples/benchmarks/benchmark_tile.py +179 -0
warp/examples/fem/example_adaptive_grid.py +37 -10
warp/examples/fem/example_apic_fluid.py +3 -2
warp/examples/fem/example_convection_diffusion_dg.py +4 -5
warp/examples/fem/example_deformed_geometry.py +1 -1
warp/examples/fem/example_diffusion_3d.py +47 -4
warp/examples/fem/example_distortion_energy.py +220 -0
warp/examples/fem/example_magnetostatics.py +127 -85
warp/examples/fem/example_nonconforming_contact.py +5 -5
warp/examples/fem/example_stokes.py +3 -1
warp/examples/fem/example_streamlines.py +12 -19
warp/examples/fem/utils.py +38 -15
warp/examples/sim/example_cloth.py +4 -25
warp/examples/sim/example_quadruped.py +2 -1
warp/examples/tile/example_tile_convolution.py +58 -0
warp/examples/tile/example_tile_fft.py +47 -0
warp/examples/tile/example_tile_filtering.py +105 -0
warp/examples/tile/example_tile_matmul.py +79 -0
warp/examples/tile/example_tile_mlp.py +375 -0
warp/fem/__init__.py +8 -0
warp/fem/cache.py +16 -12
warp/fem/dirichlet.py +1 -1
warp/fem/domain.py +44 -1
warp/fem/field/__init__.py +1 -2
warp/fem/field/field.py +31 -19
warp/fem/field/nodal_field.py +101 -49
warp/fem/field/virtual.py +794 -0
warp/fem/geometry/__init__.py +2 -2
warp/fem/geometry/deformed_geometry.py +3 -105
warp/fem/geometry/element.py +13 -0
warp/fem/geometry/geometry.py +165 -7
warp/fem/geometry/grid_2d.py +3 -6
warp/fem/geometry/grid_3d.py +31 -28
warp/fem/geometry/hexmesh.py +3 -46
warp/fem/geometry/nanogrid.py +3 -2
warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
warp/fem/geometry/tetmesh.py +2 -43
warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
warp/fem/integrate.py +683 -261
warp/fem/linalg.py +404 -0
warp/fem/operator.py +101 -18
warp/fem/polynomial.py +5 -5
warp/fem/quadrature/quadrature.py +45 -21
warp/fem/space/__init__.py +45 -11
warp/fem/space/basis_function_space.py +451 -0
warp/fem/space/basis_space.py +58 -11
warp/fem/space/function_space.py +146 -5
warp/fem/space/grid_2d_function_space.py +80 -66
warp/fem/space/grid_3d_function_space.py +113 -68
warp/fem/space/hexmesh_function_space.py +96 -108
warp/fem/space/nanogrid_function_space.py +62 -110
warp/fem/space/quadmesh_function_space.py +208 -0
warp/fem/space/shape/__init__.py +45 -7
warp/fem/space/shape/cube_shape_function.py +328 -54
warp/fem/space/shape/shape_function.py +10 -1
warp/fem/space/shape/square_shape_function.py +328 -60
warp/fem/space/shape/tet_shape_function.py +269 -19
warp/fem/space/shape/triangle_shape_function.py +238 -19
warp/fem/space/tetmesh_function_space.py +69 -37
warp/fem/space/topology.py +38 -0
warp/fem/space/trimesh_function_space.py +179 -0
warp/fem/utils.py +6 -331
warp/jax_experimental.py +3 -1
warp/native/array.h +15 -0
warp/native/builtin.h +66 -26
warp/native/bvh.h +4 -0
warp/native/coloring.cpp +604 -0
warp/native/cuda_util.cpp +68 -51
warp/native/cuda_util.h +2 -1
warp/native/fabric.h +8 -0
warp/native/hashgrid.h +4 -0
warp/native/marching.cu +8 -0
warp/native/mat.h +14 -3
warp/native/mathdx.cpp +59 -0
warp/native/mesh.h +4 -0
warp/native/range.h +13 -1
warp/native/reduce.cpp +9 -1
warp/native/reduce.cu +7 -0
warp/native/runlength_encode.cpp +9 -1
warp/native/runlength_encode.cu +7 -1
warp/native/scan.cpp +8 -0
warp/native/scan.cu +8 -0
warp/native/scan.h +8 -1
warp/native/sparse.cpp +8 -0
warp/native/sparse.cu +8 -0
warp/native/temp_buffer.h +7 -0
warp/native/tile.h +1854 -0
warp/native/tile_gemm.h +341 -0
warp/native/tile_reduce.h +210 -0
warp/native/volume_builder.cu +8 -0
warp/native/volume_builder.h +8 -0
warp/native/warp.cpp +10 -2
warp/native/warp.cu +369 -15
warp/native/warp.h +12 -2
warp/optim/adam.py +39 -4
warp/paddle.py +29 -12
warp/render/render_opengl.py +140 -67
warp/sim/graph_coloring.py +292 -0
warp/sim/import_urdf.py +8 -8
warp/sim/integrator_euler.py +4 -2
warp/sim/integrator_featherstone.py +115 -44
warp/sim/integrator_vbd.py +6 -0
warp/sim/model.py +109 -32
warp/sparse.py +1 -1
warp/stubs.py +569 -4
warp/tape.py +12 -7
warp/tests/assets/pixel.npy +0 -0
warp/tests/aux_test_instancing_gc.py +18 -0
warp/tests/test_array.py +39 -0
warp/tests/test_codegen.py +81 -1
warp/tests/test_codegen_instancing.py +30 -0
warp/tests/test_collision.py +110 -0
warp/tests/test_coloring.py +251 -0
warp/tests/test_context.py +34 -0
warp/tests/test_examples.py +21 -5
warp/tests/test_fem.py +453 -113
warp/tests/test_func.py +34 -4
warp/tests/test_generics.py +52 -0
warp/tests/test_iter.py +68 -0
warp/tests/test_lerp.py +13 -87
warp/tests/test_mat_scalar_ops.py +1 -1
warp/tests/test_matmul.py +6 -9
warp/tests/test_matmul_lite.py +6 -11
warp/tests/test_mesh_query_point.py +1 -1
warp/tests/test_module_hashing.py +23 -0
warp/tests/test_overwrite.py +45 -0
warp/tests/test_paddle.py +27 -87
warp/tests/test_print.py +56 -1
warp/tests/test_smoothstep.py +17 -83
warp/tests/test_spatial.py +1 -1
warp/tests/test_static.py +3 -3
warp/tests/test_tile.py +744 -0
warp/tests/test_tile_mathdx.py +144 -0
warp/tests/test_tile_mlp.py +383 -0
warp/tests/test_tile_reduce.py +374 -0
warp/tests/test_tile_shared_memory.py +190 -0
warp/tests/test_vbd.py +12 -20
warp/tests/test_volume.py +43 -0
warp/tests/unittest_suites.py +19 -2
warp/tests/unittest_utils.py +4 -2
warp/types.py +340 -74
warp/utils.py +23 -3
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/METADATA +32 -7
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/RECORD +160 -133
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/WHEEL +1 -1
warp/fem/field/test.py +0 -180
warp/fem/field/trial.py +0 -183
warp/fem/space/collocated_function_space.py +0 -102
warp/fem/space/quadmesh_2d_function_space.py +0 -261
warp/fem/space/trimesh_2d_function_space.py +0 -153
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/LICENSE.md +0 -0
{warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/top_level.txt +0 -0

warp/examples/assets/pixel.jpg ADDED Viewed

Binary file

warp/examples/benchmarks/benchmark_cloth_paddle.py ADDED Viewed

@@ -0,0 +1,86 @@
+# Copyright (c) 2022 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+import paddle
+def eval_springs(x, v, indices, rest, ke, kd, f):
+    i = indices[:, 0]
+    j = indices[:, 1]
+    xi = x[i]
+    xj = x[j]
+    vi = v[i]
+    vj = v[j]
+    xij = xi - xj
+    vij = vi - vj
+    l = paddle.linalg.norm(xij, axis=1)
+    l_inv = 1.0 / l
+    # normalized spring direction
+    dir = (xij.T * l_inv).T
+    c = l - rest
+    dcdt = paddle.sum(dir * vij, axis=1)
+    # damping based on relative velocity.
+    fs = dir.T * (ke * c + kd * dcdt)
+    f.index_add_(axis=0, index=i, value=-fs.T)
+    f.index_add_(axis=0, index=j, value=fs.T)
+def integrate_particles(x, v, f, g, w, dt):
+    s = w > 0.0
+    a_ext = g * s[:, None].astype(g.dtype)
+    # simple semi-implicit Euler. v1 = v0 + a dt, x1 = x0 + v1 dt
+    v += ((f.T * w).T + a_ext) * dt
+    x += v * dt
+    # clear forces
+    f *= 0.0
+class TrIntegrator:
+    def __init__(self, cloth, device):
+        self.cloth = cloth
+        self.positions = paddle.to_tensor(self.cloth.positions, place=device)
+        self.velocities = paddle.to_tensor(self.cloth.velocities, place=device)
+        self.inv_mass = paddle.to_tensor(self.cloth.inv_masses, place=device)
+        self.spring_indices = paddle.to_tensor(self.cloth.spring_indices, dtype=paddle.int64, place=device)
+        self.spring_lengths = paddle.to_tensor(self.cloth.spring_lengths, place=device)
+        self.spring_stiffness = paddle.to_tensor(self.cloth.spring_stiffness, place=device)
+        self.spring_damping = paddle.to_tensor(self.cloth.spring_damping, place=device)
+        self.forces = paddle.zeros((self.cloth.num_particles, 3), dtype=paddle.float32).to(device=device)
+        self.gravity = paddle.to_tensor((0.0, 0.0 - 9.8, 0.0), dtype=paddle.float32, place=device)
+    def simulate(self, dt, substeps):
+        sim_dt = dt / substeps
+        for _s in range(substeps):
+            eval_springs(
+                self.positions,
+                self.velocities,
+                self.spring_indices.reshape((self.cloth.num_springs, 2)),
+                self.spring_lengths,
+                self.spring_stiffness,
+                self.spring_damping,
+                self.forces,
+            )
+            # integrate
+            integrate_particles(self.positions, self.velocities, self.forces, self.gravity, self.inv_mass, sim_dt)
+        return self.positions.cpu().numpy()

warp/examples/benchmarks/benchmark_gemm.py ADDED Viewed

@@ -0,0 +1,121 @@
+from itertools import product
+import numpy as np
+import torch as tc
+import warp as wp
+tc.backends.cuda.matmul.allow_tf32 = False  # Disable TF32 for matrix multiplications
+tc.backends.cudnn.allow_tf32 = False  # Disable TF32 for cuDNN operations
+wp.init()
+wp.clear_kernel_cache()
+wp.set_module_options({"fast_math": True, "enable_backward": False})
+def create_mlp_kernel(m, n, k):
+    TILE_M = m
+    TILE_N = n
+    TILE_K = k
+    @wp.kernel
+    def mlp(x: wp.array2d(dtype=float), weights_wp: wp.array2d(dtype=float), n_k: int, output: wp.array2d(dtype=float)):
+        i_m, i_n = wp.tid()
+        sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
+        for count in range(n_k):
+            feat = wp.tile_load(x, i_m, count, TILE_M, TILE_K)
+            weight = wp.tile_load(weights_wp, count, i_n, TILE_K, TILE_N)
+            wp.tile_matmul(feat, weight, sum)
+        wp.tile_store(output, i_m, i_n, sum)
+    return mlp
+def benchmark_torch(A, B, warm_up, iterations):
+    # warm-up
+    for _ in range(warm_up):
+        tc.matmul(A, B)
+    timers = {}
+    tc.cuda.synchronize()
+    with wp.ScopedTimer("torch", print=False, dict=timers, synchronize=True):
+        for _ in range(iterations):
+            tc.matmul(A, B)
+        tc.cuda.synchronize()
+    return timers["torch"][0]
+def benchmark_warp(A, B, config, warm_up, iterations):
+    TILE_M = config[0]
+    TILE_N = config[1]
+    TILE_K = config[2]
+    BLOCK_DIM = config[3]
+    mlp = create_mlp_kernel(TILE_M, TILE_N, TILE_K)
+    M = A.shape[0]
+    N = B.shape[1]
+    K = A.shape[1]
+    output = wp.zeros((M, N), dtype=float)
+    # warm-up
+    for _ in range(warm_up):
+        wp.launch_tiled(
+            kernel=mlp, dim=[M // TILE_M, N // TILE_N], inputs=[A, B, K // TILE_K, output], block_dim=BLOCK_DIM
+        )
+    # check output
+    if warm_up > 0:
+        assert np.allclose(output.numpy(), A.numpy() @ B.numpy(), atol=1e-3, rtol=1e-3)
+    # benchmark
+    timers = {}
+    with wp.ScopedTimer("warp", print=False, dict=timers, synchronize=True):
+        for _ in range(iterations):
+            wp.launch_tiled(
+                kernel=mlp, dim=[M // TILE_M, N // TILE_N], inputs=[A, B, K // TILE_K, output], block_dim=BLOCK_DIM
+            )
+    return timers["warp"][0]
+tile_m = [8, 16, 32, 64]
+tile_n = [8, 16, 32, 64]
+tile_k = [8, 16, 64]
+block = [32, 64, 128]
+M = 1024
+N = 1024
+K = 1024
+A = tc.randn(M, K).cuda()
+B = tc.randn(K, N).cuda()
+iterations = 1000
+warm_up = 10
+time_torch = benchmark_torch(A, B, warm_up, iterations)
+print(f"Torch: {time_torch}")
+configs = list(product(tile_m, tile_n, tile_k, block))
+wp.config.quiet = True
+# header
+print(
+    "{:<{}} {:<{}} {:<{}} {:<{}} {:<{}} {:<{}}".format(
+        "TILE_M", 12, "TILE_N", 12, "TILE_K", 12, "BLOCK", 12, "Time", 12, "Relative", 12
+    )
+)
+for c in configs:
+    time_warp = benchmark_warp(wp.from_torch(A), wp.from_torch(B), c, warm_up, iterations)
+    print(
+        "{:<{}} {:<{}} {:<{}} {:<{}} {:<{}} {:<{}}".format(
+            c[0], 12, c[1], 12, c[2], 12, c[3], 12, time_warp, 12, time_warp / time_torch, 12
+        )
+    )

warp/examples/benchmarks/benchmark_interop_paddle.py ADDED Viewed

@@ -0,0 +1,158 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+import time
+import paddle
+import warp as wp
+def create_simple_kernel(dtype):
+    def simple_kernel(
+        a: wp.array(dtype=dtype),
+        b: wp.array(dtype=dtype),
+        c: wp.array(dtype=dtype),
+        d: wp.array(dtype=dtype),
+        e: wp.array(dtype=dtype),
+    ):
+        pass
+    return wp.Kernel(simple_kernel)
+def test_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
+    warp_device = wp.get_device(device)
+    paddle_device = wp.device_to_paddle(warp_device)
+    if hasattr(warp_dtype, "_shape_"):
+        paddle_shape = (array_size, *warp_dtype._shape_)
+        paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
+    else:
+        paddle_shape = (array_size,)
+        paddle_dtype = paddle.float32 if warp_dtype is None else wp.dtype_to_paddle(warp_dtype)
+    _a = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _b = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _c = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _d = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _e = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    wp.synchronize()
+    # profiler = Profiler(interval=0.000001)
+    # profiler.start()
+    t1 = time.time_ns()
+    for _ in range(num_iters):
+        a = wp.from_paddle(_a, dtype=warp_dtype)
+        b = wp.from_paddle(_b, dtype=warp_dtype)
+        c = wp.from_paddle(_c, dtype=warp_dtype)
+        d = wp.from_paddle(_d, dtype=warp_dtype)
+        e = wp.from_paddle(_e, dtype=warp_dtype)
+        wp.launch(kernel, dim=array_size, inputs=[a, b, c, d, e])
+    t2 = time.time_ns()
+    print(f"{(t2 - t1) / 1_000_000 :8.0f} ms  from_paddle(...)")
+    # profiler.stop()
+    # profiler.print()
+def test_array_ctype_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
+    warp_device = wp.get_device(device)
+    paddle_device = wp.device_to_paddle(warp_device)
+    if hasattr(warp_dtype, "_shape_"):
+        paddle_shape = (array_size, *warp_dtype._shape_)
+        paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
+    else:
+        paddle_shape = (array_size,)
+        paddle_dtype = paddle.float32 if warp_dtype is None else wp.dtype_to_paddle(warp_dtype)
+    _a = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _b = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _c = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _d = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _e = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    wp.synchronize()
+    # profiler = Profiler(interval=0.000001)
+    # profiler.start()
+    t1 = time.time_ns()
+    for _ in range(num_iters):
+        a = wp.from_paddle(_a, dtype=warp_dtype, return_ctype=True)
+        b = wp.from_paddle(_b, dtype=warp_dtype, return_ctype=True)
+        c = wp.from_paddle(_c, dtype=warp_dtype, return_ctype=True)
+        d = wp.from_paddle(_d, dtype=warp_dtype, return_ctype=True)
+        e = wp.from_paddle(_e, dtype=warp_dtype, return_ctype=True)
+        wp.launch(kernel, dim=array_size, inputs=[a, b, c, d, e])
+    t2 = time.time_ns()
+    print(f"{(t2 - t1) / 1_000_000 :8.0f} ms  from_paddle(..., return_ctype=True)")
+    # profiler.stop()
+    # profiler.print()
+def test_direct_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
+    warp_device = wp.get_device(device)
+    paddle_device = wp.device_to_paddle(warp_device)
+    if hasattr(warp_dtype, "_shape_"):
+        paddle_shape = (array_size, *warp_dtype._shape_)
+        paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
+    else:
+        paddle_shape = (array_size,)
+        paddle_dtype = paddle.float32 if warp_dtype is None else wp.dtype_to_paddle(warp_dtype)
+    _a = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _b = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _c = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _d = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _e = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    wp.synchronize()
+    # profiler = Profiler(interval=0.000001)
+    # profiler.start()
+    t1 = time.time_ns()
+    for _ in range(num_iters):
+        wp.launch(kernel, dim=array_size, inputs=[_a, _b, _c, _d, _e])
+    t2 = time.time_ns()
+    print(f"{(t2 - t1) / 1_000_000 :8.0f} ms  direct from paddle")
+    # profiler.stop()
+    # profiler.print()
+wp.init()
+params = [
+    # (warp_dtype arg, kernel)
+    (None, create_simple_kernel(wp.float32)),
+    (wp.float32, create_simple_kernel(wp.float32)),
+    (wp.vec3f, create_simple_kernel(wp.vec3f)),
+    (wp.mat22f, create_simple_kernel(wp.mat22f)),
+]
+wp.load_module()
+num_iters = 100000
+for warp_dtype, kernel in params:
+    print(f"\ndtype={wp.context.type_str(warp_dtype)}")
+    test_from_paddle(kernel, num_iters, 10, "cuda:0", warp_dtype=warp_dtype)
+    test_array_ctype_from_paddle(kernel, num_iters, 10, "cuda:0", warp_dtype=warp_dtype)
+    test_direct_from_paddle(kernel, num_iters, 10, "cuda:0", warp_dtype=warp_dtype)

warp/examples/benchmarks/benchmark_tile.py ADDED Viewed

@@ -0,0 +1,179 @@
+# Copyright (c) 2024 NVIDIA CORPORATION.  All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto.  Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+import numpy as np
+import torch
+import warp as wp
+wp.init()
+wp.set_module_options({"enable_backward": False, "fast_math": True})
+wp.set_device("cuda:0")
+wp.build.clear_kernel_cache()
+@wp.kernel
+def gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
+    # output index
+    i, j = wp.tid()
+    sum = float(0.0)
+    for k in range(0, A.shape[1]):
+        sum += A[i, k] * B[k, j]
+    C[i, j] = sum
+TILE_M = wp.constant(64)
+TILE_N = wp.constant(64)
+TILE_K = wp.constant(8)
+@wp.kernel
+def gemm_tiled(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
+    # output tile index
+    i, j = wp.tid()
+    sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
+    _M = A.shape[0]
+    _N = B.shape[1]
+    K = A.shape[1]
+    count = int(K / 8)  # TODO: code-gen bug if you use a constant before passing it to a kwd arg (in this case TILE_K)
+    for k in range(count):
+        a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
+        b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
+        # sum += a*b
+        wp.tile_matmul(a, b, sum)
+    wp.tile_store(C, i, j, sum)
+def benchmark_numpy(A, B, C):
+    timers = {}
+    iters = 10
+    # warm up
+    for _i in range(10):
+        _C = A @ B
+    with wp.ScopedTimer("NumPy", dict=timers):
+        for _i in range(iters):
+            _C = A @ B
+    return min(timers["NumPy"])
+def benchmark_warp_simt(A, B, C):
+    timers = {}
+    iters = 10
+    A_wp = wp.array(A)
+    B_wp = wp.array(B)
+    C_wp = wp.array(C)
+    # warm up
+    for _i in range(10):
+        wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp])
+    with wp.ScopedTimer("Warp (SIMT)", dict=timers, print=False, synchronize=True):
+        for _i in range(iters):
+            wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp])
+    return min(timers["Warp (SIMT)"])
+def benchmark_warp_tiled(A, B, C):
+    timers = {}
+    iters = 10
+    # must match with the tile_matmul() partition size
+    SUB_TILE_M = 4
+    SUB_TILE_N = 4
+    num_threads = int(TILE_M / SUB_TILE_M) * int(TILE_N / SUB_TILE_N)
+    A_wp = wp.array(A)
+    B_wp = wp.array(B)
+    C_wp = wp.array(C)
+    # warm up
+    wp.capture_begin()
+    for _i in range(iters):
+        wp.launch(gemm_tiled, dim=(int(M / TILE_M), int(N / TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads)
+    graph = wp.capture_end()
+    with wp.ScopedTimer("Warp (Tiled)", dict=timers, print=False, synchronize=True):
+        # for i in range(iters):
+        #    wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads)
+        wp.capture_launch(graph)
+    return min(timers["Warp (Tiled)"])
+def benchmark_torch(A, B, C):
+    A_tc = torch.from_numpy(A).to("cuda:0")
+    B_tc = torch.from_numpy(B).to("cuda:0")
+    C_tc = torch.from_numpy(C).to("cuda:0")
+    # warm-up
+    for _i in range(10):
+        torch.matmul(A_tc, B_tc, out=C_tc)
+    timers = {}
+    iters = 10
+    torch.cuda.synchronize()
+    with wp.ScopedTimer("Torch", dict=timers, print=False):
+        for _i in range(iters):
+            torch.matmul(A_tc, B_tc)  # , out=C_tc)
+        torch.cuda.synchronize()
+    return min(timers["Torch"])
+results_torch = []
+results_warp_simt = []
+results_warp_tiled = []
+print("{:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s}".format("M", "N", "K", "Torch", "Warp (SIMT)", "Warp (Tiled)"))
+print("--------------------------------------------------------")
+for i in range(2, 33):
+    # for i in range(8,9):
+    M = i * 128
+    N = M
+    K = N
+    # M = TILE_M*21
+    # K = TILE_K*7
+    # N = TILE_M*12
+    rng = np.random.default_rng(42)
+    A = rng.random((M, K), dtype=np.float32)
+    B = rng.random((K, N), dtype=np.float32)
+    C = np.zeros((M, N), dtype=np.float32)
+    results_torch.append(benchmark_torch(A, B, C))
+    results_warp_simt.append(0.0)  # benchmark_warp_simt(A, B, C))
+    results_warp_tiled.append(benchmark_warp_tiled(A, B, C))
+    print(
+        "{:>8d} {:>8d} {:>8d} {:>8f} {:>8f} {:>8f}".format(
+            M, N, K, results_torch[-1], results_warp_simt[-1], results_warp_tiled[-1]
+        )
+    )

warp/examples/fem/example_adaptive_grid.py CHANGED Viewed

@@ -56,7 +56,7 @@ def mass_form(
     u: fem.Field,
     v: fem.Field,
 ):
-    return u(s) * v(s)
+    return fem.linalg.generalized_inner(u(s), v(s))
 @fem.integrand
@@ -86,9 +86,12 @@ def pressure_anomaly_field(s: fem.Sample, domain: fem.Domain, pressure: fem.Fiel
 class Example:
-    def __init__(self, quiet=False, degree=2, base_resolution=8, level_count=4, headless: bool = False):
+    def __init__(
+        self, quiet=False, degree=2, div_conforming=False, base_resolution=8, level_count=4, headless: bool = False
+    ):
         self._quiet = quiet
         self._degree = degree
+        self._div_conforming = div_conforming
         # Start from a coarse, dense grid
         res = wp.vec3i(2 * base_resolution, base_resolution // 2, base_resolution)
@@ -110,9 +113,13 @@ class Example:
             sim_vol, level_count, refinement_field=refinement, grading="face"
         )
-        # Function spaces for velocity, scalars and pressure (Pk / Pk / Pk-1)
-        self._u_basis = fem.make_polynomial_basis_space(geo=self._geo, degree=self._degree)
-        u_space = fem.make_collocated_function_space(self._u_basis, dtype=wp.vec3)
+        # Function spaces for velocity, pressure (RTk / Pk-1 or Pk / Pk-1)
+        u_space = fem.make_polynomial_space(
+            geo=self._geo,
+            element_basis=fem.ElementBasis.RAVIART_THOMAS if div_conforming else None,
+            degree=self._degree,
+            dtype=wp.vec3,
+        )
         p_space = fem.make_polynomial_space(geo=self._geo, degree=self._degree - 1, dtype=float)
         self.pressure_field = p_space.make_field()
@@ -137,7 +144,17 @@ class Example:
     def render(self):
         # self.renderer.add_field("solution", self.pressure_field)
         self.plot.add_field("pressure_anomaly", self.pressure_anomaly_field)
-        self.plot.add_field("velocity", self.velocity_field)
+        if self._div_conforming:
+            # If using H(div)-conforming elements, interpolate to continuous space
+            velocity_field_lagrange = fem.make_polynomial_space(
+                self.velocity_field.geometry, dtype=wp.vec3, degree=self._degree
+            ).make_field()
+            fem.interpolate(self.velocity_field, dest=velocity_field_lagrange)
+        else:
+            velocity_field_lagrange = self.velocity_field
+        self.plot.add_field("velocity", velocity_field_lagrange)
     def step(self):
         u_space = self.velocity_field.space
@@ -153,9 +170,14 @@ class Example:
         fem.normalize_dirichlet_projector(dirichlet_projector)
         # (Diagonal) mass matrix
-        s_space = fem.make_collocated_function_space(self._u_basis, dtype=float)
-        rho_test = fem.make_test(s_space)
-        rho_trial = fem.make_trial(s_space)
+        if self._div_conforming:
+            rho_test = fem.make_test(u_space)
+            rho_trial = fem.make_trial(u_space)
+        else:
+            rho_space = fem.make_polynomial_space(geo=u_space.geometry, degree=self._degree)
+            rho_test = fem.make_test(rho_space)
+            rho_trial = fem.make_trial(rho_space)
         inv_mass_matrix = fem.integrate(
             mass_form, fields={"u": rho_trial, "v": rho_test}, nodal=True, output_dtype=float
         )
@@ -177,6 +199,7 @@ class Example:
             side_divergence_form,
             fields={"u": u_side_trial, "psi": p_side_test},
             output_dtype=float,
+            assembly="generic",  # not required, for test coverage purposes
         )
         # Solve incompressibility
@@ -204,7 +227,10 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("--device", type=str, default=None, help="Override the default Warp device.")
     parser.add_argument("--resolution", type=int, default=8, help="Grid resolution.")
-    parser.add_argument("--degree", type=int, default=2, help="Polynomial degree of shape functions.")
+    parser.add_argument("--degree", type=int, default=1, help="Polynomial degree of shape functions.")
+    parser.add_argument(
+        "--div_conforming", action="store_true", default=False, help="Use H(div)-conforming function space"
+    )
     parser.add_argument("--level_count", type=int, default=4, help="Number of refinement levels.")
     parser.add_argument(
         "--headless",
@@ -219,6 +245,7 @@ if __name__ == "__main__":
         example = Example(
             quiet=args.quiet,
             degree=args.degree,
+            div_conforming=args.div_conforming,
             base_resolution=args.resolution,
             level_count=args.level_count,
             headless=args.headless,

warp/examples/fem/example_apic_fluid.py CHANGED Viewed

@@ -12,6 +12,8 @@
 # grid and the PicQuadrature class.
 ###########################################################################
+from typing import Any
 import numpy as np
 import warp as wp
@@ -123,7 +125,7 @@ def scalar_vector_multiply(
 @wp.kernel
 def scale_transposed_divergence_mat(
     tr_divergence_mat_offsets: wp.array(dtype=int),
-    tr_divergence_mat_values: wp.array(dtype=wp.mat(shape=(3, 1), dtype=float)),
+    tr_divergence_mat_values: wp.array(dtype=Any),
     inv_fraction_int: wp.array(dtype=float),
 ):
     # In-place scaling of gradient operator rows with inverse mass
@@ -203,7 +205,6 @@ class Example:
         particle_grid_offset = wp.vec3(self.radius, self.radius, self.radius)
         # Initialize warp.sim model, spawn particles
-        np.random.seed(0)
         builder = wp.sim.ModelBuilder()
         builder.add_particle_grid(
             dim_x=particle_grid_res[0],