warp-lang 1.4.1__py3-none-manylinux2014_aarch64.whl → 1.5.0__py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic.

Files changed (164)
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +21 -2
  6. warp/build_dll.py +23 -6
  7. warp/builtins.py +1920 -111
  8. warp/codegen.py +186 -62
  9. warp/config.py +2 -2
  10. warp/context.py +322 -73
  11. warp/examples/assets/pixel.jpg +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  13. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  14. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  15. warp/examples/benchmarks/benchmark_tile.py +179 -0
  16. warp/examples/core/example_dem.py +2 -1
  17. warp/examples/core/example_mesh_intersect.py +3 -3
  18. warp/examples/fem/example_adaptive_grid.py +37 -10
  19. warp/examples/fem/example_apic_fluid.py +3 -2
  20. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  21. warp/examples/fem/example_deformed_geometry.py +1 -1
  22. warp/examples/fem/example_diffusion_3d.py +47 -4
  23. warp/examples/fem/example_distortion_energy.py +220 -0
  24. warp/examples/fem/example_magnetostatics.py +127 -85
  25. warp/examples/fem/example_nonconforming_contact.py +5 -5
  26. warp/examples/fem/example_stokes.py +3 -1
  27. warp/examples/fem/example_streamlines.py +12 -19
  28. warp/examples/fem/utils.py +38 -15
  29. warp/examples/optim/example_walker.py +2 -2
  30. warp/examples/sim/example_cloth.py +2 -25
  31. warp/examples/sim/example_jacobian_ik.py +6 -2
  32. warp/examples/sim/example_quadruped.py +2 -1
  33. warp/examples/tile/example_tile_convolution.py +58 -0
  34. warp/examples/tile/example_tile_fft.py +47 -0
  35. warp/examples/tile/example_tile_filtering.py +105 -0
  36. warp/examples/tile/example_tile_matmul.py +79 -0
  37. warp/examples/tile/example_tile_mlp.py +375 -0
  38. warp/fem/__init__.py +8 -0
  39. warp/fem/cache.py +16 -12
  40. warp/fem/dirichlet.py +1 -1
  41. warp/fem/domain.py +44 -1
  42. warp/fem/field/__init__.py +1 -2
  43. warp/fem/field/field.py +31 -19
  44. warp/fem/field/nodal_field.py +101 -49
  45. warp/fem/field/virtual.py +794 -0
  46. warp/fem/geometry/__init__.py +2 -2
  47. warp/fem/geometry/deformed_geometry.py +3 -105
  48. warp/fem/geometry/element.py +13 -0
  49. warp/fem/geometry/geometry.py +165 -5
  50. warp/fem/geometry/grid_2d.py +3 -6
  51. warp/fem/geometry/grid_3d.py +31 -28
  52. warp/fem/geometry/hexmesh.py +3 -46
  53. warp/fem/geometry/nanogrid.py +3 -2
  54. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  55. warp/fem/geometry/tetmesh.py +2 -43
  56. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  57. warp/fem/integrate.py +683 -261
  58. warp/fem/linalg.py +404 -0
  59. warp/fem/operator.py +101 -18
  60. warp/fem/polynomial.py +5 -5
  61. warp/fem/quadrature/quadrature.py +45 -21
  62. warp/fem/space/__init__.py +45 -11
  63. warp/fem/space/basis_function_space.py +451 -0
  64. warp/fem/space/basis_space.py +58 -11
  65. warp/fem/space/function_space.py +146 -5
  66. warp/fem/space/grid_2d_function_space.py +80 -66
  67. warp/fem/space/grid_3d_function_space.py +113 -68
  68. warp/fem/space/hexmesh_function_space.py +96 -108
  69. warp/fem/space/nanogrid_function_space.py +62 -110
  70. warp/fem/space/quadmesh_function_space.py +208 -0
  71. warp/fem/space/shape/__init__.py +45 -7
  72. warp/fem/space/shape/cube_shape_function.py +328 -54
  73. warp/fem/space/shape/shape_function.py +10 -1
  74. warp/fem/space/shape/square_shape_function.py +328 -60
  75. warp/fem/space/shape/tet_shape_function.py +269 -19
  76. warp/fem/space/shape/triangle_shape_function.py +238 -19
  77. warp/fem/space/tetmesh_function_space.py +69 -37
  78. warp/fem/space/topology.py +38 -0
  79. warp/fem/space/trimesh_function_space.py +179 -0
  80. warp/fem/utils.py +6 -331
  81. warp/jax_experimental.py +3 -1
  82. warp/native/array.h +55 -40
  83. warp/native/builtin.h +124 -43
  84. warp/native/bvh.h +4 -0
  85. warp/native/coloring.cpp +600 -0
  86. warp/native/cuda_util.cpp +14 -0
  87. warp/native/cuda_util.h +2 -1
  88. warp/native/fabric.h +8 -0
  89. warp/native/hashgrid.h +4 -0
  90. warp/native/marching.cu +8 -0
  91. warp/native/mat.h +14 -3
  92. warp/native/mathdx.cpp +59 -0
  93. warp/native/mesh.h +4 -0
  94. warp/native/range.h +13 -1
  95. warp/native/reduce.cpp +9 -1
  96. warp/native/reduce.cu +7 -0
  97. warp/native/runlength_encode.cpp +9 -1
  98. warp/native/runlength_encode.cu +7 -1
  99. warp/native/scan.cpp +8 -0
  100. warp/native/scan.cu +8 -0
  101. warp/native/scan.h +8 -1
  102. warp/native/sparse.cpp +8 -0
  103. warp/native/sparse.cu +8 -0
  104. warp/native/temp_buffer.h +7 -0
  105. warp/native/tile.h +1857 -0
  106. warp/native/tile_gemm.h +341 -0
  107. warp/native/tile_reduce.h +210 -0
  108. warp/native/volume_builder.cu +8 -0
  109. warp/native/volume_builder.h +8 -0
  110. warp/native/warp.cpp +10 -2
  111. warp/native/warp.cu +369 -15
  112. warp/native/warp.h +12 -2
  113. warp/optim/adam.py +39 -4
  114. warp/paddle.py +29 -12
  115. warp/render/render_opengl.py +137 -65
  116. warp/sim/graph_coloring.py +292 -0
  117. warp/sim/integrator_euler.py +4 -2
  118. warp/sim/integrator_featherstone.py +115 -44
  119. warp/sim/integrator_vbd.py +6 -0
  120. warp/sim/model.py +90 -17
  121. warp/stubs.py +651 -85
  122. warp/tape.py +12 -7
  123. warp/tests/assets/pixel.npy +0 -0
  124. warp/tests/aux_test_instancing_gc.py +18 -0
  125. warp/tests/test_array.py +207 -48
  126. warp/tests/test_closest_point_edge_edge.py +8 -8
  127. warp/tests/test_codegen.py +120 -1
  128. warp/tests/test_codegen_instancing.py +30 -0
  129. warp/tests/test_collision.py +110 -0
  130. warp/tests/test_coloring.py +241 -0
  131. warp/tests/test_context.py +34 -0
  132. warp/tests/test_examples.py +18 -4
  133. warp/tests/test_fabricarray.py +33 -0
  134. warp/tests/test_fem.py +453 -113
  135. warp/tests/test_func.py +48 -1
  136. warp/tests/test_generics.py +52 -0
  137. warp/tests/test_iter.py +68 -0
  138. warp/tests/test_mat_scalar_ops.py +1 -1
  139. warp/tests/test_mesh_query_point.py +5 -4
  140. warp/tests/test_module_hashing.py +23 -0
  141. warp/tests/test_paddle.py +27 -87
  142. warp/tests/test_print.py +191 -1
  143. warp/tests/test_spatial.py +1 -1
  144. warp/tests/test_tile.py +700 -0
  145. warp/tests/test_tile_mathdx.py +144 -0
  146. warp/tests/test_tile_mlp.py +383 -0
  147. warp/tests/test_tile_reduce.py +374 -0
  148. warp/tests/test_tile_shared_memory.py +190 -0
  149. warp/tests/test_vbd.py +12 -20
  150. warp/tests/test_volume.py +43 -0
  151. warp/tests/unittest_suites.py +23 -2
  152. warp/tests/unittest_utils.py +4 -0
  153. warp/types.py +339 -73
  154. warp/utils.py +22 -1
  155. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
  156. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/RECORD +159 -132
  157. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
  158. warp/fem/field/test.py +0 -180
  159. warp/fem/field/trial.py +0 -183
  160. warp/fem/space/collocated_function_space.py +0 -102
  161. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  162. warp/fem/space/trimesh_2d_function_space.py +0 -153
  163. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
  164. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
warp/examples/tile/example_tile_convolution.py ADDED
@@ -0,0 +1,58 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ ###########################################################################
+ # Example Tile Convolution
+ #
+ # Shows how to write a simple convolution kernel using Warp FFT tile
+ # primitives.
+ #
+ ###########################################################################
+
+ import numpy as np
+
+ import warp as wp
+
+ wp.set_module_options({"enable_backward": False})
+
+ BLOCK_DIM = 64
+ TILE_M = 1
+ TILE_N = 128
+
+ scale = wp.vec2d(wp.float64(1 / TILE_N), wp.float64(1 / TILE_N))
+
+
+ @wp.func
+ def filter(x: wp.vec2d):
+     return wp.cw_mul(x, scale)
+
+
+ @wp.kernel
+ def conv_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d)):
+     i, j, _ = wp.tid()
+     a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N)
+     wp.tile_fft(a)
+     b = wp.tile_map(filter, a)
+     wp.tile_ifft(b)
+     wp.tile_store(y, i, j, b)
+
+
+ if __name__ == "__main__":
+     wp.set_device("cuda:0")
+
+     rng = np.random.default_rng(42)
+
+     x_h = rng.standard_normal((TILE_M, TILE_N, 2), dtype=np.float64)
+     y_h = np.zeros_like(x_h)
+
+     x_wp = wp.array2d(x_h, dtype=wp.vec2d)
+     y_wp = wp.array2d(y_h, dtype=wp.vec2d)
+
+     wp.launch_tiled(conv_tiled, dim=[1, 1], inputs=[x_wp], outputs=[y_wp], block_dim=BLOCK_DIM)
+
+     # Since filter is 1/N, conv_tiled is a ~no-op
+     assert np.allclose(x_h, y_wp.numpy())
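
The closing assertion relies on the transform pair being unnormalized, so that a uniform 1/N filter applied between the forward and inverse FFT recovers the input. A minimal NumPy sketch of that identity (the unnormalized convention is an assumption read off the example's own comment, not documented here):

# Round-trip identity assumed by the example: FFT, scale by 1/N, unnormalized IFFT.
import numpy as np

N = 128
x = np.random.default_rng(42).standard_normal(N) + 0j

X = np.fft.fft(x)        # unnormalized forward transform
Y = X * (1.0 / N)        # the example's "filter": uniform 1/N scaling
y = np.fft.ifft(Y) * N   # unnormalized inverse (NumPy's ifft already divides by N)

assert np.allclose(x, y)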
warp/examples/tile/example_tile_fft.py ADDED
@@ -0,0 +1,47 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ ###########################################################################
+ # Example Tile FFT
+ #
+ # Shows how to write a simple FFT kernel using Warp tile primitives.
+ #
+ ###########################################################################
+
+ import numpy as np
+
+ import warp as wp
+
+ wp.set_module_options({"enable_backward": False})
+
+ BLOCK_DIM = 8
+ TILE_M = 1
+ TILE_N = 32
+
+
+ @wp.kernel
+ def fft_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d)):
+     i, j, _ = wp.tid()
+     a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N)
+     wp.tile_fft(a)
+     wp.tile_ifft(a)
+     wp.tile_store(y, i, j, a)
+
+
+ if __name__ == "__main__":
+     wp.set_device("cuda:0")
+
+     x_h = np.ones((TILE_M, TILE_N, 2), dtype=np.float64)
+     x_h[:, :, 1] = 0
+     y_h = 3 * np.ones((TILE_M, TILE_N, 2), dtype=np.float64)
+     x_wp = wp.array2d(x_h, dtype=wp.vec2d)
+     y_wp = wp.array2d(y_h, dtype=wp.vec2d)
+
+     wp.launch_tiled(fft_tiled, dim=[1, 1], inputs=[x_wp], outputs=[y_wp], block_dim=BLOCK_DIM)
+
+     print("Inputs:\n", x_wp) # [1+0i, 1+0i, 1+0i, ...]
+     print("Output:\n", y_wp) # [32+0i, 0, 0, ...]
warp/examples/tile/example_tile_filtering.py ADDED
@@ -0,0 +1,105 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ ###########################################################################
+ # Example Tile Filtering
+ #
+ # Shows how to write a simple filtering kernel using Warp FFT tile
+ # primitives.
+ #
+ ###########################################################################
+
+ import numpy as np
+
+ import warp as wp
+
+ wp.set_module_options({"enable_backward": False})
+
+ BLOCK_DIM = 128
+ TILE_M = 1
+ TILE_N = 512
+
+ scale = wp.vec2d(wp.float64(1 / TILE_N), wp.float64(1 / TILE_N))
+
+
+ def cplx(array):
+     return array[..., 0] + 1j * array[..., 1]
+
+
+ @wp.func
+ def cplx_prod(x: wp.vec2d, y: wp.vec2d):
+     return wp.cw_mul(wp.vec2d(x[0] * y[0] - x[1] * y[1], x[0] * y[1] + x[1] * y[0]), scale)
+
+
+ @wp.kernel
+ def conv_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d), z: wp.array2d(dtype=wp.vec2d)):
+     i, j, _ = wp.tid()
+     a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N)
+     b = wp.tile_load(y, i, j, m=TILE_M, n=TILE_N)
+     wp.tile_fft(a)
+     c = wp.tile_map(cplx_prod, a, b)
+     wp.tile_ifft(c)
+     wp.tile_store(z, i, j, c)
+
+
+ if __name__ == "__main__":
+     rng = np.random.default_rng(42)
+
+     # Create noisy input signal
+     t = np.linspace(0, 2 * np.pi, TILE_N, dtype=np.float64)
+     x = np.sin(t) + 0.5 * rng.random(TILE_N, dtype=np.float64)
+
+     # Create filter. This filter keeps only ~10% of the frequencies at the center
+     # of the spectrum.
+     f = np.ones_like(x)
+     freq = np.fft.fftfreq(TILE_N)
+     f[np.abs(freq) > 0.05] = 0.0
+     f[np.abs(freq) <= 0.05] = 1.0
+
+     # Create Warp input data
+     # We use vec2d to hold complex numbers
+     x_h = np.zeros((TILE_M, TILE_N, 2), dtype=np.float64)
+     f_h = np.zeros_like(x_h)
+     y_h = np.zeros_like(f_h)
+
+     x_h[:, :, 0] = x
+     f_h[:, :, 0] = f
+
+     x_wp = wp.array2d(x_h, dtype=wp.vec2d)
+     f_wp = wp.array2d(f_h, dtype=wp.vec2d)
+     y_wp = wp.array2d(y_h, dtype=wp.vec2d)
+
+     wp.launch_tiled(conv_tiled, dim=[1, 1], inputs=[x_wp, f_wp], outputs=[y_wp], block_dim=BLOCK_DIM)
+
+     # Extract output and compare with numpy
+     x_np = cplx(x_h)
+     f_np = cplx(f_h)
+     y_test = cplx(y_wp.numpy())
+     y_ref = np.fft.ifft(f_np * np.fft.fft(x_np))
+     assert np.allclose(y_ref, y_test)
+
+     try:
+         import matplotlib.pyplot as plt
+
+         fig, ax = plt.subplots(figsize=(10, 5))
+
+         ax.plot(
+             x,
+             color="#DDDDDD",
+             linewidth=2,
+             label="Original",
+         )
+         ax.plot(y_test[0, :].real, color="#76B900", linewidth=3, label="Smoothed")
+
+         ax.legend()
+         ax.grid(True)
+
+         plt.tight_layout()
+         plt.show()
+
+     except ModuleNotFoundError:
+         print("Matplotlib not available; skipping figure")
warp/examples/tile/example_tile_matmul.py ADDED
@@ -0,0 +1,79 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ ###########################################################################
+ # Example Tile MatMul
+ #
+ # Shows how to write a simple GEMM kernel using Warp tile primitives.
+ #
+ ###########################################################################
+
+ import numpy as np
+
+ import warp as wp
+
+ # tile size
+ TILE_M = wp.constant(8)
+ TILE_N = wp.constant(4)
+ TILE_K = wp.constant(8)
+
+ # num threads per-tile
+ TILE_THREADS = 64
+
+
+ @wp.kernel
+ def tile_gemm(A: wp.array2d(dtype=wp.float32), B: wp.array2d(dtype=wp.float16), C: wp.array2d(dtype=wp.float64)):
+     # output tile index
+     i, j = wp.tid()
+
+     sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float64)
+
+     _M = A.shape[0]
+     _N = B.shape[1]
+     K = A.shape[1]
+
+     count = int(K / TILE_K)
+
+     for k in range(0, count):
+         a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
+         b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
+
+         # sum += a*b
+         wp.tile_matmul(a, b, sum)
+
+     wp.tile_store(C, i, j, sum)
+
+
+ if __name__ == "__main__":
+     wp.set_device("cuda:0")
+
+     # generate some tile aligned matrix dimensions
+     M = TILE_M * 7
+     K = TILE_K * 6
+     N = TILE_N * 5
+
+     rng = np.random.default_rng(42)
+     A = rng.random((M, K), dtype=np.float32)
+     B = rng.random((K, N), dtype=np.float32).astype(np.float16)
+     C = np.zeros((M, N), dtype=np.float64)
+
+     A_wp = wp.array(A, requires_grad=True)
+     B_wp = wp.array(B, requires_grad=True)
+     C_wp = wp.array(C, requires_grad=True)
+
+     with wp.Tape() as tape:
+         wp.launch_tiled(
+             tile_gemm,
+             dim=(int(M / TILE_M), int(N / TILE_N)),
+             inputs=[A_wp, B_wp],
+             outputs=[C_wp],
+             block_dim=TILE_THREADS,
+         )
+
+     assert np.allclose(C_wp.numpy(), A @ B)
+
+     print("Example matrix multiplication passed")
warp/examples/tile/example_tile_mlp.py ADDED
@@ -0,0 +1,375 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ ###########################################################################
+ # Example Image Multilayer Perceptron (MLP)
+ #
+ # Shows how to train a coordinate-based MLP on an image to predict the RGB
+ # color at a given input position. By default, a positional encoding is
+ # applied to the input coordinates to improve the ability of the MLP to
+ # represent higher-frequency content. This can be disabled by passing the
+ # '--no_encoding' option.
+ #
+ # References:
+ # Ben Mildenhall et al. 2021. NeRF: representing scenes
+ # as neural radiance fields for view synthesis. Commun. ACM 65, 1
+ # (January 2022), 99–106. https://doi.org/10.1145/3503250
+ #
+ ###########################################################################
+
+ import math
+ import os
+
+ import numpy as np
+ from PIL import Image
+
+ import warp as wp
+ import warp.examples
+ import warp.optim
+
+ rng = np.random.default_rng(45)
+
+
+ def create_layer(dim_in, dim_hid, dtype=float):
+     w = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in))
+     b = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, 1))
+
+     weights = wp.array(w, dtype=dtype, requires_grad=True)
+     bias = wp.array(b, dtype=dtype, requires_grad=True)
+
+     return (weights, bias)
+
+
+ def create_array(dim_in, dim_hid, dtype=float):
+     s = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in))
+     a = wp.array(s, dtype=dtype, requires_grad=True)
+
+     return a
+
+
+ # number of frequencies for the positional encoding
+ NUM_FREQ = wp.constant(8)
+
+ DIM_IN = wp.constant(4 * NUM_FREQ)  # sin,cos for both x,y at each frequency
+ DIM_HID = 32
+ DIM_OUT = 3
+
+ # threads per-block
+ NUM_THREADS = 32
+
+ IMG_WIDTH = 512
+ IMG_HEIGHT = 512
+
+ BATCH_SIZE = min(1024, int((IMG_WIDTH * IMG_HEIGHT) / 8))
+
+ # dtype for our weights and bias matrices
+ dtype = wp.float16
+
+
+ @wp.func
+ def relu(x: dtype):
+     return wp.max(x, dtype(0.0))
+
+
+ @wp.kernel
+ def compute(
+     indices: wp.array(dtype=int),
+     weights_0: wp.array2d(dtype=dtype),
+     bias_0: wp.array2d(dtype=dtype),
+     weights_1: wp.array2d(dtype=dtype),
+     bias_1: wp.array2d(dtype=dtype),
+     weights_2: wp.array2d(dtype=dtype),
+     bias_2: wp.array2d(dtype=dtype),
+     weights_3: wp.array2d(dtype=dtype),
+     bias_3: wp.array2d(dtype=dtype),
+     reference: wp.array2d(dtype=float),
+     loss: wp.array1d(dtype=float),
+     out: wp.array2d(dtype=float),
+ ):
+     # batch indices
+     linear = indices[wp.tid()]
+
+     row = linear / IMG_WIDTH
+     col = linear % IMG_WIDTH
+
+     # normalize input coordinates to [-1, 1]
+     x = (float(row) / float(IMG_WIDTH) - 0.5) * 2.0
+     y = (float(col) / float(IMG_HEIGHT) - 0.5) * 2.0
+
+     local = wp.vector(dtype=dtype, length=DIM_IN)
+
+     # construct positional encoding
+     for s in range(NUM_FREQ):
+         scale = wp.pow(2.0, float(s)) * wp.pi
+
+         # x-coord
+         local[s * 4 + 0] = dtype(wp.sin(x * scale))
+         local[s * 4 + 1] = dtype(wp.cos(x * scale))
+         # y-coord
+         local[s * 4 + 2] = dtype(wp.sin(y * scale))
+         local[s * 4 + 3] = dtype(wp.cos(y * scale))
+
+     # tile feature vectors across the block, returns [dim(f), NUM_THREADS]
+     f = wp.tile(local)
+
+     # input layer
+     w0 = wp.tile_load(weights_0, 0, 0, m=DIM_HID, n=DIM_IN)
+     b0 = wp.tile_load(bias_0, 0, 0, m=DIM_HID, n=1)
+     z = wp.tile_map(relu, wp.tile_matmul(w0, f) + wp.tile_broadcast(b0, m=DIM_HID, n=NUM_THREADS))
+
+     # hidden layer
+     w1 = wp.tile_load(weights_1, 0, 0, m=DIM_HID, n=DIM_HID)
+     b1 = wp.tile_load(bias_1, 0, 0, m=DIM_HID, n=1)
+     z = wp.tile_map(relu, wp.tile_matmul(w1, z) + wp.tile_broadcast(b1, m=DIM_HID, n=NUM_THREADS))
+
+     w2 = wp.tile_load(weights_2, 0, 0, m=DIM_HID, n=DIM_HID)
+     b2 = wp.tile_load(bias_2, 0, 0, m=DIM_HID, n=1)
+     z = wp.tile_map(relu, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, m=DIM_HID, n=NUM_THREADS))
+
+     # output layer
+     w3 = wp.tile_load(weights_3, 0, 0, m=DIM_OUT, n=DIM_HID)
+     b3 = wp.tile_load(bias_3, 0, 0, m=DIM_OUT, n=1)
+     o = wp.tile_map(relu, wp.tile_matmul(w3, z) + wp.tile_broadcast(b3, m=DIM_OUT, n=NUM_THREADS))
+
+     # untile back to SIMT
+     output = wp.untile(o)
+
+     # compute error
+     error = wp.vec3(
+         float(output[0]) - reference[0, linear],
+         float(output[1]) - reference[1, linear],
+         float(output[2]) - reference[2, linear],
+     )
+
+     # write MSE loss
+     if loss:
+         wp.atomic_add(loss, 0, wp.length_sq(error) / float(3 * BATCH_SIZE))
+
+     # write image output
+     if out:
+         for i in range(DIM_OUT):
+             out[i, linear] = float(output[i])
+
+
+ class Example:
+     def __init__(self, train_iters):
+         self.weights_0, self.bias_0 = create_layer(DIM_IN, DIM_HID, dtype=dtype)
+         self.weights_1, self.bias_1 = create_layer(DIM_HID, DIM_HID, dtype=dtype)
+         self.weights_2, self.bias_2 = create_layer(DIM_HID, DIM_HID, dtype=dtype)
+         self.weights_3, self.bias_3 = create_layer(DIM_HID, DIM_OUT, dtype=dtype)
+
+         # reference
+         reference_path = os.path.join(wp.examples.get_asset_directory(), "pixel.jpg")
+         with Image.open(reference_path) as im:
+             reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) / 255.0
+         self.reference = wp.array(reference_image.reshape(IMG_WIDTH * IMG_HEIGHT, 3).T, dtype=float)
+
+         # create randomized batch indices
+         indices = np.arange(0, IMG_WIDTH * IMG_HEIGHT, dtype=np.int32)
+         rng.shuffle(indices)
+         self.indices = wp.array(indices)
+
+         self.num_batches = int((IMG_WIDTH * IMG_HEIGHT) / BATCH_SIZE)
+         self.max_iters = train_iters
+         self.max_epochs = max(1, int(self.max_iters / self.num_batches))
+
+     def train_warp(self):
+         params = [
+             self.weights_0,
+             self.bias_0,
+             self.weights_1,
+             self.bias_1,
+             self.weights_2,
+             self.bias_2,
+             self.weights_3,
+             self.bias_3,
+         ]
+
+         optimizer_grads = [p.grad.flatten() for p in params]
+         optimizer_inputs = [p.flatten() for p in params]
+         optimizer = warp.optim.Adam(optimizer_inputs, lr=0.01)
+
+         loss = wp.zeros(1, dtype=float, requires_grad=True)
+         output = create_array(IMG_WIDTH * IMG_HEIGHT, DIM_OUT)
+
+         # capture graph for whole epoch
+         wp.capture_begin()
+
+         for b in range(0, IMG_WIDTH * IMG_HEIGHT, BATCH_SIZE):
+             loss.zero_()
+
+             with wp.Tape() as tape:
+                 wp.launch(
+                     compute,
+                     dim=[BATCH_SIZE],
+                     inputs=[
+                         self.indices[b : b + BATCH_SIZE],
+                         self.weights_0,
+                         self.bias_0,
+                         self.weights_1,
+                         self.bias_1,
+                         self.weights_2,
+                         self.bias_2,
+                         self.weights_3,
+                         self.bias_3,
+                         self.reference,
+                         loss,
+                         None,
+                     ],
+                     block_dim=NUM_THREADS,
+                 )
+
+             tape.backward(loss)
+             optimizer.step(optimizer_grads)
+             tape.zero()
+
+         graph = wp.capture_end()
+
+         with wp.ScopedTimer("Training"):
+             for i in range(self.max_epochs):
+                 with wp.ScopedTimer("Epoch"):
+                     wp.capture_launch(graph)
+                     print(f"Epoch: {i} Loss: {loss.numpy()}")
+
+         # evaluate full image
+         wp.launch(
+             compute,
+             dim=[IMG_WIDTH * IMG_HEIGHT],
+             inputs=[
+                 self.indices,
+                 self.weights_0,
+                 self.bias_0,
+                 self.weights_1,
+                 self.bias_1,
+                 self.weights_2,
+                 self.bias_2,
+                 self.weights_3,
+                 self.bias_3,
+                 self.reference,
+                 loss,
+                 output,
+             ],
+             block_dim=NUM_THREADS,
+         )
+
+         self.save_image("example_tile_mlp.jpg", output.numpy())
+
+     def train_torch(self):
+         import torch as tc
+
+         weights_0 = tc.nn.Parameter(wp.to_torch(self.weights_0))
+         weights_1 = tc.nn.Parameter(wp.to_torch(self.weights_1))
+         weights_2 = tc.nn.Parameter(wp.to_torch(self.weights_2))
+         weights_3 = tc.nn.Parameter(wp.to_torch(self.weights_3))
+
+         bias_0 = tc.nn.Parameter(wp.to_torch(self.bias_0))
+         bias_1 = tc.nn.Parameter(wp.to_torch(self.bias_1))
+         bias_2 = tc.nn.Parameter(wp.to_torch(self.bias_2))
+         bias_3 = tc.nn.Parameter(wp.to_torch(self.bias_3))
+
+         indices = wp.to_torch(self.indices)
+         reference = wp.to_torch(self.reference)
+
+         optimizer = tc.optim.Adam(
+             [weights_0, bias_0, weights_1, bias_1, weights_2, bias_2, weights_3, bias_3],
+             capturable=True,
+             lr=0.0001,
+             betas=(0.9, 0.95),
+             eps=1.0e-6,
+         )
+
+         # generate frequency space encoding of pixels
+         # based on their linear index in the image
+         def encode(linear):
+             row = (linear // IMG_WIDTH).float()
+             col = (linear % IMG_WIDTH).float()
+
+             x = (row / float(IMG_WIDTH) - 0.5) * 2.0
+             y = (col / float(IMG_HEIGHT) - 0.5) * 2.0
+
+             encoding = tc.zeros((NUM_FREQ * 4, len(linear)), dtype=tc.float16, device="cuda")
+
+             for s in range(NUM_FREQ):
+                 scale = math.pow(2.0, float(s)) * math.pi
+
+                 # Directly write the computed values into the encoding tensor
+                 encoding[s * 4 + 0, :] = tc.sin(scale * x)
+                 encoding[s * 4 + 1, :] = tc.cos(scale * x)
+                 encoding[s * 4 + 2, :] = tc.sin(scale * y)
+                 encoding[s * 4 + 3, :] = tc.cos(scale * y)
+
+             return encoding
+
+         stream = tc.cuda.Stream()
+         graph = tc.cuda.CUDAGraph()
+
+         # warm-up
+         with tc.cuda.stream(stream):
+             f = tc.rand((NUM_FREQ * 4, BATCH_SIZE), dtype=tc.float16, device="cuda")
+             z = tc.relu(weights_0 @ f + bias_0)
+             z = tc.relu(weights_1 @ z + bias_1)
+             z = tc.relu(weights_2 @ z + bias_2)
+             z = tc.relu(weights_3 @ z + bias_3)
+             ref = tc.rand((3, BATCH_SIZE), dtype=tc.float16, device="cuda")
+             loss = tc.mean((z - ref) ** 2)
+             optimizer.zero_grad()
+             loss.backward()
+             optimizer.step()
+
+         with tc.cuda.graph(graph):
+             for b in range(0, IMG_WIDTH * IMG_HEIGHT, BATCH_SIZE):
+                 linear = indices[b : b + BATCH_SIZE]
+
+                 f = encode(linear)
+
+                 z = tc.relu(weights_0 @ f + bias_0)
+                 z = tc.relu(weights_1 @ z + bias_1)
+                 z = tc.relu(weights_2 @ z + bias_2)
+                 z = tc.relu(weights_3 @ z + bias_3)
+
+                 ref = reference[:, linear]
+                 loss = tc.mean((z - ref) ** 2)
+
+                 optimizer.zero_grad()
+                 loss.backward()
+                 optimizer.step()
+
+         with wp.ScopedTimer("Training (Torch)"):
+             for _i in range(self.max_epochs):
+                 with wp.ScopedTimer("Epoch"):
+                     graph.replay()
+
+                     print(loss)
+
+         f = encode(tc.arange(0, IMG_WIDTH * IMG_HEIGHT))
+         z = tc.relu(weights_0 @ f + bias_0)
+         z = tc.relu(weights_1 @ z + bias_1)
+         z = tc.relu(weights_2 @ z + bias_2)
+         z = tc.relu(weights_3 @ z + bias_3)
+
+         self.save_image("example_tile_mlp_torch.jpg", z.detach().cpu().numpy())
+
+     def save_image(self, name, output):
+         predicted_image = output.T.reshape(IMG_WIDTH, IMG_HEIGHT, 3)
+         predicted_image = (predicted_image * 255).astype(np.uint8)
+
+         predicted_image_pil = Image.fromarray(predicted_image)
+         predicted_image_pil.save(name)
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+     parser.add_argument("--train_iters", type=int, default=20000, help="Total number of training iterations.")
+
+     args = parser.parse_known_args()[0]
+
+     with wp.ScopedDevice("cuda:0"):
+         example = Example(args.train_iters)
+         example.train_warp()
+         # example.train_torch()
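
The positional encoding built inside the compute kernel maps each pixel's normalized (x, y) coordinates to sin/cos features at NUM_FREQ octave frequencies, giving DIM_IN = 4 * NUM_FREQ inputs per pixel. A NumPy sketch of the same encoding for a single pixel index (mirroring the kernel's constants, not part of the package):

# Positional encoding for one linear pixel index: 4 features per frequency octave.
import numpy as np

NUM_FREQ, IMG_WIDTH, IMG_HEIGHT = 8, 512, 512

def encode(linear):
    row, col = linear // IMG_WIDTH, linear % IMG_WIDTH
    x = (row / IMG_WIDTH - 0.5) * 2.0   # normalize to [-1, 1]
    y = (col / IMG_HEIGHT - 0.5) * 2.0
    feats = np.empty(4 * NUM_FREQ)
    for s in range(NUM_FREQ):
        scale = (2.0 ** s) * np.pi
        feats[s * 4 + 0] = np.sin(x * scale)
        feats[s * 4 + 1] = np.cos(x * scale)
        feats[s * 4 + 2] = np.sin(y * scale)
        feats[s * 4 + 3] = np.cos(y * scale)
    return feats

print(encode(12345).shape)  # (32,) -> matches DIM_IN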
warp/fem/__init__.py CHANGED
@@ -24,14 +24,17 @@ from .geometry import (
      LinearGeometryPartition,
      Nanogrid,
      Quadmesh2D,
+     Quadmesh3D,
      Tetmesh,
      Trimesh2D,
+     Trimesh3D,
  )
  from .integrate import integrate, interpolate
  from .operator import (
      D,
      at_node,
      average,
+     cells,
      curl,
      deformation_gradient,
      degree,
@@ -50,6 +53,9 @@ from .operator import (
      normal,
      outer,
      position,
+     to_cell_side,
+     to_inner_cell,
+     to_outer_cell,
  )
  from .polynomial import Polynomial
  from .quadrature import ExplicitQuadrature, NodalQuadrature, PicQuadrature, Quadrature, RegularQuadrature
@@ -65,6 +71,8 @@ from .space import (
      SpaceTopology,
      SymmetricTensorMapper,
      make_collocated_function_space,
+     make_contravariant_function_space,
+     make_covariant_function_space,
      make_polynomial_basis_space,
      make_polynomial_space,
      make_space_partition,
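
For a quick smoke test of this change, the newly exported names (taken verbatim from the hunks above) can be checked for presence on the warp.fem module; this sketch only verifies that the re-exports resolve, not their behavior:

# Check that the symbols added to warp/fem/__init__.py in 1.5.0 are importable.
import warp.fem as fem

new_exports = [
    "Quadmesh3D",
    "Trimesh3D",
    "cells",
    "to_cell_side",
    "to_inner_cell",
    "to_outer_cell",
    "make_contravariant_function_space",
    "make_covariant_function_space",
]

for name in new_exports:
    assert hasattr(fem, name), name
print("all new warp.fem exports resolved")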