warp-lang 1.4.1__py3-none-manylinux2014_aarch64.whl → 1.5.0__py3-none-manylinux2014_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic.

Files changed (164)
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +21 -2
  6. warp/build_dll.py +23 -6
  7. warp/builtins.py +1920 -111
  8. warp/codegen.py +186 -62
  9. warp/config.py +2 -2
  10. warp/context.py +322 -73
  11. warp/examples/assets/pixel.jpg +0 -0
  12. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  13. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  14. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  15. warp/examples/benchmarks/benchmark_tile.py +179 -0
  16. warp/examples/core/example_dem.py +2 -1
  17. warp/examples/core/example_mesh_intersect.py +3 -3
  18. warp/examples/fem/example_adaptive_grid.py +37 -10
  19. warp/examples/fem/example_apic_fluid.py +3 -2
  20. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  21. warp/examples/fem/example_deformed_geometry.py +1 -1
  22. warp/examples/fem/example_diffusion_3d.py +47 -4
  23. warp/examples/fem/example_distortion_energy.py +220 -0
  24. warp/examples/fem/example_magnetostatics.py +127 -85
  25. warp/examples/fem/example_nonconforming_contact.py +5 -5
  26. warp/examples/fem/example_stokes.py +3 -1
  27. warp/examples/fem/example_streamlines.py +12 -19
  28. warp/examples/fem/utils.py +38 -15
  29. warp/examples/optim/example_walker.py +2 -2
  30. warp/examples/sim/example_cloth.py +2 -25
  31. warp/examples/sim/example_jacobian_ik.py +6 -2
  32. warp/examples/sim/example_quadruped.py +2 -1
  33. warp/examples/tile/example_tile_convolution.py +58 -0
  34. warp/examples/tile/example_tile_fft.py +47 -0
  35. warp/examples/tile/example_tile_filtering.py +105 -0
  36. warp/examples/tile/example_tile_matmul.py +79 -0
  37. warp/examples/tile/example_tile_mlp.py +375 -0
  38. warp/fem/__init__.py +8 -0
  39. warp/fem/cache.py +16 -12
  40. warp/fem/dirichlet.py +1 -1
  41. warp/fem/domain.py +44 -1
  42. warp/fem/field/__init__.py +1 -2
  43. warp/fem/field/field.py +31 -19
  44. warp/fem/field/nodal_field.py +101 -49
  45. warp/fem/field/virtual.py +794 -0
  46. warp/fem/geometry/__init__.py +2 -2
  47. warp/fem/geometry/deformed_geometry.py +3 -105
  48. warp/fem/geometry/element.py +13 -0
  49. warp/fem/geometry/geometry.py +165 -5
  50. warp/fem/geometry/grid_2d.py +3 -6
  51. warp/fem/geometry/grid_3d.py +31 -28
  52. warp/fem/geometry/hexmesh.py +3 -46
  53. warp/fem/geometry/nanogrid.py +3 -2
  54. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  55. warp/fem/geometry/tetmesh.py +2 -43
  56. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  57. warp/fem/integrate.py +683 -261
  58. warp/fem/linalg.py +404 -0
  59. warp/fem/operator.py +101 -18
  60. warp/fem/polynomial.py +5 -5
  61. warp/fem/quadrature/quadrature.py +45 -21
  62. warp/fem/space/__init__.py +45 -11
  63. warp/fem/space/basis_function_space.py +451 -0
  64. warp/fem/space/basis_space.py +58 -11
  65. warp/fem/space/function_space.py +146 -5
  66. warp/fem/space/grid_2d_function_space.py +80 -66
  67. warp/fem/space/grid_3d_function_space.py +113 -68
  68. warp/fem/space/hexmesh_function_space.py +96 -108
  69. warp/fem/space/nanogrid_function_space.py +62 -110
  70. warp/fem/space/quadmesh_function_space.py +208 -0
  71. warp/fem/space/shape/__init__.py +45 -7
  72. warp/fem/space/shape/cube_shape_function.py +328 -54
  73. warp/fem/space/shape/shape_function.py +10 -1
  74. warp/fem/space/shape/square_shape_function.py +328 -60
  75. warp/fem/space/shape/tet_shape_function.py +269 -19
  76. warp/fem/space/shape/triangle_shape_function.py +238 -19
  77. warp/fem/space/tetmesh_function_space.py +69 -37
  78. warp/fem/space/topology.py +38 -0
  79. warp/fem/space/trimesh_function_space.py +179 -0
  80. warp/fem/utils.py +6 -331
  81. warp/jax_experimental.py +3 -1
  82. warp/native/array.h +55 -40
  83. warp/native/builtin.h +124 -43
  84. warp/native/bvh.h +4 -0
  85. warp/native/coloring.cpp +600 -0
  86. warp/native/cuda_util.cpp +14 -0
  87. warp/native/cuda_util.h +2 -1
  88. warp/native/fabric.h +8 -0
  89. warp/native/hashgrid.h +4 -0
  90. warp/native/marching.cu +8 -0
  91. warp/native/mat.h +14 -3
  92. warp/native/mathdx.cpp +59 -0
  93. warp/native/mesh.h +4 -0
  94. warp/native/range.h +13 -1
  95. warp/native/reduce.cpp +9 -1
  96. warp/native/reduce.cu +7 -0
  97. warp/native/runlength_encode.cpp +9 -1
  98. warp/native/runlength_encode.cu +7 -1
  99. warp/native/scan.cpp +8 -0
  100. warp/native/scan.cu +8 -0
  101. warp/native/scan.h +8 -1
  102. warp/native/sparse.cpp +8 -0
  103. warp/native/sparse.cu +8 -0
  104. warp/native/temp_buffer.h +7 -0
  105. warp/native/tile.h +1857 -0
  106. warp/native/tile_gemm.h +341 -0
  107. warp/native/tile_reduce.h +210 -0
  108. warp/native/volume_builder.cu +8 -0
  109. warp/native/volume_builder.h +8 -0
  110. warp/native/warp.cpp +10 -2
  111. warp/native/warp.cu +369 -15
  112. warp/native/warp.h +12 -2
  113. warp/optim/adam.py +39 -4
  114. warp/paddle.py +29 -12
  115. warp/render/render_opengl.py +137 -65
  116. warp/sim/graph_coloring.py +292 -0
  117. warp/sim/integrator_euler.py +4 -2
  118. warp/sim/integrator_featherstone.py +115 -44
  119. warp/sim/integrator_vbd.py +6 -0
  120. warp/sim/model.py +90 -17
  121. warp/stubs.py +651 -85
  122. warp/tape.py +12 -7
  123. warp/tests/assets/pixel.npy +0 -0
  124. warp/tests/aux_test_instancing_gc.py +18 -0
  125. warp/tests/test_array.py +207 -48
  126. warp/tests/test_closest_point_edge_edge.py +8 -8
  127. warp/tests/test_codegen.py +120 -1
  128. warp/tests/test_codegen_instancing.py +30 -0
  129. warp/tests/test_collision.py +110 -0
  130. warp/tests/test_coloring.py +241 -0
  131. warp/tests/test_context.py +34 -0
  132. warp/tests/test_examples.py +18 -4
  133. warp/tests/test_fabricarray.py +33 -0
  134. warp/tests/test_fem.py +453 -113
  135. warp/tests/test_func.py +48 -1
  136. warp/tests/test_generics.py +52 -0
  137. warp/tests/test_iter.py +68 -0
  138. warp/tests/test_mat_scalar_ops.py +1 -1
  139. warp/tests/test_mesh_query_point.py +5 -4
  140. warp/tests/test_module_hashing.py +23 -0
  141. warp/tests/test_paddle.py +27 -87
  142. warp/tests/test_print.py +191 -1
  143. warp/tests/test_spatial.py +1 -1
  144. warp/tests/test_tile.py +700 -0
  145. warp/tests/test_tile_mathdx.py +144 -0
  146. warp/tests/test_tile_mlp.py +383 -0
  147. warp/tests/test_tile_reduce.py +374 -0
  148. warp/tests/test_tile_shared_memory.py +190 -0
  149. warp/tests/test_vbd.py +12 -20
  150. warp/tests/test_volume.py +43 -0
  151. warp/tests/unittest_suites.py +23 -2
  152. warp/tests/unittest_utils.py +4 -0
  153. warp/types.py +339 -73
  154. warp/utils.py +22 -1
  155. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
  156. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/RECORD +159 -132
  157. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
  158. warp/fem/field/test.py +0 -180
  159. warp/fem/field/trial.py +0 -183
  160. warp/fem/space/collocated_function_space.py +0 -102
  161. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  162. warp/fem/space/trimesh_2d_function_space.py +0 -153
  163. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
  164. {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
warp/examples/tile/example_tile_convolution.py ADDED
@@ -0,0 +1,58 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ ###########################################################################
+ # Example Tile Convolution
+ #
+ # Shows how to write a simple convolution kernel using Warp FFT tile
+ # primitives.
+ #
+ ###########################################################################
+
+ import numpy as np
+
+ import warp as wp
+
+ wp.set_module_options({"enable_backward": False})
+
+ BLOCK_DIM = 64
+ TILE_M = 1
+ TILE_N = 128
+
+ scale = wp.vec2d(wp.float64(1 / TILE_N), wp.float64(1 / TILE_N))
+
+
+ @wp.func
+ def filter(x: wp.vec2d):
+     return wp.cw_mul(x, scale)
+
+
+ @wp.kernel
+ def conv_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d)):
+     i, j, _ = wp.tid()
+     a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N)
+     wp.tile_fft(a)
+     b = wp.tile_map(filter, a)
+     wp.tile_ifft(b)
+     wp.tile_store(y, i, j, b)
+
+
+ if __name__ == "__main__":
+     wp.set_device("cuda:0")
+
+     rng = np.random.default_rng(42)
+
+     x_h = rng.standard_normal((TILE_M, TILE_N, 2), dtype=np.float64)
+     y_h = np.zeros_like(x_h)
+
+     x_wp = wp.array2d(x_h, dtype=wp.vec2d)
+     y_wp = wp.array2d(y_h, dtype=wp.vec2d)
+
+     wp.launch_tiled(conv_tiled, dim=[1, 1], inputs=[x_wp], outputs=[y_wp], block_dim=BLOCK_DIM)
+
+     # Since filter is 1/N, conv_tiled is a ~no-op
+     assert np.allclose(x_h, y_wp.numpy())
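
The closing assertion relies on the transform pair being unnormalized, so that a uniform 1/N filter applied between the forward and inverse FFT recovers the input. A minimal NumPy sketch of that identity (the unnormalized convention is an assumption read off the example's own comment, not documented here):

# Round-trip identity assumed by the example: FFT, scale by 1/N, unnormalized IFFT.
import numpy as np

N = 128
x = np.random.default_rng(42).standard_normal(N) + 0j

X = np.fft.fft(x)        # unnormalized forward transform
Y = X * (1.0 / N)        # the example's "filter": uniform 1/N scaling
y = np.fft.ifft(Y) * N   # unnormalized inverse (NumPy's ifft already divides by N)

assert np.allclose(x, y)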
warp/examples/tile/example_tile_fft.py ADDED
@@ -0,0 +1,47 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ ###########################################################################
+ # Example Tile FFT
+ #
+ # Shows how to write a simple FFT kernel using Warp tile primitives.
+ #
+ ###########################################################################
+
+ import numpy as np
+
+ import warp as wp
+
+ wp.set_module_options({"enable_backward": False})
+
+ BLOCK_DIM = 8
+ TILE_M = 1
+ TILE_N = 32
+
+
+ @wp.kernel
+ def fft_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d)):
+     i, j, _ = wp.tid()
+     a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N)
+     wp.tile_fft(a)
+     wp.tile_ifft(a)
+     wp.tile_store(y, i, j, a)
+
+
+ if __name__ == "__main__":
+     wp.set_device("cuda:0")
+
+     x_h = np.ones((TILE_M, TILE_N, 2), dtype=np.float64)
+     x_h[:, :, 1] = 0
+     y_h = 3 * np.ones((TILE_M, TILE_N, 2), dtype=np.float64)
+     x_wp = wp.array2d(x_h, dtype=wp.vec2d)
+     y_wp = wp.array2d(y_h, dtype=wp.vec2d)
+
+     wp.launch_tiled(fft_tiled, dim=[1, 1], inputs=[x_wp], outputs=[y_wp], block_dim=BLOCK_DIM)
+
+     print("Inputs:\n", x_wp) # [1+0i, 1+0i, 1+0i, ...]
+     print("Output:\n", y_wp) # [32+0i, 0, 0, ...]
warp/examples/tile/example_tile_filtering.py ADDED
@@ -0,0 +1,105 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ ###########################################################################
+ # Example Tile Filtering
+ #
+ # Shows how to write a simple filtering kernel using Warp FFT tile
+ # primitives.
+ #
+ ###########################################################################
+
+ import numpy as np
+
+ import warp as wp
+
+ wp.set_module_options({"enable_backward": False})
+
+ BLOCK_DIM = 128
+ TILE_M = 1
+ TILE_N = 512
+
+ scale = wp.vec2d(wp.float64(1 / TILE_N), wp.float64(1 / TILE_N))
+
+
+ def cplx(array):
+     return array[..., 0] + 1j * array[..., 1]
+
+
+ @wp.func
+ def cplx_prod(x: wp.vec2d, y: wp.vec2d):
+     return wp.cw_mul(wp.vec2d(x[0] * y[0] - x[1] * y[1], x[0] * y[1] + x[1] * y[0]), scale)
+
+
+ @wp.kernel
+ def conv_tiled(x: wp.array2d(dtype=wp.vec2d), y: wp.array2d(dtype=wp.vec2d), z: wp.array2d(dtype=wp.vec2d)):
+     i, j, _ = wp.tid()
+     a = wp.tile_load(x, i, j, m=TILE_M, n=TILE_N)
+     b = wp.tile_load(y, i, j, m=TILE_M, n=TILE_N)
+     wp.tile_fft(a)
+     c = wp.tile_map(cplx_prod, a, b)
+     wp.tile_ifft(c)
+     wp.tile_store(z, i, j, c)
+
+
+ if __name__ == "__main__":
+     rng = np.random.default_rng(42)
+
+     # Create noisy input signal
+     t = np.linspace(0, 2 * np.pi, TILE_N, dtype=np.float64)
+     x = np.sin(t) + 0.5 * rng.random(TILE_N, dtype=np.float64)
+
+     # Create filter. This filter keeps only ~10% of the frequencies at the center
+     # of the spectrum.
+     f = np.ones_like(x)
+     freq = np.fft.fftfreq(TILE_N)
+     f[np.abs(freq) > 0.05] = 0.0
+     f[np.abs(freq) <= 0.05] = 1.0
+
+     # Create Warp input data
+     # We use vec2d to hold complex numbers
+     x_h = np.zeros((TILE_M, TILE_N, 2), dtype=np.float64)
+     f_h = np.zeros_like(x_h)
+     y_h = np.zeros_like(f_h)
+
+     x_h[:, :, 0] = x
+     f_h[:, :, 0] = f
+
+     x_wp = wp.array2d(x_h, dtype=wp.vec2d)
+     f_wp = wp.array2d(f_h, dtype=wp.vec2d)
+     y_wp = wp.array2d(y_h, dtype=wp.vec2d)
+
+     wp.launch_tiled(conv_tiled, dim=[1, 1], inputs=[x_wp, f_wp], outputs=[y_wp], block_dim=BLOCK_DIM)
+
+     # Extract output and compare with numpy
+     x_np = cplx(x_h)
+     f_np = cplx(f_h)
+     y_test = cplx(y_wp.numpy())
+     y_ref = np.fft.ifft(f_np * np.fft.fft(x_np))
+     assert np.allclose(y_ref, y_test)
+
+     try:
+         import matplotlib.pyplot as plt
+
+         fig, ax = plt.subplots(figsize=(10, 5))
+
+         ax.plot(
+             x,
+             color="#DDDDDD",
+             linewidth=2,
+             label="Original",
+         )
+         ax.plot(y_test[0, :].real, color="#76B900", linewidth=3, label="Smoothed")
+
+         ax.legend()
+         ax.grid(True)
+
+         plt.tight_layout()
+         plt.show()
+
+     except ModuleNotFoundError:
+         print("Matplotlib not available; skipping figure")
warp/examples/tile/example_tile_matmul.py ADDED
@@ -0,0 +1,79 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ ###########################################################################
+ # Example Tile MatMul
+ #
+ # Shows how to write a simple GEMM kernel using Warp tile primitives.
+ #
+ ###########################################################################
+
+ import numpy as np
+
+ import warp as wp
+
+ # tile size
+ TILE_M = wp.constant(8)
+ TILE_N = wp.constant(4)
+ TILE_K = wp.constant(8)
+
+ # num threads per-tile
+ TILE_THREADS = 64
+
+
+ @wp.kernel
+ def tile_gemm(A: wp.array2d(dtype=wp.float32), B: wp.array2d(dtype=wp.float16), C: wp.array2d(dtype=wp.float64)):
+     # output tile index
+     i, j = wp.tid()
+
+     sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float64)
+
+     _M = A.shape[0]
+     _N = B.shape[1]
+     K = A.shape[1]
+
+     count = int(K / TILE_K)
+
+     for k in range(0, count):
+         a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
+         b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
+
+         # sum += a*b
+         wp.tile_matmul(a, b, sum)
+
+     wp.tile_store(C, i, j, sum)
+
+
+ if __name__ == "__main__":
+     wp.set_device("cuda:0")
+
+     # generate some tile aligned matrix dimensions
+     M = TILE_M * 7
+     K = TILE_K * 6
+     N = TILE_N * 5
+
+     rng = np.random.default_rng(42)
+     A = rng.random((M, K), dtype=np.float32)
+     B = rng.random((K, N), dtype=np.float32).astype(np.float16)
+     C = np.zeros((M, N), dtype=np.float64)
+
+     A_wp = wp.array(A, requires_grad=True)
+     B_wp = wp.array(B, requires_grad=True)
+     C_wp = wp.array(C, requires_grad=True)
+
+     with wp.Tape() as tape:
+         wp.launch_tiled(
+             tile_gemm,
+             dim=(int(M / TILE_M), int(N / TILE_N)),
+             inputs=[A_wp, B_wp],
+             outputs=[C_wp],
+             block_dim=TILE_THREADS,
+         )
+
+     assert np.allclose(C_wp.numpy(), A @ B)
+
+     print("Example matrix multiplication passed")
warp/examples/tile/example_tile_mlp.py ADDED
@@ -0,0 +1,375 @@
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ ###########################################################################
+ # Example Image Multilayer Perceptron (MLP)
+ #
+ # Shows how to train a coordinate-based MLP on an image to predict the RGB
+ # color at a given input position. By default, a positional encoding is
+ # applied to the input coordinates to improve the ability of the MLP to
+ # represent higher-frequency content. This can be disabled by passing the
+ # '--no_encoding' option.
+ #
+ # References:
+ # Ben Mildenhall et al. 2021. NeRF: representing scenes
+ # as neural radiance fields for view synthesis. Commun. ACM 65, 1
+ # (January 2022), 99–106. https://doi.org/10.1145/3503250
+ #
+ ###########################################################################
+
+ import math
+ import os
+
+ import numpy as np
+ from PIL import Image
+
+ import warp as wp
+ import warp.examples
+ import warp.optim
+
+ rng = np.random.default_rng(45)
+
+
+ def create_layer(dim_in, dim_hid, dtype=float):
+     w = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in))
+     b = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, 1))
+
+     weights = wp.array(w, dtype=dtype, requires_grad=True)
+     bias = wp.array(b, dtype=dtype, requires_grad=True)
+
+     return (weights, bias)
+
+
+ def create_array(dim_in, dim_hid, dtype=float):
+     s = rng.uniform(-1.0 / np.sqrt(dim_in), 1.0 / np.sqrt(dim_in), (dim_hid, dim_in))
+     a = wp.array(s, dtype=dtype, requires_grad=True)
+
+     return a
+
+
+ # number of frequencies for the positional encoding
+ NUM_FREQ = wp.constant(8)
+
+ DIM_IN = wp.constant(4 * NUM_FREQ)  # sin,cos for both x,y at each frequency
+ DIM_HID = 32
+ DIM_OUT = 3
+
+ # threads per-block
+ NUM_THREADS = 32
+
+ IMG_WIDTH = 512
+ IMG_HEIGHT = 512
+
+ BATCH_SIZE = min(1024, int((IMG_WIDTH * IMG_HEIGHT) / 8))
+
+ # dtype for our weights and bias matrices
+ dtype = wp.float16
+
+
+ @wp.func
+ def relu(x: dtype):
+     return wp.max(x, dtype(0.0))
+
+
+ @wp.kernel
+ def compute(
+     indices: wp.array(dtype=int),
+     weights_0: wp.array2d(dtype=dtype),
+     bias_0: wp.array2d(dtype=dtype),
+     weights_1: wp.array2d(dtype=dtype),
+     bias_1: wp.array2d(dtype=dtype),
+     weights_2: wp.array2d(dtype=dtype),
+     bias_2: wp.array2d(dtype=dtype),
+     weights_3: wp.array2d(dtype=dtype),
+     bias_3: wp.array2d(dtype=dtype),
+     reference: wp.array2d(dtype=float),
+     loss: wp.array1d(dtype=float),
+     out: wp.array2d(dtype=float),
+ ):
+     # batch indices
+     linear = indices[wp.tid()]
+
+     row = linear / IMG_WIDTH
+     col = linear % IMG_WIDTH
+
+     # normalize input coordinates to [-1, 1]
+     x = (float(row) / float(IMG_WIDTH) - 0.5) * 2.0
+     y = (float(col) / float(IMG_HEIGHT) - 0.5) * 2.0
+
+     local = wp.vector(dtype=dtype, length=DIM_IN)
+
+     # construct positional encoding
+     for s in range(NUM_FREQ):
+         scale = wp.pow(2.0, float(s)) * wp.pi
+
+         # x-coord
+         local[s * 4 + 0] = dtype(wp.sin(x * scale))
+         local[s * 4 + 1] = dtype(wp.cos(x * scale))
+         # y-coord
+         local[s * 4 + 2] = dtype(wp.sin(y * scale))
+         local[s * 4 + 3] = dtype(wp.cos(y * scale))
+
+     # tile feature vectors across the block, returns [dim(f), NUM_THREADS]
+     f = wp.tile(local)
+
+     # input layer
+     w0 = wp.tile_load(weights_0, 0, 0, m=DIM_HID, n=DIM_IN)
+     b0 = wp.tile_load(bias_0, 0, 0, m=DIM_HID, n=1)
+     z = wp.tile_map(relu, wp.tile_matmul(w0, f) + wp.tile_broadcast(b0, m=DIM_HID, n=NUM_THREADS))
+
+     # hidden layer
+     w1 = wp.tile_load(weights_1, 0, 0, m=DIM_HID, n=DIM_HID)
+     b1 = wp.tile_load(bias_1, 0, 0, m=DIM_HID, n=1)
+     z = wp.tile_map(relu, wp.tile_matmul(w1, z) + wp.tile_broadcast(b1, m=DIM_HID, n=NUM_THREADS))
+
+     w2 = wp.tile_load(weights_2, 0, 0, m=DIM_HID, n=DIM_HID)
+     b2 = wp.tile_load(bias_2, 0, 0, m=DIM_HID, n=1)
+     z = wp.tile_map(relu, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, m=DIM_HID, n=NUM_THREADS))
+
+     # output layer
+     w3 = wp.tile_load(weights_3, 0, 0, m=DIM_OUT, n=DIM_HID)
+     b3 = wp.tile_load(bias_3, 0, 0, m=DIM_OUT, n=1)
+     o = wp.tile_map(relu, wp.tile_matmul(w3, z) + wp.tile_broadcast(b3, m=DIM_OUT, n=NUM_THREADS))
+
+     # untile back to SIMT
+     output = wp.untile(o)
+
+     # compute error
+     error = wp.vec3(
+         float(output[0]) - reference[0, linear],
+         float(output[1]) - reference[1, linear],
+         float(output[2]) - reference[2, linear],
+     )
+
+     # write MSE loss
+     if loss:
+         wp.atomic_add(loss, 0, wp.length_sq(error) / float(3 * BATCH_SIZE))
+
+     # write image output
+     if out:
+         for i in range(DIM_OUT):
+             out[i, linear] = float(output[i])
+
+
+ class Example:
+     def __init__(self, train_iters):
+         self.weights_0, self.bias_0 = create_layer(DIM_IN, DIM_HID, dtype=dtype)
+         self.weights_1, self.bias_1 = create_layer(DIM_HID, DIM_HID, dtype=dtype)
+         self.weights_2, self.bias_2 = create_layer(DIM_HID, DIM_HID, dtype=dtype)
+         self.weights_3, self.bias_3 = create_layer(DIM_HID, DIM_OUT, dtype=dtype)
+
+         # reference
+         reference_path = os.path.join(wp.examples.get_asset_directory(), "pixel.jpg")
+         with Image.open(reference_path) as im:
+             reference_image = np.asarray(im.resize((IMG_WIDTH, IMG_HEIGHT)).convert("RGB")) / 255.0
+         self.reference = wp.array(reference_image.reshape(IMG_WIDTH * IMG_HEIGHT, 3).T, dtype=float)
+
+         # create randomized batch indices
+         indices = np.arange(0, IMG_WIDTH * IMG_HEIGHT, dtype=np.int32)
+         rng.shuffle(indices)
+         self.indices = wp.array(indices)
+
+         self.num_batches = int((IMG_WIDTH * IMG_HEIGHT) / BATCH_SIZE)
+         self.max_iters = train_iters
+         self.max_epochs = max(1, int(self.max_iters / self.num_batches))
+
+     def train_warp(self):
+         params = [
+             self.weights_0,
+             self.bias_0,
+             self.weights_1,
+             self.bias_1,
+             self.weights_2,
+             self.bias_2,
+             self.weights_3,
+             self.bias_3,
+         ]
+
+         optimizer_grads = [p.grad.flatten() for p in params]
+         optimizer_inputs = [p.flatten() for p in params]
+         optimizer = warp.optim.Adam(optimizer_inputs, lr=0.01)
+
+         loss = wp.zeros(1, dtype=float, requires_grad=True)
+         output = create_array(IMG_WIDTH * IMG_HEIGHT, DIM_OUT)
+
+         # capture graph for whole epoch
+         wp.capture_begin()
+
+         for b in range(0, IMG_WIDTH * IMG_HEIGHT, BATCH_SIZE):
+             loss.zero_()
+
+             with wp.Tape() as tape:
+                 wp.launch(
+                     compute,
+                     dim=[BATCH_SIZE],
+                     inputs=[
+                         self.indices[b : b + BATCH_SIZE],
+                         self.weights_0,
+                         self.bias_0,
+                         self.weights_1,
+                         self.bias_1,
+                         self.weights_2,
+                         self.bias_2,
+                         self.weights_3,
+                         self.bias_3,
+                         self.reference,
+                         loss,
+                         None,
+                     ],
+                     block_dim=NUM_THREADS,
+                 )
+
+             tape.backward(loss)
+             optimizer.step(optimizer_grads)
+             tape.zero()
+
+         graph = wp.capture_end()
+
+         with wp.ScopedTimer("Training"):
+             for i in range(self.max_epochs):
+                 with wp.ScopedTimer("Epoch"):
+                     wp.capture_launch(graph)
+                     print(f"Epoch: {i} Loss: {loss.numpy()}")
+
+         # evaluate full image
+         wp.launch(
+             compute,
+             dim=[IMG_WIDTH * IMG_HEIGHT],
+             inputs=[
+                 self.indices,
+                 self.weights_0,
+                 self.bias_0,
+                 self.weights_1,
+                 self.bias_1,
+                 self.weights_2,
+                 self.bias_2,
+                 self.weights_3,
+                 self.bias_3,
+                 self.reference,
+                 loss,
+                 output,
+             ],
+             block_dim=NUM_THREADS,
+         )
+
+         self.save_image("example_tile_mlp.jpg", output.numpy())
+
+     def train_torch(self):
+         import torch as tc
+
+         weights_0 = tc.nn.Parameter(wp.to_torch(self.weights_0))
+         weights_1 = tc.nn.Parameter(wp.to_torch(self.weights_1))
+         weights_2 = tc.nn.Parameter(wp.to_torch(self.weights_2))
+         weights_3 = tc.nn.Parameter(wp.to_torch(self.weights_3))
+
+         bias_0 = tc.nn.Parameter(wp.to_torch(self.bias_0))
+         bias_1 = tc.nn.Parameter(wp.to_torch(self.bias_1))
+         bias_2 = tc.nn.Parameter(wp.to_torch(self.bias_2))
+         bias_3 = tc.nn.Parameter(wp.to_torch(self.bias_3))
+
+         indices = wp.to_torch(self.indices)
+         reference = wp.to_torch(self.reference)
+
+         optimizer = tc.optim.Adam(
+             [weights_0, bias_0, weights_1, bias_1, weights_2, bias_2, weights_3, bias_3],
+             capturable=True,
+             lr=0.0001,
+             betas=(0.9, 0.95),
+             eps=1.0e-6,
+         )
+
+         # generate frequency space encoding of pixels
+         # based on their linear index in the image
+         def encode(linear):
+             row = (linear // IMG_WIDTH).float()
+             col = (linear % IMG_WIDTH).float()
+
+             x = (row / float(IMG_WIDTH) - 0.5) * 2.0
+             y = (col / float(IMG_HEIGHT) - 0.5) * 2.0
+
+             encoding = tc.zeros((NUM_FREQ * 4, len(linear)), dtype=tc.float16, device="cuda")
+
+             for s in range(NUM_FREQ):
+                 scale = math.pow(2.0, float(s)) * math.pi
+
+                 # Directly write the computed values into the encoding tensor
+                 encoding[s * 4 + 0, :] = tc.sin(scale * x)
+                 encoding[s * 4 + 1, :] = tc.cos(scale * x)
+                 encoding[s * 4 + 2, :] = tc.sin(scale * y)
+                 encoding[s * 4 + 3, :] = tc.cos(scale * y)
+
+             return encoding
+
+         stream = tc.cuda.Stream()
+         graph = tc.cuda.CUDAGraph()
+
+         # warm-up
+         with tc.cuda.stream(stream):
+             f = tc.rand((NUM_FREQ * 4, BATCH_SIZE), dtype=tc.float16, device="cuda")
+             z = tc.relu(weights_0 @ f + bias_0)
+             z = tc.relu(weights_1 @ z + bias_1)
+             z = tc.relu(weights_2 @ z + bias_2)
+             z = tc.relu(weights_3 @ z + bias_3)
+             ref = tc.rand((3, BATCH_SIZE), dtype=tc.float16, device="cuda")
+             loss = tc.mean((z - ref) ** 2)
+             optimizer.zero_grad()
+             loss.backward()
+             optimizer.step()
+
+         with tc.cuda.graph(graph):
+             for b in range(0, IMG_WIDTH * IMG_HEIGHT, BATCH_SIZE):
+                 linear = indices[b : b + BATCH_SIZE]
+
+                 f = encode(linear)
+
+                 z = tc.relu(weights_0 @ f + bias_0)
+                 z = tc.relu(weights_1 @ z + bias_1)
+                 z = tc.relu(weights_2 @ z + bias_2)
+                 z = tc.relu(weights_3 @ z + bias_3)
+
+                 ref = reference[:, linear]
+                 loss = tc.mean((z - ref) ** 2)
+
+                 optimizer.zero_grad()
+                 loss.backward()
+                 optimizer.step()
+
+         with wp.ScopedTimer("Training (Torch)"):
+             for _i in range(self.max_epochs):
+                 with wp.ScopedTimer("Epoch"):
+                     graph.replay()
+
+                     print(loss)
+
+         f = encode(tc.arange(0, IMG_WIDTH * IMG_HEIGHT))
+         z = tc.relu(weights_0 @ f + bias_0)
+         z = tc.relu(weights_1 @ z + bias_1)
+         z = tc.relu(weights_2 @ z + bias_2)
+         z = tc.relu(weights_3 @ z + bias_3)
+
+         self.save_image("example_tile_mlp_torch.jpg", z.detach().cpu().numpy())
+
+     def save_image(self, name, output):
+         predicted_image = output.T.reshape(IMG_WIDTH, IMG_HEIGHT, 3)
+         predicted_image = (predicted_image * 255).astype(np.uint8)
+
+         predicted_image_pil = Image.fromarray(predicted_image)
+         predicted_image_pil.save(name)
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+     parser.add_argument("--train_iters", type=int, default=20000, help="Total number of training iterations.")
+
+     args = parser.parse_known_args()[0]
+
+     with wp.ScopedDevice("cuda:0"):
+         example = Example(args.train_iters)
+         example.train_warp()
+         # example.train_torch()
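
The positional encoding built inside the compute kernel maps each pixel's normalized (x, y) coordinates to sin/cos features at NUM_FREQ octave frequencies, giving DIM_IN = 4 * NUM_FREQ inputs per pixel. A NumPy sketch of the same encoding for a single pixel index (mirroring the kernel's constants, not part of the package):

# Positional encoding for one linear pixel index: 4 features per frequency octave.
import numpy as np

NUM_FREQ, IMG_WIDTH, IMG_HEIGHT = 8, 512, 512

def encode(linear):
    row, col = linear // IMG_WIDTH, linear % IMG_WIDTH
    x = (row / IMG_WIDTH - 0.5) * 2.0   # normalize to [-1, 1]
    y = (col / IMG_HEIGHT - 0.5) * 2.0
    feats = np.empty(4 * NUM_FREQ)
    for s in range(NUM_FREQ):
        scale = (2.0 ** s) * np.pi
        feats[s * 4 + 0] = np.sin(x * scale)
        feats[s * 4 + 1] = np.cos(x * scale)
        feats[s * 4 + 2] = np.sin(y * scale)
        feats[s * 4 + 3] = np.cos(y * scale)
    return feats

print(encode(12345).shape)  # (32,) -> matches DIM_IN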
warp/fem/__init__.py CHANGED
@@ -24,14 +24,17 @@ from .geometry import (
      LinearGeometryPartition,
      Nanogrid,
      Quadmesh2D,
+     Quadmesh3D,
      Tetmesh,
      Trimesh2D,
+     Trimesh3D,
  )
  from .integrate import integrate, interpolate
  from .operator import (
      D,
      at_node,
      average,
+     cells,
      curl,
      deformation_gradient,
      degree,
@@ -50,6 +53,9 @@ from .operator import (
      normal,
      outer,
      position,
+     to_cell_side,
+     to_inner_cell,
+     to_outer_cell,
  )
  from .polynomial import Polynomial
  from .quadrature import ExplicitQuadrature, NodalQuadrature, PicQuadrature, Quadrature, RegularQuadrature
@@ -65,6 +71,8 @@ from .space import (
      SpaceTopology,
      SymmetricTensorMapper,
      make_collocated_function_space,
+     make_contravariant_function_space,
+     make_covariant_function_space,
      make_polynomial_basis_space,
      make_polynomial_space,
      make_space_partition,
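
For a quick smoke test of this change, the newly exported names (taken verbatim from the hunks above) can be checked for presence on the warp.fem module; this sketch only verifies that the re-exports resolve, not their behavior:

# Check that the symbols added to warp/fem/__init__.py in 1.5.0 are importable.
import warp.fem as fem

new_exports = [
    "Quadmesh3D",
    "Trimesh3D",
    "cells",
    "to_cell_side",
    "to_inner_cell",
    "to_outer_cell",
    "make_contravariant_function_space",
    "make_covariant_function_space",
]

for name in new_exports:
    assert hasattr(fem, name), name
print("all new warp.fem exports resolved")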