warp-lang 1.4.2__py3-none-macosx_10_13_universal2.whl → 1.5.1__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang might be problematic. More details are available via the link on the original diff page.

Files changed (165):
  1. warp/__init__.py +4 -0
  2. warp/autograd.py +43 -8
  3. warp/bin/libwarp.dylib +0 -0
  4. warp/build.py +21 -2
  5. warp/build_dll.py +23 -6
  6. warp/builtins.py +1819 -7
  7. warp/codegen.py +197 -61
  8. warp/config.py +2 -2
  9. warp/context.py +379 -107
  10. warp/examples/assets/pixel.jpg +0 -0
  11. warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
  12. warp/examples/benchmarks/benchmark_gemm.py +121 -0
  13. warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
  14. warp/examples/benchmarks/benchmark_tile.py +179 -0
  15. warp/examples/fem/example_adaptive_grid.py +37 -10
  16. warp/examples/fem/example_apic_fluid.py +3 -2
  17. warp/examples/fem/example_convection_diffusion_dg.py +4 -5
  18. warp/examples/fem/example_deformed_geometry.py +1 -1
  19. warp/examples/fem/example_diffusion_3d.py +47 -4
  20. warp/examples/fem/example_distortion_energy.py +220 -0
  21. warp/examples/fem/example_magnetostatics.py +127 -85
  22. warp/examples/fem/example_nonconforming_contact.py +5 -5
  23. warp/examples/fem/example_stokes.py +3 -1
  24. warp/examples/fem/example_streamlines.py +12 -19
  25. warp/examples/fem/utils.py +38 -15
  26. warp/examples/sim/example_cloth.py +4 -25
  27. warp/examples/sim/example_quadruped.py +2 -1
  28. warp/examples/tile/example_tile_convolution.py +58 -0
  29. warp/examples/tile/example_tile_fft.py +47 -0
  30. warp/examples/tile/example_tile_filtering.py +105 -0
  31. warp/examples/tile/example_tile_matmul.py +79 -0
  32. warp/examples/tile/example_tile_mlp.py +375 -0
  33. warp/fem/__init__.py +8 -0
  34. warp/fem/cache.py +16 -12
  35. warp/fem/dirichlet.py +1 -1
  36. warp/fem/domain.py +44 -1
  37. warp/fem/field/__init__.py +1 -2
  38. warp/fem/field/field.py +31 -19
  39. warp/fem/field/nodal_field.py +101 -49
  40. warp/fem/field/virtual.py +794 -0
  41. warp/fem/geometry/__init__.py +2 -2
  42. warp/fem/geometry/deformed_geometry.py +3 -105
  43. warp/fem/geometry/element.py +13 -0
  44. warp/fem/geometry/geometry.py +165 -7
  45. warp/fem/geometry/grid_2d.py +3 -6
  46. warp/fem/geometry/grid_3d.py +31 -28
  47. warp/fem/geometry/hexmesh.py +3 -46
  48. warp/fem/geometry/nanogrid.py +3 -2
  49. warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
  50. warp/fem/geometry/tetmesh.py +2 -43
  51. warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
  52. warp/fem/integrate.py +683 -261
  53. warp/fem/linalg.py +404 -0
  54. warp/fem/operator.py +101 -18
  55. warp/fem/polynomial.py +5 -5
  56. warp/fem/quadrature/quadrature.py +45 -21
  57. warp/fem/space/__init__.py +45 -11
  58. warp/fem/space/basis_function_space.py +451 -0
  59. warp/fem/space/basis_space.py +58 -11
  60. warp/fem/space/function_space.py +146 -5
  61. warp/fem/space/grid_2d_function_space.py +80 -66
  62. warp/fem/space/grid_3d_function_space.py +113 -68
  63. warp/fem/space/hexmesh_function_space.py +96 -108
  64. warp/fem/space/nanogrid_function_space.py +62 -110
  65. warp/fem/space/quadmesh_function_space.py +208 -0
  66. warp/fem/space/shape/__init__.py +45 -7
  67. warp/fem/space/shape/cube_shape_function.py +328 -54
  68. warp/fem/space/shape/shape_function.py +10 -1
  69. warp/fem/space/shape/square_shape_function.py +328 -60
  70. warp/fem/space/shape/tet_shape_function.py +269 -19
  71. warp/fem/space/shape/triangle_shape_function.py +238 -19
  72. warp/fem/space/tetmesh_function_space.py +69 -37
  73. warp/fem/space/topology.py +38 -0
  74. warp/fem/space/trimesh_function_space.py +179 -0
  75. warp/fem/utils.py +6 -331
  76. warp/jax_experimental.py +3 -1
  77. warp/native/array.h +15 -0
  78. warp/native/builtin.h +66 -26
  79. warp/native/bvh.h +4 -0
  80. warp/native/coloring.cpp +604 -0
  81. warp/native/cuda_util.cpp +68 -51
  82. warp/native/cuda_util.h +2 -1
  83. warp/native/fabric.h +8 -0
  84. warp/native/hashgrid.h +4 -0
  85. warp/native/marching.cu +8 -0
  86. warp/native/mat.h +14 -3
  87. warp/native/mathdx.cpp +59 -0
  88. warp/native/mesh.h +4 -0
  89. warp/native/range.h +13 -1
  90. warp/native/reduce.cpp +9 -1
  91. warp/native/reduce.cu +7 -0
  92. warp/native/runlength_encode.cpp +9 -1
  93. warp/native/runlength_encode.cu +7 -1
  94. warp/native/scan.cpp +8 -0
  95. warp/native/scan.cu +8 -0
  96. warp/native/scan.h +8 -1
  97. warp/native/sparse.cpp +8 -0
  98. warp/native/sparse.cu +8 -0
  99. warp/native/temp_buffer.h +7 -0
  100. warp/native/tile.h +1854 -0
  101. warp/native/tile_gemm.h +341 -0
  102. warp/native/tile_reduce.h +210 -0
  103. warp/native/volume_builder.cu +8 -0
  104. warp/native/volume_builder.h +8 -0
  105. warp/native/warp.cpp +10 -2
  106. warp/native/warp.cu +369 -15
  107. warp/native/warp.h +12 -2
  108. warp/optim/adam.py +39 -4
  109. warp/paddle.py +29 -12
  110. warp/render/render_opengl.py +140 -67
  111. warp/sim/graph_coloring.py +292 -0
  112. warp/sim/import_urdf.py +8 -8
  113. warp/sim/integrator_euler.py +4 -2
  114. warp/sim/integrator_featherstone.py +115 -44
  115. warp/sim/integrator_vbd.py +6 -0
  116. warp/sim/model.py +109 -32
  117. warp/sparse.py +1 -1
  118. warp/stubs.py +569 -4
  119. warp/tape.py +12 -7
  120. warp/tests/assets/pixel.npy +0 -0
  121. warp/tests/aux_test_instancing_gc.py +18 -0
  122. warp/tests/test_array.py +39 -0
  123. warp/tests/test_codegen.py +81 -1
  124. warp/tests/test_codegen_instancing.py +30 -0
  125. warp/tests/test_collision.py +110 -0
  126. warp/tests/test_coloring.py +251 -0
  127. warp/tests/test_context.py +34 -0
  128. warp/tests/test_examples.py +21 -5
  129. warp/tests/test_fem.py +453 -113
  130. warp/tests/test_func.py +34 -4
  131. warp/tests/test_generics.py +52 -0
  132. warp/tests/test_iter.py +68 -0
  133. warp/tests/test_lerp.py +13 -87
  134. warp/tests/test_mat_scalar_ops.py +1 -1
  135. warp/tests/test_matmul.py +6 -9
  136. warp/tests/test_matmul_lite.py +6 -11
  137. warp/tests/test_mesh_query_point.py +1 -1
  138. warp/tests/test_module_hashing.py +23 -0
  139. warp/tests/test_overwrite.py +45 -0
  140. warp/tests/test_paddle.py +27 -87
  141. warp/tests/test_print.py +56 -1
  142. warp/tests/test_smoothstep.py +17 -83
  143. warp/tests/test_spatial.py +1 -1
  144. warp/tests/test_static.py +3 -3
  145. warp/tests/test_tile.py +744 -0
  146. warp/tests/test_tile_mathdx.py +144 -0
  147. warp/tests/test_tile_mlp.py +383 -0
  148. warp/tests/test_tile_reduce.py +374 -0
  149. warp/tests/test_tile_shared_memory.py +190 -0
  150. warp/tests/test_vbd.py +12 -20
  151. warp/tests/test_volume.py +43 -0
  152. warp/tests/unittest_suites.py +19 -2
  153. warp/tests/unittest_utils.py +4 -2
  154. warp/types.py +340 -74
  155. warp/utils.py +23 -3
  156. {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/METADATA +32 -7
  157. {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/RECORD +160 -133
  158. {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/WHEEL +1 -1
  159. warp/fem/field/test.py +0 -180
  160. warp/fem/field/trial.py +0 -183
  161. warp/fem/space/collocated_function_space.py +0 -102
  162. warp/fem/space/quadmesh_2d_function_space.py +0 -261
  163. warp/fem/space/trimesh_2d_function_space.py +0 -153
  164. {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/LICENSE.md +0 -0
  165. {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/top_level.txt +0 -0
Binary file (no textual diff shown)
@@ -0,0 +1,86 @@
1
+ # Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
2
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ # and proprietary rights in and to this software, related documentation
4
+ # and any modifications thereto. Any use, reproduction, disclosure or
5
+ # distribution of this software and related documentation without an express
6
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+
8
+ import paddle
9
+
10
+
11
+ def eval_springs(x, v, indices, rest, ke, kd, f):
12
+ i = indices[:, 0]
13
+ j = indices[:, 1]
14
+
15
+ xi = x[i]
16
+ xj = x[j]
17
+
18
+ vi = v[i]
19
+ vj = v[j]
20
+
21
+ xij = xi - xj
22
+ vij = vi - vj
23
+
24
+ l = paddle.linalg.norm(xij, axis=1)
25
+ l_inv = 1.0 / l
26
+
27
+ # normalized spring direction
28
+ dir = (xij.T * l_inv).T
29
+
30
+ c = l - rest
31
+ dcdt = paddle.sum(dir * vij, axis=1)
32
+
33
+ # damping based on relative velocity.
34
+ fs = dir.T * (ke * c + kd * dcdt)
35
+
36
+ f.index_add_(axis=0, index=i, value=-fs.T)
37
+ f.index_add_(axis=0, index=j, value=fs.T)
38
+
39
+
40
+ def integrate_particles(x, v, f, g, w, dt):
41
+ s = w > 0.0
42
+
43
+ a_ext = g * s[:, None].astype(g.dtype)
44
+
45
+ # simple semi-implicit Euler. v1 = v0 + a dt, x1 = x0 + v1 dt
46
+ v += ((f.T * w).T + a_ext) * dt
47
+ x += v * dt
48
+
49
+ # clear forces
50
+ f *= 0.0
51
+
52
+
53
+ class TrIntegrator:
54
+ def __init__(self, cloth, device):
55
+ self.cloth = cloth
56
+
57
+ self.positions = paddle.to_tensor(self.cloth.positions, place=device)
58
+ self.velocities = paddle.to_tensor(self.cloth.velocities, place=device)
59
+ self.inv_mass = paddle.to_tensor(self.cloth.inv_masses, place=device)
60
+
61
+ self.spring_indices = paddle.to_tensor(self.cloth.spring_indices, dtype=paddle.int64, place=device)
62
+ self.spring_lengths = paddle.to_tensor(self.cloth.spring_lengths, place=device)
63
+ self.spring_stiffness = paddle.to_tensor(self.cloth.spring_stiffness, place=device)
64
+ self.spring_damping = paddle.to_tensor(self.cloth.spring_damping, place=device)
65
+
66
+ self.forces = paddle.zeros((self.cloth.num_particles, 3), dtype=paddle.float32).to(device=device)
67
+ self.gravity = paddle.to_tensor((0.0, 0.0 - 9.8, 0.0), dtype=paddle.float32, place=device)
68
+
69
+ def simulate(self, dt, substeps):
70
+ sim_dt = dt / substeps
71
+
72
+ for _s in range(substeps):
73
+ eval_springs(
74
+ self.positions,
75
+ self.velocities,
76
+ self.spring_indices.reshape((self.cloth.num_springs, 2)),
77
+ self.spring_lengths,
78
+ self.spring_stiffness,
79
+ self.spring_damping,
80
+ self.forces,
81
+ )
82
+
83
+ # integrate
84
+ integrate_particles(self.positions, self.velocities, self.forces, self.gravity, self.inv_mass, sim_dt)
85
+
86
+ return self.positions.cpu().numpy()
@@ -0,0 +1,121 @@
1
+ from itertools import product
2
+
3
+ import numpy as np
4
+ import torch as tc
5
+
6
+ import warp as wp
7
+
8
+ tc.backends.cuda.matmul.allow_tf32 = False # Disable TF32 for matrix multiplications
9
+ tc.backends.cudnn.allow_tf32 = False # Disable TF32 for cuDNN operations
10
+
11
+ wp.init()
12
+ wp.clear_kernel_cache()
13
+ wp.set_module_options({"fast_math": True, "enable_backward": False})
14
+
15
+
16
+ def create_mlp_kernel(m, n, k):
17
+ TILE_M = m
18
+ TILE_N = n
19
+ TILE_K = k
20
+
21
+ @wp.kernel
22
+ def mlp(x: wp.array2d(dtype=float), weights_wp: wp.array2d(dtype=float), n_k: int, output: wp.array2d(dtype=float)):
23
+ i_m, i_n = wp.tid()
24
+ sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
25
+ for count in range(n_k):
26
+ feat = wp.tile_load(x, i_m, count, TILE_M, TILE_K)
27
+ weight = wp.tile_load(weights_wp, count, i_n, TILE_K, TILE_N)
28
+ wp.tile_matmul(feat, weight, sum)
29
+
30
+ wp.tile_store(output, i_m, i_n, sum)
31
+
32
+ return mlp
33
+
34
+
35
+ def benchmark_torch(A, B, warm_up, iterations):
36
+ # warm-up
37
+ for _ in range(warm_up):
38
+ tc.matmul(A, B)
39
+
40
+ timers = {}
41
+ tc.cuda.synchronize()
42
+
43
+ with wp.ScopedTimer("torch", print=False, dict=timers, synchronize=True):
44
+ for _ in range(iterations):
45
+ tc.matmul(A, B)
46
+
47
+ tc.cuda.synchronize()
48
+
49
+ return timers["torch"][0]
50
+
51
+
52
+ def benchmark_warp(A, B, config, warm_up, iterations):
53
+ TILE_M = config[0]
54
+ TILE_N = config[1]
55
+ TILE_K = config[2]
56
+ BLOCK_DIM = config[3]
57
+
58
+ mlp = create_mlp_kernel(TILE_M, TILE_N, TILE_K)
59
+
60
+ M = A.shape[0]
61
+ N = B.shape[1]
62
+ K = A.shape[1]
63
+
64
+ output = wp.zeros((M, N), dtype=float)
65
+
66
+ # warm-up
67
+ for _ in range(warm_up):
68
+ wp.launch_tiled(
69
+ kernel=mlp, dim=[M // TILE_M, N // TILE_N], inputs=[A, B, K // TILE_K, output], block_dim=BLOCK_DIM
70
+ )
71
+
72
+ # check output
73
+ if warm_up > 0:
74
+ assert np.allclose(output.numpy(), A.numpy() @ B.numpy(), atol=1e-3, rtol=1e-3)
75
+
76
+ # benchmark
77
+ timers = {}
78
+ with wp.ScopedTimer("warp", print=False, dict=timers, synchronize=True):
79
+ for _ in range(iterations):
80
+ wp.launch_tiled(
81
+ kernel=mlp, dim=[M // TILE_M, N // TILE_N], inputs=[A, B, K // TILE_K, output], block_dim=BLOCK_DIM
82
+ )
83
+
84
+ return timers["warp"][0]
85
+
86
+
87
+ tile_m = [8, 16, 32, 64]
88
+ tile_n = [8, 16, 32, 64]
89
+ tile_k = [8, 16, 64]
90
+ block = [32, 64, 128]
91
+
92
+ M = 1024
93
+ N = 1024
94
+ K = 1024
95
+
96
+ A = tc.randn(M, K).cuda()
97
+ B = tc.randn(K, N).cuda()
98
+
99
+ iterations = 1000
100
+ warm_up = 10
101
+
102
+ time_torch = benchmark_torch(A, B, warm_up, iterations)
103
+ print(f"Torch: {time_torch}")
104
+
105
+ configs = list(product(tile_m, tile_n, tile_k, block))
106
+
107
+ wp.config.quiet = True
108
+
109
+ # header
110
+ print(
111
+ "{:<{}} {:<{}} {:<{}} {:<{}} {:<{}} {:<{}}".format(
112
+ "TILE_M", 12, "TILE_N", 12, "TILE_K", 12, "BLOCK", 12, "Time", 12, "Relative", 12
113
+ )
114
+ )
115
+ for c in configs:
116
+ time_warp = benchmark_warp(wp.from_torch(A), wp.from_torch(B), c, warm_up, iterations)
117
+ print(
118
+ "{:<{}} {:<{}} {:<{}} {:<{}} {:<{}} {:<{}}".format(
119
+ c[0], 12, c[1], 12, c[2], 12, c[3], 12, time_warp, 12, time_warp / time_torch, 12
120
+ )
121
+ )
@@ -0,0 +1,158 @@
1
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
2
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ # and proprietary rights in and to this software, related documentation
4
+ # and any modifications thereto. Any use, reproduction, disclosure or
5
+ # distribution of this software and related documentation without an express
6
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+
8
+ import time
9
+
10
+ import paddle
11
+
12
+ import warp as wp
13
+
14
+
15
+ def create_simple_kernel(dtype):
16
+ def simple_kernel(
17
+ a: wp.array(dtype=dtype),
18
+ b: wp.array(dtype=dtype),
19
+ c: wp.array(dtype=dtype),
20
+ d: wp.array(dtype=dtype),
21
+ e: wp.array(dtype=dtype),
22
+ ):
23
+ pass
24
+
25
+ return wp.Kernel(simple_kernel)
26
+
27
+
28
+ def test_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
29
+ warp_device = wp.get_device(device)
30
+ paddle_device = wp.device_to_paddle(warp_device)
31
+
32
+ if hasattr(warp_dtype, "_shape_"):
33
+ paddle_shape = (array_size, *warp_dtype._shape_)
34
+ paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
35
+ else:
36
+ paddle_shape = (array_size,)
37
+ paddle_dtype = paddle.float32 if warp_dtype is None else wp.dtype_to_paddle(warp_dtype)
38
+
39
+ _a = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
40
+ _b = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
41
+ _c = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
42
+ _d = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
43
+ _e = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
44
+
45
+ wp.synchronize()
46
+
47
+ # profiler = Profiler(interval=0.000001)
48
+ # profiler.start()
49
+
50
+ t1 = time.time_ns()
51
+
52
+ for _ in range(num_iters):
53
+ a = wp.from_paddle(_a, dtype=warp_dtype)
54
+ b = wp.from_paddle(_b, dtype=warp_dtype)
55
+ c = wp.from_paddle(_c, dtype=warp_dtype)
56
+ d = wp.from_paddle(_d, dtype=warp_dtype)
57
+ e = wp.from_paddle(_e, dtype=warp_dtype)
58
+ wp.launch(kernel, dim=array_size, inputs=[a, b, c, d, e])
59
+
60
+ t2 = time.time_ns()
61
+ print(f"{(t2 - t1) / 1_000_000 :8.0f} ms from_paddle(...)")
62
+
63
+ # profiler.stop()
64
+ # profiler.print()
65
+
66
+
67
+ def test_array_ctype_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
68
+ warp_device = wp.get_device(device)
69
+ paddle_device = wp.device_to_paddle(warp_device)
70
+
71
+ if hasattr(warp_dtype, "_shape_"):
72
+ paddle_shape = (array_size, *warp_dtype._shape_)
73
+ paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
74
+ else:
75
+ paddle_shape = (array_size,)
76
+ paddle_dtype = paddle.float32 if warp_dtype is None else wp.dtype_to_paddle(warp_dtype)
77
+
78
+ _a = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
79
+ _b = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
80
+ _c = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
81
+ _d = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
82
+ _e = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
83
+
84
+ wp.synchronize()
85
+
86
+ # profiler = Profiler(interval=0.000001)
87
+ # profiler.start()
88
+
89
+ t1 = time.time_ns()
90
+
91
+ for _ in range(num_iters):
92
+ a = wp.from_paddle(_a, dtype=warp_dtype, return_ctype=True)
93
+ b = wp.from_paddle(_b, dtype=warp_dtype, return_ctype=True)
94
+ c = wp.from_paddle(_c, dtype=warp_dtype, return_ctype=True)
95
+ d = wp.from_paddle(_d, dtype=warp_dtype, return_ctype=True)
96
+ e = wp.from_paddle(_e, dtype=warp_dtype, return_ctype=True)
97
+ wp.launch(kernel, dim=array_size, inputs=[a, b, c, d, e])
98
+
99
+ t2 = time.time_ns()
100
+ print(f"{(t2 - t1) / 1_000_000 :8.0f} ms from_paddle(..., return_ctype=True)")
101
+
102
+ # profiler.stop()
103
+ # profiler.print()
104
+
105
+
106
+ def test_direct_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
107
+ warp_device = wp.get_device(device)
108
+ paddle_device = wp.device_to_paddle(warp_device)
109
+
110
+ if hasattr(warp_dtype, "_shape_"):
111
+ paddle_shape = (array_size, *warp_dtype._shape_)
112
+ paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
113
+ else:
114
+ paddle_shape = (array_size,)
115
+ paddle_dtype = paddle.float32 if warp_dtype is None else wp.dtype_to_paddle(warp_dtype)
116
+
117
+ _a = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
118
+ _b = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
119
+ _c = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
120
+ _d = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
121
+ _e = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
122
+
123
+ wp.synchronize()
124
+
125
+ # profiler = Profiler(interval=0.000001)
126
+ # profiler.start()
127
+
128
+ t1 = time.time_ns()
129
+
130
+ for _ in range(num_iters):
131
+ wp.launch(kernel, dim=array_size, inputs=[_a, _b, _c, _d, _e])
132
+
133
+ t2 = time.time_ns()
134
+ print(f"{(t2 - t1) / 1_000_000 :8.0f} ms direct from paddle")
135
+
136
+ # profiler.stop()
137
+ # profiler.print()
138
+
139
+
140
+ wp.init()
141
+
142
+ params = [
143
+ # (warp_dtype arg, kernel)
144
+ (None, create_simple_kernel(wp.float32)),
145
+ (wp.float32, create_simple_kernel(wp.float32)),
146
+ (wp.vec3f, create_simple_kernel(wp.vec3f)),
147
+ (wp.mat22f, create_simple_kernel(wp.mat22f)),
148
+ ]
149
+
150
+ wp.load_module()
151
+
152
+ num_iters = 100000
153
+
154
+ for warp_dtype, kernel in params:
155
+ print(f"\ndtype={wp.context.type_str(warp_dtype)}")
156
+ test_from_paddle(kernel, num_iters, 10, "cuda:0", warp_dtype=warp_dtype)
157
+ test_array_ctype_from_paddle(kernel, num_iters, 10, "cuda:0", warp_dtype=warp_dtype)
158
+ test_direct_from_paddle(kernel, num_iters, 10, "cuda:0", warp_dtype=warp_dtype)
@@ -0,0 +1,179 @@
1
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
2
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ # and proprietary rights in and to this software, related documentation
4
+ # and any modifications thereto. Any use, reproduction, disclosure or
5
+ # distribution of this software and related documentation without an express
6
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+
8
+ import numpy as np
9
+ import torch
10
+
11
+ import warp as wp
12
+
13
+ wp.init()
14
+ wp.set_module_options({"enable_backward": False, "fast_math": True})
15
+ wp.set_device("cuda:0")
16
+
17
+ wp.build.clear_kernel_cache()
18
+
19
+
20
+ @wp.kernel
21
+ def gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
22
+ # output index
23
+ i, j = wp.tid()
24
+
25
+ sum = float(0.0)
26
+
27
+ for k in range(0, A.shape[1]):
28
+ sum += A[i, k] * B[k, j]
29
+
30
+ C[i, j] = sum
31
+
32
+
33
+ TILE_M = wp.constant(64)
34
+ TILE_N = wp.constant(64)
35
+ TILE_K = wp.constant(8)
36
+
37
+
38
+ @wp.kernel
39
+ def gemm_tiled(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
40
+ # output tile index
41
+ i, j = wp.tid()
42
+
43
+ sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
44
+
45
+ _M = A.shape[0]
46
+ _N = B.shape[1]
47
+ K = A.shape[1]
48
+
49
+ count = int(K / 8) # TODO: code-gen bug if you use a constant before passing it to a kwd arg (in this case TILE_K)
50
+
51
+ for k in range(count):
52
+ a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
53
+ b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
54
+
55
+ # sum += a*b
56
+ wp.tile_matmul(a, b, sum)
57
+
58
+ wp.tile_store(C, i, j, sum)
59
+
60
+
61
+ def benchmark_numpy(A, B, C):
62
+ timers = {}
63
+ iters = 10
64
+
65
+ # warm up
66
+ for _i in range(10):
67
+ _C = A @ B
68
+
69
+ with wp.ScopedTimer("NumPy", dict=timers):
70
+ for _i in range(iters):
71
+ _C = A @ B
72
+
73
+ return min(timers["NumPy"])
74
+
75
+
76
+ def benchmark_warp_simt(A, B, C):
77
+ timers = {}
78
+ iters = 10
79
+
80
+ A_wp = wp.array(A)
81
+ B_wp = wp.array(B)
82
+ C_wp = wp.array(C)
83
+
84
+ # warm up
85
+ for _i in range(10):
86
+ wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp])
87
+
88
+ with wp.ScopedTimer("Warp (SIMT)", dict=timers, print=False, synchronize=True):
89
+ for _i in range(iters):
90
+ wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp])
91
+
92
+ return min(timers["Warp (SIMT)"])
93
+
94
+
95
+ def benchmark_warp_tiled(A, B, C):
96
+ timers = {}
97
+ iters = 10
98
+
99
+ # must match with the tile_matmul() partition size
100
+ SUB_TILE_M = 4
101
+ SUB_TILE_N = 4
102
+
103
+ num_threads = int(TILE_M / SUB_TILE_M) * int(TILE_N / SUB_TILE_N)
104
+ A_wp = wp.array(A)
105
+ B_wp = wp.array(B)
106
+ C_wp = wp.array(C)
107
+
108
+ # warm up
109
+ wp.capture_begin()
110
+
111
+ for _i in range(iters):
112
+ wp.launch(gemm_tiled, dim=(int(M / TILE_M), int(N / TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads)
113
+
114
+ graph = wp.capture_end()
115
+
116
+ with wp.ScopedTimer("Warp (Tiled)", dict=timers, print=False, synchronize=True):
117
+ # for i in range(iters):
118
+ # wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads)
119
+ wp.capture_launch(graph)
120
+
121
+ return min(timers["Warp (Tiled)"])
122
+
123
+
124
+ def benchmark_torch(A, B, C):
125
+ A_tc = torch.from_numpy(A).to("cuda:0")
126
+ B_tc = torch.from_numpy(B).to("cuda:0")
127
+ C_tc = torch.from_numpy(C).to("cuda:0")
128
+
129
+ # warm-up
130
+ for _i in range(10):
131
+ torch.matmul(A_tc, B_tc, out=C_tc)
132
+
133
+ timers = {}
134
+ iters = 10
135
+
136
+ torch.cuda.synchronize()
137
+
138
+ with wp.ScopedTimer("Torch", dict=timers, print=False):
139
+ for _i in range(iters):
140
+ torch.matmul(A_tc, B_tc) # , out=C_tc)
141
+
142
+ torch.cuda.synchronize()
143
+
144
+ return min(timers["Torch"])
145
+
146
+
147
+ results_torch = []
148
+ results_warp_simt = []
149
+ results_warp_tiled = []
150
+
151
+ print("{:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s}".format("M", "N", "K", "Torch", "Warp (SIMT)", "Warp (Tiled)"))
152
+ print("--------------------------------------------------------")
153
+
154
+ for i in range(2, 33):
155
+ # for i in range(8,9):
156
+
157
+ M = i * 128
158
+ N = M
159
+ K = N
160
+
161
+ # M = TILE_M*21
162
+ # K = TILE_K*7
163
+ # N = TILE_M*12
164
+
165
+ rng = np.random.default_rng(42)
166
+
167
+ A = rng.random((M, K), dtype=np.float32)
168
+ B = rng.random((K, N), dtype=np.float32)
169
+ C = np.zeros((M, N), dtype=np.float32)
170
+
171
+ results_torch.append(benchmark_torch(A, B, C))
172
+ results_warp_simt.append(0.0) # benchmark_warp_simt(A, B, C))
173
+ results_warp_tiled.append(benchmark_warp_tiled(A, B, C))
174
+
175
+ print(
176
+ "{:>8d} {:>8d} {:>8d} {:>8f} {:>8f} {:>8f}".format(
177
+ M, N, K, results_torch[-1], results_warp_simt[-1], results_warp_tiled[-1]
178
+ )
179
+ )
@@ -56,7 +56,7 @@ def mass_form(
56
56
  u: fem.Field,
57
57
  v: fem.Field,
58
58
  ):
59
- return u(s) * v(s)
59
+ return fem.linalg.generalized_inner(u(s), v(s))
60
60
 
61
61
 
62
62
  @fem.integrand
@@ -86,9 +86,12 @@ def pressure_anomaly_field(s: fem.Sample, domain: fem.Domain, pressure: fem.Fiel
86
86
 
87
87
 
88
88
  class Example:
89
- def __init__(self, quiet=False, degree=2, base_resolution=8, level_count=4, headless: bool = False):
89
+ def __init__(
90
+ self, quiet=False, degree=2, div_conforming=False, base_resolution=8, level_count=4, headless: bool = False
91
+ ):
90
92
  self._quiet = quiet
91
93
  self._degree = degree
94
+ self._div_conforming = div_conforming
92
95
 
93
96
  # Start from a coarse, dense grid
94
97
  res = wp.vec3i(2 * base_resolution, base_resolution // 2, base_resolution)
@@ -110,9 +113,13 @@ class Example:
110
113
  sim_vol, level_count, refinement_field=refinement, grading="face"
111
114
  )
112
115
 
113
- # Function spaces for velocity, scalars and pressure (Pk / Pk / Pk-1)
114
- self._u_basis = fem.make_polynomial_basis_space(geo=self._geo, degree=self._degree)
115
- u_space = fem.make_collocated_function_space(self._u_basis, dtype=wp.vec3)
116
+ # Function spaces for velocity, pressure (RTk / Pk-1 or Pk / Pk-1)
117
+ u_space = fem.make_polynomial_space(
118
+ geo=self._geo,
119
+ element_basis=fem.ElementBasis.RAVIART_THOMAS if div_conforming else None,
120
+ degree=self._degree,
121
+ dtype=wp.vec3,
122
+ )
116
123
  p_space = fem.make_polynomial_space(geo=self._geo, degree=self._degree - 1, dtype=float)
117
124
 
118
125
  self.pressure_field = p_space.make_field()
@@ -137,7 +144,17 @@ class Example:
137
144
  def render(self):
138
145
  # self.renderer.add_field("solution", self.pressure_field)
139
146
  self.plot.add_field("pressure_anomaly", self.pressure_anomaly_field)
140
- self.plot.add_field("velocity", self.velocity_field)
147
+
148
+ if self._div_conforming:
149
+ # If using H(div)-conforming elements, interpolate to continuous space
150
+ velocity_field_lagrange = fem.make_polynomial_space(
151
+ self.velocity_field.geometry, dtype=wp.vec3, degree=self._degree
152
+ ).make_field()
153
+ fem.interpolate(self.velocity_field, dest=velocity_field_lagrange)
154
+ else:
155
+ velocity_field_lagrange = self.velocity_field
156
+
157
+ self.plot.add_field("velocity", velocity_field_lagrange)
141
158
 
142
159
  def step(self):
143
160
  u_space = self.velocity_field.space
@@ -153,9 +170,14 @@ class Example:
153
170
  fem.normalize_dirichlet_projector(dirichlet_projector)
154
171
 
155
172
  # (Diagonal) mass matrix
156
- s_space = fem.make_collocated_function_space(self._u_basis, dtype=float)
157
- rho_test = fem.make_test(s_space)
158
- rho_trial = fem.make_trial(s_space)
173
+ if self._div_conforming:
174
+ rho_test = fem.make_test(u_space)
175
+ rho_trial = fem.make_trial(u_space)
176
+ else:
177
+ rho_space = fem.make_polynomial_space(geo=u_space.geometry, degree=self._degree)
178
+ rho_test = fem.make_test(rho_space)
179
+ rho_trial = fem.make_trial(rho_space)
180
+
159
181
  inv_mass_matrix = fem.integrate(
160
182
  mass_form, fields={"u": rho_trial, "v": rho_test}, nodal=True, output_dtype=float
161
183
  )
@@ -177,6 +199,7 @@ class Example:
177
199
  side_divergence_form,
178
200
  fields={"u": u_side_trial, "psi": p_side_test},
179
201
  output_dtype=float,
202
+ assembly="generic", # not required, for test coverage purposes
180
203
  )
181
204
 
182
205
  # Solve incompressibility
@@ -204,7 +227,10 @@ if __name__ == "__main__":
204
227
  parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
205
228
  parser.add_argument("--device", type=str, default=None, help="Override the default Warp device.")
206
229
  parser.add_argument("--resolution", type=int, default=8, help="Grid resolution.")
207
- parser.add_argument("--degree", type=int, default=2, help="Polynomial degree of shape functions.")
230
+ parser.add_argument("--degree", type=int, default=1, help="Polynomial degree of shape functions.")
231
+ parser.add_argument(
232
+ "--div_conforming", action="store_true", default=False, help="Use H(div)-conforming function space"
233
+ )
208
234
  parser.add_argument("--level_count", type=int, default=4, help="Number of refinement levels.")
209
235
  parser.add_argument(
210
236
  "--headless",
@@ -219,6 +245,7 @@ if __name__ == "__main__":
219
245
  example = Example(
220
246
  quiet=args.quiet,
221
247
  degree=args.degree,
248
+ div_conforming=args.div_conforming,
222
249
  base_resolution=args.resolution,
223
250
  level_count=args.level_count,
224
251
  headless=args.headless,
@@ -12,6 +12,8 @@
12
12
  # grid and the PicQuadrature class.
13
13
  ###########################################################################
14
14
 
15
+ from typing import Any
16
+
15
17
  import numpy as np
16
18
 
17
19
  import warp as wp
@@ -123,7 +125,7 @@ def scalar_vector_multiply(
123
125
  @wp.kernel
124
126
  def scale_transposed_divergence_mat(
125
127
  tr_divergence_mat_offsets: wp.array(dtype=int),
126
- tr_divergence_mat_values: wp.array(dtype=wp.mat(shape=(3, 1), dtype=float)),
128
+ tr_divergence_mat_values: wp.array(dtype=Any),
127
129
  inv_fraction_int: wp.array(dtype=float),
128
130
  ):
129
131
  # In-place scaling of gradient operator rows with inverse mass
@@ -203,7 +205,6 @@ class Example:
203
205
  particle_grid_offset = wp.vec3(self.radius, self.radius, self.radius)
204
206
 
205
207
  # Initialize warp.sim model, spawn particles
206
- np.random.seed(0)
207
208
  builder = wp.sim.ModelBuilder()
208
209
  builder.add_particle_grid(
209
210
  dim_x=particle_grid_res[0],