warp-lang 1.4.2-py3-none-manylinux2014_aarch64.whl → 1.5.0-py3-none-manylinux2014_aarch64.whl
This diff shows the changes between the two publicly released package versions as they appear in their public registry. It is provided for informational purposes only.
Potentially problematic release.
This version of warp-lang might be problematic.
- warp/__init__.py +4 -0
- warp/autograd.py +43 -8
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +21 -2
- warp/build_dll.py +23 -6
- warp/builtins.py +1783 -2
- warp/codegen.py +177 -45
- warp/config.py +2 -2
- warp/context.py +321 -73
- warp/examples/assets/pixel.jpg +0 -0
- warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
- warp/examples/benchmarks/benchmark_gemm.py +121 -0
- warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
- warp/examples/benchmarks/benchmark_tile.py +179 -0
- warp/examples/fem/example_adaptive_grid.py +37 -10
- warp/examples/fem/example_apic_fluid.py +3 -2
- warp/examples/fem/example_convection_diffusion_dg.py +4 -5
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion_3d.py +47 -4
- warp/examples/fem/example_distortion_energy.py +220 -0
- warp/examples/fem/example_magnetostatics.py +127 -85
- warp/examples/fem/example_nonconforming_contact.py +5 -5
- warp/examples/fem/example_stokes.py +3 -1
- warp/examples/fem/example_streamlines.py +12 -19
- warp/examples/fem/utils.py +38 -15
- warp/examples/sim/example_cloth.py +2 -25
- warp/examples/sim/example_quadruped.py +2 -1
- warp/examples/tile/example_tile_convolution.py +58 -0
- warp/examples/tile/example_tile_fft.py +47 -0
- warp/examples/tile/example_tile_filtering.py +105 -0
- warp/examples/tile/example_tile_matmul.py +79 -0
- warp/examples/tile/example_tile_mlp.py +375 -0
- warp/fem/__init__.py +8 -0
- warp/fem/cache.py +16 -12
- warp/fem/dirichlet.py +1 -1
- warp/fem/domain.py +44 -1
- warp/fem/field/__init__.py +1 -2
- warp/fem/field/field.py +31 -19
- warp/fem/field/nodal_field.py +101 -49
- warp/fem/field/virtual.py +794 -0
- warp/fem/geometry/__init__.py +2 -2
- warp/fem/geometry/deformed_geometry.py +3 -105
- warp/fem/geometry/element.py +13 -0
- warp/fem/geometry/geometry.py +165 -5
- warp/fem/geometry/grid_2d.py +3 -6
- warp/fem/geometry/grid_3d.py +31 -28
- warp/fem/geometry/hexmesh.py +3 -46
- warp/fem/geometry/nanogrid.py +3 -2
- warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
- warp/fem/geometry/tetmesh.py +2 -43
- warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
- warp/fem/integrate.py +683 -261
- warp/fem/linalg.py +404 -0
- warp/fem/operator.py +101 -18
- warp/fem/polynomial.py +5 -5
- warp/fem/quadrature/quadrature.py +45 -21
- warp/fem/space/__init__.py +45 -11
- warp/fem/space/basis_function_space.py +451 -0
- warp/fem/space/basis_space.py +58 -11
- warp/fem/space/function_space.py +146 -5
- warp/fem/space/grid_2d_function_space.py +80 -66
- warp/fem/space/grid_3d_function_space.py +113 -68
- warp/fem/space/hexmesh_function_space.py +96 -108
- warp/fem/space/nanogrid_function_space.py +62 -110
- warp/fem/space/quadmesh_function_space.py +208 -0
- warp/fem/space/shape/__init__.py +45 -7
- warp/fem/space/shape/cube_shape_function.py +328 -54
- warp/fem/space/shape/shape_function.py +10 -1
- warp/fem/space/shape/square_shape_function.py +328 -60
- warp/fem/space/shape/tet_shape_function.py +269 -19
- warp/fem/space/shape/triangle_shape_function.py +238 -19
- warp/fem/space/tetmesh_function_space.py +69 -37
- warp/fem/space/topology.py +38 -0
- warp/fem/space/trimesh_function_space.py +179 -0
- warp/fem/utils.py +6 -331
- warp/jax_experimental.py +3 -1
- warp/native/array.h +15 -0
- warp/native/builtin.h +66 -26
- warp/native/bvh.h +4 -0
- warp/native/coloring.cpp +600 -0
- warp/native/cuda_util.cpp +14 -0
- warp/native/cuda_util.h +2 -1
- warp/native/fabric.h +8 -0
- warp/native/hashgrid.h +4 -0
- warp/native/marching.cu +8 -0
- warp/native/mat.h +14 -3
- warp/native/mathdx.cpp +59 -0
- warp/native/mesh.h +4 -0
- warp/native/range.h +13 -1
- warp/native/reduce.cpp +9 -1
- warp/native/reduce.cu +7 -0
- warp/native/runlength_encode.cpp +9 -1
- warp/native/runlength_encode.cu +7 -1
- warp/native/scan.cpp +8 -0
- warp/native/scan.cu +8 -0
- warp/native/scan.h +8 -1
- warp/native/sparse.cpp +8 -0
- warp/native/sparse.cu +8 -0
- warp/native/temp_buffer.h +7 -0
- warp/native/tile.h +1857 -0
- warp/native/tile_gemm.h +341 -0
- warp/native/tile_reduce.h +210 -0
- warp/native/volume_builder.cu +8 -0
- warp/native/volume_builder.h +8 -0
- warp/native/warp.cpp +10 -2
- warp/native/warp.cu +369 -15
- warp/native/warp.h +12 -2
- warp/optim/adam.py +39 -4
- warp/paddle.py +29 -12
- warp/render/render_opengl.py +137 -65
- warp/sim/graph_coloring.py +292 -0
- warp/sim/integrator_euler.py +4 -2
- warp/sim/integrator_featherstone.py +115 -44
- warp/sim/integrator_vbd.py +6 -0
- warp/sim/model.py +88 -15
- warp/stubs.py +569 -4
- warp/tape.py +12 -7
- warp/tests/assets/pixel.npy +0 -0
- warp/tests/aux_test_instancing_gc.py +18 -0
- warp/tests/test_array.py +39 -0
- warp/tests/test_codegen.py +81 -1
- warp/tests/test_codegen_instancing.py +30 -0
- warp/tests/test_collision.py +110 -0
- warp/tests/test_coloring.py +241 -0
- warp/tests/test_context.py +34 -0
- warp/tests/test_examples.py +18 -4
- warp/tests/test_fem.py +453 -113
- warp/tests/test_func.py +13 -0
- warp/tests/test_generics.py +52 -0
- warp/tests/test_iter.py +68 -0
- warp/tests/test_mat_scalar_ops.py +1 -1
- warp/tests/test_mesh_query_point.py +1 -1
- warp/tests/test_module_hashing.py +23 -0
- warp/tests/test_paddle.py +27 -87
- warp/tests/test_print.py +56 -1
- warp/tests/test_spatial.py +1 -1
- warp/tests/test_tile.py +700 -0
- warp/tests/test_tile_mathdx.py +144 -0
- warp/tests/test_tile_mlp.py +383 -0
- warp/tests/test_tile_reduce.py +374 -0
- warp/tests/test_tile_shared_memory.py +190 -0
- warp/tests/test_vbd.py +12 -20
- warp/tests/test_volume.py +43 -0
- warp/tests/unittest_suites.py +19 -2
- warp/tests/unittest_utils.py +4 -0
- warp/types.py +338 -72
- warp/utils.py +22 -1
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/RECORD +153 -126
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
- warp/fem/field/test.py +0 -180
- warp/fem/field/trial.py +0 -183
- warp/fem/space/collocated_function_space.py +0 -102
- warp/fem/space/quadmesh_2d_function_space.py +0 -261
- warp/fem/space/trimesh_2d_function_space.py +0 -153
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
warp/examples/assets/pixel.jpg
Binary file

warp/examples/benchmarks/benchmark_cloth_paddle.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import paddle
+
+
+def eval_springs(x, v, indices, rest, ke, kd, f):
+    i = indices[:, 0]
+    j = indices[:, 1]
+
+    xi = x[i]
+    xj = x[j]
+
+    vi = v[i]
+    vj = v[j]
+
+    xij = xi - xj
+    vij = vi - vj
+
+    l = paddle.linalg.norm(xij, axis=1)
+    l_inv = 1.0 / l
+
+    # normalized spring direction
+    dir = (xij.T * l_inv).T
+
+    c = l - rest
+    dcdt = paddle.sum(dir * vij, axis=1)
+
+    # damping based on relative velocity.
+    fs = dir.T * (ke * c + kd * dcdt)
+
+    f.index_add_(axis=0, index=i, value=-fs.T)
+    f.index_add_(axis=0, index=j, value=fs.T)
+
+
+def integrate_particles(x, v, f, g, w, dt):
+    s = w > 0.0
+
+    a_ext = g * s[:, None].astype(g.dtype)
+
+    # simple semi-implicit Euler. v1 = v0 + a dt, x1 = x0 + v1 dt
+    v += ((f.T * w).T + a_ext) * dt
+    x += v * dt
+
+    # clear forces
+    f *= 0.0
+
+
+class TrIntegrator:
+    def __init__(self, cloth, device):
+        self.cloth = cloth
+
+        self.positions = paddle.to_tensor(self.cloth.positions, place=device)
+        self.velocities = paddle.to_tensor(self.cloth.velocities, place=device)
+        self.inv_mass = paddle.to_tensor(self.cloth.inv_masses, place=device)
+
+        self.spring_indices = paddle.to_tensor(self.cloth.spring_indices, dtype=paddle.int64, place=device)
+        self.spring_lengths = paddle.to_tensor(self.cloth.spring_lengths, place=device)
+        self.spring_stiffness = paddle.to_tensor(self.cloth.spring_stiffness, place=device)
+        self.spring_damping = paddle.to_tensor(self.cloth.spring_damping, place=device)
+
+        self.forces = paddle.zeros((self.cloth.num_particles, 3), dtype=paddle.float32).to(device=device)
+        self.gravity = paddle.to_tensor((0.0, 0.0 - 9.8, 0.0), dtype=paddle.float32, place=device)
+
+    def simulate(self, dt, substeps):
+        sim_dt = dt / substeps
+
+        for _s in range(substeps):
+            eval_springs(
+                self.positions,
+                self.velocities,
+                self.spring_indices.reshape((self.cloth.num_springs, 2)),
+                self.spring_lengths,
+                self.spring_stiffness,
+                self.spring_damping,
+                self.forces,
+            )
+
+            # integrate
+            integrate_particles(self.positions, self.velocities, self.forces, self.gravity, self.inv_mass, sim_dt)
+
+        return self.positions.cpu().numpy()

warp/examples/benchmarks/benchmark_gemm.py
@@ -0,0 +1,121 @@
+from itertools import product
+
+import numpy as np
+import torch as tc
+
+import warp as wp
+
+tc.backends.cuda.matmul.allow_tf32 = False  # Disable TF32 for matrix multiplications
+tc.backends.cudnn.allow_tf32 = False  # Disable TF32 for cuDNN operations
+
+wp.init()
+wp.clear_kernel_cache()
+wp.set_module_options({"fast_math": True, "enable_backward": False})
+
+
+def create_mlp_kernel(m, n, k):
+    TILE_M = m
+    TILE_N = n
+    TILE_K = k
+
+    @wp.kernel
+    def mlp(x: wp.array2d(dtype=float), weights_wp: wp.array2d(dtype=float), n_k: int, output: wp.array2d(dtype=float)):
+        i_m, i_n = wp.tid()
+        sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
+        for count in range(n_k):
+            feat = wp.tile_load(x, i_m, count, TILE_M, TILE_K)
+            weight = wp.tile_load(weights_wp, count, i_n, TILE_K, TILE_N)
+            wp.tile_matmul(feat, weight, sum)
+
+        wp.tile_store(output, i_m, i_n, sum)
+
+    return mlp
+
+
+def benchmark_torch(A, B, warm_up, iterations):
+    # warm-up
+    for _ in range(warm_up):
+        tc.matmul(A, B)
+
+    timers = {}
+    tc.cuda.synchronize()
+
+    with wp.ScopedTimer("torch", print=False, dict=timers, synchronize=True):
+        for _ in range(iterations):
+            tc.matmul(A, B)
+
+    tc.cuda.synchronize()
+
+    return timers["torch"][0]
+
+
+def benchmark_warp(A, B, config, warm_up, iterations):
+    TILE_M = config[0]
+    TILE_N = config[1]
+    TILE_K = config[2]
+    BLOCK_DIM = config[3]
+
+    mlp = create_mlp_kernel(TILE_M, TILE_N, TILE_K)
+
+    M = A.shape[0]
+    N = B.shape[1]
+    K = A.shape[1]
+
+    output = wp.zeros((M, N), dtype=float)
+
+    # warm-up
+    for _ in range(warm_up):
+        wp.launch_tiled(
+            kernel=mlp, dim=[M // TILE_M, N // TILE_N], inputs=[A, B, K // TILE_K, output], block_dim=BLOCK_DIM
+        )
+
+    # check output
+    if warm_up > 0:
+        assert np.allclose(output.numpy(), A.numpy() @ B.numpy(), atol=1e-3, rtol=1e-3)
+
+    # benchmark
+    timers = {}
+    with wp.ScopedTimer("warp", print=False, dict=timers, synchronize=True):
+        for _ in range(iterations):
+            wp.launch_tiled(
+                kernel=mlp, dim=[M // TILE_M, N // TILE_N], inputs=[A, B, K // TILE_K, output], block_dim=BLOCK_DIM
+            )
+
+    return timers["warp"][0]
+
+
+tile_m = [8, 16, 32, 64]
+tile_n = [8, 16, 32, 64]
+tile_k = [8, 16, 64]
+block = [32, 64, 128]
+
+M = 1024
+N = 1024
+K = 1024
+
+A = tc.randn(M, K).cuda()
+B = tc.randn(K, N).cuda()
+
+iterations = 1000
+warm_up = 10
+
+time_torch = benchmark_torch(A, B, warm_up, iterations)
+print(f"Torch: {time_torch}")
+
+configs = list(product(tile_m, tile_n, tile_k, block))
+
+wp.config.quiet = True
+
+# header
+print(
+    "{:<{}} {:<{}} {:<{}} {:<{}} {:<{}} {:<{}}".format(
+        "TILE_M", 12, "TILE_N", 12, "TILE_K", 12, "BLOCK", 12, "Time", 12, "Relative", 12
+    )
+)
+for c in configs:
+    time_warp = benchmark_warp(wp.from_torch(A), wp.from_torch(B), c, warm_up, iterations)
+    print(
+        "{:<{}} {:<{}} {:<{}} {:<{}} {:<{}} {:<{}}".format(
+            c[0], 12, c[1], 12, c[2], 12, c[3], 12, time_warp, 12, time_warp / time_torch, 12
+        )
+    )
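
Note: the benchmark above autotunes a tiled matmul by sweeping tile shapes and block sizes for a fixed 1024x1024x1024 problem. As a minimal, self-contained sketch of the same tile pipeline it exercises (tile_zeros / tile_load / tile_matmul / tile_store driven by wp.launch_tiled); the kernel name, the 256-element problem size, and the 32x32 tile shapes below are illustrative choices, not from the diff, and a CUDA device is assumed:

import numpy as np

import warp as wp

wp.init()

TILE_M, TILE_N, TILE_K = 32, 32, 32  # illustrative tile shapes


@wp.kernel
def matmul_tiled(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), n_k: int, C: wp.array2d(dtype=float)):
    # one thread block cooperates on one TILE_M x TILE_N output tile
    i, j = wp.tid()
    acc = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
    for k in range(n_k):
        a = wp.tile_load(A, i, k, TILE_M, TILE_K)  # TILE_M x TILE_K slab of A
        b = wp.tile_load(B, k, j, TILE_K, TILE_N)  # TILE_K x TILE_N slab of B
        wp.tile_matmul(a, b, acc)  # acc += a @ b, computed cooperatively by the block
    wp.tile_store(C, i, j, acc)


M = N = K = 256
A = wp.array(np.random.rand(M, K).astype(np.float32))
B = wp.array(np.random.rand(K, N).astype(np.float32))
C = wp.zeros((M, N), dtype=float)

# dim counts output tiles, not elements; block_dim sets the cooperating thread count
wp.launch_tiled(matmul_tiled, dim=[M // TILE_M, N // TILE_N], inputs=[A, B, K // TILE_K, C], block_dim=64)
assert np.allclose(C.numpy(), A.numpy() @ B.numpy(), atol=1e-3, rtol=1e-3)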

warp/examples/benchmarks/benchmark_interop_paddle.py
@@ -0,0 +1,158 @@
+# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import time
+
+import paddle
+
+import warp as wp
+
+
+def create_simple_kernel(dtype):
+    def simple_kernel(
+        a: wp.array(dtype=dtype),
+        b: wp.array(dtype=dtype),
+        c: wp.array(dtype=dtype),
+        d: wp.array(dtype=dtype),
+        e: wp.array(dtype=dtype),
+    ):
+        pass
+
+    return wp.Kernel(simple_kernel)
+
+
+def test_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
+    warp_device = wp.get_device(device)
+    paddle_device = wp.device_to_paddle(warp_device)
+
+    if hasattr(warp_dtype, "_shape_"):
+        paddle_shape = (array_size, *warp_dtype._shape_)
+        paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
+    else:
+        paddle_shape = (array_size,)
+        paddle_dtype = paddle.float32 if warp_dtype is None else wp.dtype_to_paddle(warp_dtype)
+
+    _a = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _b = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _c = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _d = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _e = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+
+    wp.synchronize()
+
+    # profiler = Profiler(interval=0.000001)
+    # profiler.start()
+
+    t1 = time.time_ns()
+
+    for _ in range(num_iters):
+        a = wp.from_paddle(_a, dtype=warp_dtype)
+        b = wp.from_paddle(_b, dtype=warp_dtype)
+        c = wp.from_paddle(_c, dtype=warp_dtype)
+        d = wp.from_paddle(_d, dtype=warp_dtype)
+        e = wp.from_paddle(_e, dtype=warp_dtype)
+        wp.launch(kernel, dim=array_size, inputs=[a, b, c, d, e])
+
+    t2 = time.time_ns()
+    print(f"{(t2 - t1) / 1_000_000 :8.0f} ms from_paddle(...)")
+
+    # profiler.stop()
+    # profiler.print()
+
+
+def test_array_ctype_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
+    warp_device = wp.get_device(device)
+    paddle_device = wp.device_to_paddle(warp_device)
+
+    if hasattr(warp_dtype, "_shape_"):
+        paddle_shape = (array_size, *warp_dtype._shape_)
+        paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
+    else:
+        paddle_shape = (array_size,)
+        paddle_dtype = paddle.float32 if warp_dtype is None else wp.dtype_to_paddle(warp_dtype)
+
+    _a = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _b = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _c = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _d = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _e = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+
+    wp.synchronize()
+
+    # profiler = Profiler(interval=0.000001)
+    # profiler.start()
+
+    t1 = time.time_ns()
+
+    for _ in range(num_iters):
+        a = wp.from_paddle(_a, dtype=warp_dtype, return_ctype=True)
+        b = wp.from_paddle(_b, dtype=warp_dtype, return_ctype=True)
+        c = wp.from_paddle(_c, dtype=warp_dtype, return_ctype=True)
+        d = wp.from_paddle(_d, dtype=warp_dtype, return_ctype=True)
+        e = wp.from_paddle(_e, dtype=warp_dtype, return_ctype=True)
+        wp.launch(kernel, dim=array_size, inputs=[a, b, c, d, e])
+
+    t2 = time.time_ns()
+    print(f"{(t2 - t1) / 1_000_000 :8.0f} ms from_paddle(..., return_ctype=True)")
+
+    # profiler.stop()
+    # profiler.print()
+
+
+def test_direct_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
+    warp_device = wp.get_device(device)
+    paddle_device = wp.device_to_paddle(warp_device)
+
+    if hasattr(warp_dtype, "_shape_"):
+        paddle_shape = (array_size, *warp_dtype._shape_)
+        paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
+    else:
+        paddle_shape = (array_size,)
+        paddle_dtype = paddle.float32 if warp_dtype is None else wp.dtype_to_paddle(warp_dtype)
+
+    _a = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _b = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _c = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _d = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+    _e = paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device)
+
+    wp.synchronize()
+
+    # profiler = Profiler(interval=0.000001)
+    # profiler.start()
+
+    t1 = time.time_ns()
+
+    for _ in range(num_iters):
+        wp.launch(kernel, dim=array_size, inputs=[_a, _b, _c, _d, _e])
+
+    t2 = time.time_ns()
+    print(f"{(t2 - t1) / 1_000_000 :8.0f} ms direct from paddle")
+
+    # profiler.stop()
+    # profiler.print()
+
+
+wp.init()
+
+params = [
+    # (warp_dtype arg, kernel)
+    (None, create_simple_kernel(wp.float32)),
+    (wp.float32, create_simple_kernel(wp.float32)),
+    (wp.vec3f, create_simple_kernel(wp.vec3f)),
+    (wp.mat22f, create_simple_kernel(wp.mat22f)),
+]
+
+wp.load_module()
+
+num_iters = 100000
+
+for warp_dtype, kernel in params:
+    print(f"\ndtype={wp.context.type_str(warp_dtype)}")
+    test_from_paddle(kernel, num_iters, 10, "cuda:0", warp_dtype=warp_dtype)
+    test_array_ctype_from_paddle(kernel, num_iters, 10, "cuda:0", warp_dtype=warp_dtype)
+    test_direct_from_paddle(kernel, num_iters, 10, "cuda:0", warp_dtype=warp_dtype)
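
Note: the three timing paths above differ only in how the Paddle tensors reach wp.launch: re-wrapped with wp.from_paddle() every iteration, re-wrapped as a raw ctype every iteration, or passed to the launch directly. A sketch of the cheapest steady-state pattern this comparison points at, wrapping once outside the hot loop (the kernel and sizes are illustrative, and a GPU build of Paddle is assumed so the tensor and the kernel live on the same device):

import paddle

import warp as wp

wp.init()


@wp.kernel
def scale(a: wp.array(dtype=float)):
    tid = wp.tid()
    a[tid] = a[tid] * 2.0


t = paddle.ones((1024,), dtype=paddle.float32)
a = wp.from_paddle(t)  # zero-copy Warp view over the Paddle tensor, created once

for _ in range(100):
    # no per-iteration wrapping cost; `a` aliases the memory of `t`
    wp.launch(scale, dim=a.shape[0], inputs=[a])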

warp/examples/benchmarks/benchmark_tile.py
@@ -0,0 +1,179 @@
+# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import numpy as np
+import torch
+
+import warp as wp
+
+wp.init()
+wp.set_module_options({"enable_backward": False, "fast_math": True})
+wp.set_device("cuda:0")
+
+wp.build.clear_kernel_cache()
+
+
+@wp.kernel
+def gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
+    # output index
+    i, j = wp.tid()
+
+    sum = float(0.0)
+
+    for k in range(0, A.shape[1]):
+        sum += A[i, k] * B[k, j]
+
+    C[i, j] = sum
+
+
+TILE_M = wp.constant(64)
+TILE_N = wp.constant(64)
+TILE_K = wp.constant(8)
+
+
+@wp.kernel
+def gemm_tiled(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
+    # output tile index
+    i, j = wp.tid()
+
+    sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
+
+    _M = A.shape[0]
+    _N = B.shape[1]
+    K = A.shape[1]
+
+    count = int(K / 8)  # TODO: code-gen bug if you use a constant before passing it to a kwd arg (in this case TILE_K)
+
+    for k in range(count):
+        a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
+        b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
+
+        # sum += a*b
+        wp.tile_matmul(a, b, sum)
+
+    wp.tile_store(C, i, j, sum)
+
+
+def benchmark_numpy(A, B, C):
+    timers = {}
+    iters = 10
+
+    # warm up
+    for _i in range(10):
+        _C = A @ B
+
+    with wp.ScopedTimer("NumPy", dict=timers):
+        for _i in range(iters):
+            _C = A @ B
+
+    return min(timers["NumPy"])
+
+
+def benchmark_warp_simt(A, B, C):
+    timers = {}
+    iters = 10
+
+    A_wp = wp.array(A)
+    B_wp = wp.array(B)
+    C_wp = wp.array(C)
+
+    # warm up
+    for _i in range(10):
+        wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp])
+
+    with wp.ScopedTimer("Warp (SIMT)", dict=timers, print=False, synchronize=True):
+        for _i in range(iters):
+            wp.launch(gemm, dim=(M, N), inputs=[A_wp, B_wp, C_wp])
+
+    return min(timers["Warp (SIMT)"])
+
+
+def benchmark_warp_tiled(A, B, C):
+    timers = {}
+    iters = 10
+
+    # must match with the tile_matmul() partition size
+    SUB_TILE_M = 4
+    SUB_TILE_N = 4
+
+    num_threads = int(TILE_M / SUB_TILE_M) * int(TILE_N / SUB_TILE_N)
+    A_wp = wp.array(A)
+    B_wp = wp.array(B)
+    C_wp = wp.array(C)
+
+    # warm up
+    wp.capture_begin()
+
+    for _i in range(iters):
+        wp.launch(gemm_tiled, dim=(int(M / TILE_M), int(N / TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads)
+
+    graph = wp.capture_end()
+
+    with wp.ScopedTimer("Warp (Tiled)", dict=timers, print=False, synchronize=True):
+        # for i in range(iters):
+        #    wp.launch(gemm_tiled, dim=(int(M/TILE_M), int(N/TILE_N)), inputs=[A_wp, B_wp, C_wp], tile_size=num_threads)
+        wp.capture_launch(graph)
+
+    return min(timers["Warp (Tiled)"])
+
+
+def benchmark_torch(A, B, C):
+    A_tc = torch.from_numpy(A).to("cuda:0")
+    B_tc = torch.from_numpy(B).to("cuda:0")
+    C_tc = torch.from_numpy(C).to("cuda:0")
+
+    # warm-up
+    for _i in range(10):
+        torch.matmul(A_tc, B_tc, out=C_tc)
+
+    timers = {}
+    iters = 10
+
+    torch.cuda.synchronize()
+
+    with wp.ScopedTimer("Torch", dict=timers, print=False):
+        for _i in range(iters):
+            torch.matmul(A_tc, B_tc)  # , out=C_tc)
+
+    torch.cuda.synchronize()
+
+    return min(timers["Torch"])
+
+
+results_torch = []
+results_warp_simt = []
+results_warp_tiled = []
+
+print("{:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s}".format("M", "N", "K", "Torch", "Warp (SIMT)", "Warp (Tiled)"))
+print("--------------------------------------------------------")
+
+for i in range(2, 33):
+    # for i in range(8,9):
+
+    M = i * 128
+    N = M
+    K = N
+
+    # M = TILE_M*21
+    # K = TILE_K*7
+    # N = TILE_M*12
+
+    rng = np.random.default_rng(42)
+
+    A = rng.random((M, K), dtype=np.float32)
+    B = rng.random((K, N), dtype=np.float32)
+    C = np.zeros((M, N), dtype=np.float32)
+
+    results_torch.append(benchmark_torch(A, B, C))
+    results_warp_simt.append(0.0)  # benchmark_warp_simt(A, B, C))
+    results_warp_tiled.append(benchmark_warp_tiled(A, B, C))
+
+    print(
+        "{:>8d} {:>8d} {:>8d} {:>8f} {:>8f} {:>8f}".format(
+            M, N, K, results_torch[-1], results_warp_simt[-1], results_warp_tiled[-1]
+        )
+    )
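
Note: in benchmark_warp_tiled above, ten tiled launches are recorded between wp.capture_begin() and wp.capture_end(), and the timed region then replays them with a single wp.capture_launch(), amortizing per-launch CPU overhead. A minimal sketch of that capture pattern in isolation (the saxpy kernel and sizes are placeholders; a CUDA device is required):

import warp as wp

wp.init()
wp.set_device("cuda:0")


@wp.kernel
def saxpy(x: wp.array(dtype=float), y: wp.array(dtype=float), a: float):
    tid = wp.tid()
    y[tid] = a * x[tid] + y[tid]


x = wp.full(1024, 1.0, dtype=float)
y = wp.zeros(1024, dtype=float)

wp.capture_begin()
for _ in range(10):  # record ten launches into one CUDA graph
    wp.launch(saxpy, dim=1024, inputs=[x, y, 2.0])
graph = wp.capture_end()

timers = {}
with wp.ScopedTimer("graph", dict=timers, print=False, synchronize=True):
    wp.capture_launch(graph)  # replays all ten launches in one call
print(timers["graph"][0])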

warp/examples/fem/example_adaptive_grid.py
@@ -56,7 +56,7 @@ def mass_form(
     u: fem.Field,
     v: fem.Field,
 ):
-    return u(s)
+    return fem.linalg.generalized_inner(u(s), v(s))
 
 
 @fem.integrand
@@ -86,9 +86,12 @@ def pressure_anomaly_field(s: fem.Sample, domain: fem.Domain, pressure: fem.Field):
 
 
 class Example:
-    def __init__(
+    def __init__(
+        self, quiet=False, degree=2, div_conforming=False, base_resolution=8, level_count=4, headless: bool = False
+    ):
         self._quiet = quiet
         self._degree = degree
+        self._div_conforming = div_conforming
 
         # Start from a coarse, dense grid
         res = wp.vec3i(2 * base_resolution, base_resolution // 2, base_resolution)
@@ -110,9 +113,13 @@ class Example:
             sim_vol, level_count, refinement_field=refinement, grading="face"
         )
 
-        # Function spaces for velocity,
-
-
+        # Function spaces for velocity, pressure (RTk / Pk-1 or Pk / Pk-1)
+        u_space = fem.make_polynomial_space(
+            geo=self._geo,
+            element_basis=fem.ElementBasis.RAVIART_THOMAS if div_conforming else None,
+            degree=self._degree,
+            dtype=wp.vec3,
+        )
         p_space = fem.make_polynomial_space(geo=self._geo, degree=self._degree - 1, dtype=float)
 
         self.pressure_field = p_space.make_field()
@@ -137,7 +144,17 @@ class Example:
     def render(self):
         # self.renderer.add_field("solution", self.pressure_field)
         self.plot.add_field("pressure_anomaly", self.pressure_anomaly_field)
-
+
+        if self._div_conforming:
+            # If using H(div)-conforming elements, interpolate to continuous space
+            velocity_field_lagrange = fem.make_polynomial_space(
+                self.velocity_field.geometry, dtype=wp.vec3, degree=self._degree
+            ).make_field()
+            fem.interpolate(self.velocity_field, dest=velocity_field_lagrange)
+        else:
+            velocity_field_lagrange = self.velocity_field
+
+        self.plot.add_field("velocity", velocity_field_lagrange)
 
     def step(self):
         u_space = self.velocity_field.space
@@ -153,9 +170,14 @@ class Example:
         fem.normalize_dirichlet_projector(dirichlet_projector)
 
         # (Diagonal) mass matrix
-
-
-
+        if self._div_conforming:
+            rho_test = fem.make_test(u_space)
+            rho_trial = fem.make_trial(u_space)
+        else:
+            rho_space = fem.make_polynomial_space(geo=u_space.geometry, degree=self._degree)
+            rho_test = fem.make_test(rho_space)
+            rho_trial = fem.make_trial(rho_space)
+
         inv_mass_matrix = fem.integrate(
             mass_form, fields={"u": rho_trial, "v": rho_test}, nodal=True, output_dtype=float
         )
@@ -177,6 +199,7 @@ class Example:
             side_divergence_form,
             fields={"u": u_side_trial, "psi": p_side_test},
             output_dtype=float,
+            assembly="generic",  # not required, for test coverage purposes
         )
 
         # Solve incompressibility
@@ -204,7 +227,10 @@ if __name__ == "__main__":
     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
     parser.add_argument("--device", type=str, default=None, help="Override the default Warp device.")
    parser.add_argument("--resolution", type=int, default=8, help="Grid resolution.")
-    parser.add_argument("--degree", type=int, default=
+    parser.add_argument("--degree", type=int, default=1, help="Polynomial degree of shape functions.")
+    parser.add_argument(
+        "--div_conforming", action="store_true", default=False, help="Use H(div)-conforming function space"
+    )
     parser.add_argument("--level_count", type=int, default=4, help="Number of refinement levels.")
     parser.add_argument(
         "--headless",
@@ -219,6 +245,7 @@ if __name__ == "__main__":
         example = Example(
             quiet=args.quiet,
             degree=args.degree,
+            div_conforming=args.div_conforming,
             base_resolution=args.resolution,
             level_count=args.level_count,
             headless=args.headless,
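
Note: the changes above thread one div_conforming switch through the example: the velocity space is either a vector Lagrange space or an H(div)-conforming Raviart-Thomas space, and rendering interpolates the latter back to a continuous Lagrange field for display. A sketch of just the space selection, mirroring the lines added in the diff (geometry construction elided; geo stands for any warp.fem geometry):

import warp as wp
import warp.fem as fem


def make_velocity_space(geo, degree: int, div_conforming: bool):
    # Raviart-Thomas (H(div)-conforming) basis when requested, vector Lagrange otherwise
    return fem.make_polynomial_space(
        geo=geo,
        element_basis=fem.ElementBasis.RAVIART_THOMAS if div_conforming else None,
        degree=degree,
        dtype=wp.vec3,
    )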

warp/examples/fem/example_apic_fluid.py
@@ -12,6 +12,8 @@
 # grid and the PicQuadrature class.
 ###########################################################################
 
+from typing import Any
+
 import numpy as np
 
 import warp as wp
@@ -123,7 +125,7 @@ def scalar_vector_multiply(
 @wp.kernel
 def scale_transposed_divergence_mat(
     tr_divergence_mat_offsets: wp.array(dtype=int),
-    tr_divergence_mat_values: wp.array(dtype=
+    tr_divergence_mat_values: wp.array(dtype=Any),
     inv_fraction_int: wp.array(dtype=float),
 ):
     # In-place scaling of gradient operator rows with inverse mass
@@ -203,7 +205,6 @@ class Example:
         particle_grid_offset = wp.vec3(self.radius, self.radius, self.radius)
 
         # Initialize warp.sim model, spawn particles
-        np.random.seed(0)
         builder = wp.sim.ModelBuilder()
         builder.add_particle_grid(
             dim_x=particle_grid_res[0],
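
Note: the typing.Any annotation added above makes scale_transposed_divergence_mat generic over the dtype of its values array, so one kernel definition can serve different value types, with Warp instantiating the kernel for the concrete dtype seen at launch. A minimal sketch of the same mechanism (the kernel and data below are illustrative, not from the package):

from typing import Any

import warp as wp

wp.init()


@wp.kernel
def scale_generic(values: wp.array(dtype=Any), factor: float):
    tid = wp.tid()
    values[tid] = values[tid] * factor


# one kernel definition, instantiated per concrete value type at launch
v_scalar = wp.full(4, 2.0, dtype=float)
v_vec = wp.full(4, wp.vec3(1.0, 2.0, 3.0), dtype=wp.vec3)

wp.launch(scale_generic, dim=4, inputs=[v_scalar, 0.5])
wp.launch(scale_generic, dim=4, inputs=[v_vec, 0.5])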
|