warp-lang 1.4.1__py3-none-manylinux2014_x86_64.whl → 1.5.0__py3-none-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +4 -0
- warp/autograd.py +43 -8
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +21 -2
- warp/build_dll.py +23 -6
- warp/builtins.py +1920 -111
- warp/codegen.py +186 -62
- warp/config.py +2 -2
- warp/context.py +322 -73
- warp/examples/assets/pixel.jpg +0 -0
- warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
- warp/examples/benchmarks/benchmark_gemm.py +121 -0
- warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
- warp/examples/benchmarks/benchmark_tile.py +179 -0
- warp/examples/core/example_dem.py +2 -1
- warp/examples/core/example_mesh_intersect.py +3 -3
- warp/examples/fem/example_adaptive_grid.py +37 -10
- warp/examples/fem/example_apic_fluid.py +3 -2
- warp/examples/fem/example_convection_diffusion_dg.py +4 -5
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion_3d.py +47 -4
- warp/examples/fem/example_distortion_energy.py +220 -0
- warp/examples/fem/example_magnetostatics.py +127 -85
- warp/examples/fem/example_nonconforming_contact.py +5 -5
- warp/examples/fem/example_stokes.py +3 -1
- warp/examples/fem/example_streamlines.py +12 -19
- warp/examples/fem/utils.py +38 -15
- warp/examples/optim/example_walker.py +2 -2
- warp/examples/sim/example_cloth.py +2 -25
- warp/examples/sim/example_jacobian_ik.py +6 -2
- warp/examples/sim/example_quadruped.py +2 -1
- warp/examples/tile/example_tile_convolution.py +58 -0
- warp/examples/tile/example_tile_fft.py +47 -0
- warp/examples/tile/example_tile_filtering.py +105 -0
- warp/examples/tile/example_tile_matmul.py +79 -0
- warp/examples/tile/example_tile_mlp.py +375 -0
- warp/fem/__init__.py +8 -0
- warp/fem/cache.py +16 -12
- warp/fem/dirichlet.py +1 -1
- warp/fem/domain.py +44 -1
- warp/fem/field/__init__.py +1 -2
- warp/fem/field/field.py +31 -19
- warp/fem/field/nodal_field.py +101 -49
- warp/fem/field/virtual.py +794 -0
- warp/fem/geometry/__init__.py +2 -2
- warp/fem/geometry/deformed_geometry.py +3 -105
- warp/fem/geometry/element.py +13 -0
- warp/fem/geometry/geometry.py +165 -5
- warp/fem/geometry/grid_2d.py +3 -6
- warp/fem/geometry/grid_3d.py +31 -28
- warp/fem/geometry/hexmesh.py +3 -46
- warp/fem/geometry/nanogrid.py +3 -2
- warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
- warp/fem/geometry/tetmesh.py +2 -43
- warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
- warp/fem/integrate.py +683 -261
- warp/fem/linalg.py +404 -0
- warp/fem/operator.py +101 -18
- warp/fem/polynomial.py +5 -5
- warp/fem/quadrature/quadrature.py +45 -21
- warp/fem/space/__init__.py +45 -11
- warp/fem/space/basis_function_space.py +451 -0
- warp/fem/space/basis_space.py +58 -11
- warp/fem/space/function_space.py +146 -5
- warp/fem/space/grid_2d_function_space.py +80 -66
- warp/fem/space/grid_3d_function_space.py +113 -68
- warp/fem/space/hexmesh_function_space.py +96 -108
- warp/fem/space/nanogrid_function_space.py +62 -110
- warp/fem/space/quadmesh_function_space.py +208 -0
- warp/fem/space/shape/__init__.py +45 -7
- warp/fem/space/shape/cube_shape_function.py +328 -54
- warp/fem/space/shape/shape_function.py +10 -1
- warp/fem/space/shape/square_shape_function.py +328 -60
- warp/fem/space/shape/tet_shape_function.py +269 -19
- warp/fem/space/shape/triangle_shape_function.py +238 -19
- warp/fem/space/tetmesh_function_space.py +69 -37
- warp/fem/space/topology.py +38 -0
- warp/fem/space/trimesh_function_space.py +179 -0
- warp/fem/utils.py +6 -331
- warp/jax_experimental.py +3 -1
- warp/native/array.h +55 -40
- warp/native/builtin.h +124 -43
- warp/native/bvh.h +4 -0
- warp/native/coloring.cpp +600 -0
- warp/native/cuda_util.cpp +14 -0
- warp/native/cuda_util.h +2 -1
- warp/native/fabric.h +8 -0
- warp/native/hashgrid.h +4 -0
- warp/native/marching.cu +8 -0
- warp/native/mat.h +14 -3
- warp/native/mathdx.cpp +59 -0
- warp/native/mesh.h +4 -0
- warp/native/range.h +13 -1
- warp/native/reduce.cpp +9 -1
- warp/native/reduce.cu +7 -0
- warp/native/runlength_encode.cpp +9 -1
- warp/native/runlength_encode.cu +7 -1
- warp/native/scan.cpp +8 -0
- warp/native/scan.cu +8 -0
- warp/native/scan.h +8 -1
- warp/native/sparse.cpp +8 -0
- warp/native/sparse.cu +8 -0
- warp/native/temp_buffer.h +7 -0
- warp/native/tile.h +1857 -0
- warp/native/tile_gemm.h +341 -0
- warp/native/tile_reduce.h +210 -0
- warp/native/volume_builder.cu +8 -0
- warp/native/volume_builder.h +8 -0
- warp/native/warp.cpp +10 -2
- warp/native/warp.cu +369 -15
- warp/native/warp.h +12 -2
- warp/optim/adam.py +39 -4
- warp/paddle.py +29 -12
- warp/render/render_opengl.py +137 -65
- warp/sim/graph_coloring.py +292 -0
- warp/sim/integrator_euler.py +4 -2
- warp/sim/integrator_featherstone.py +115 -44
- warp/sim/integrator_vbd.py +6 -0
- warp/sim/model.py +90 -17
- warp/stubs.py +651 -85
- warp/tape.py +12 -7
- warp/tests/assets/pixel.npy +0 -0
- warp/tests/aux_test_instancing_gc.py +18 -0
- warp/tests/test_array.py +207 -48
- warp/tests/test_closest_point_edge_edge.py +8 -8
- warp/tests/test_codegen.py +120 -1
- warp/tests/test_codegen_instancing.py +30 -0
- warp/tests/test_collision.py +110 -0
- warp/tests/test_coloring.py +241 -0
- warp/tests/test_context.py +34 -0
- warp/tests/test_examples.py +18 -4
- warp/tests/test_fabricarray.py +33 -0
- warp/tests/test_fem.py +453 -113
- warp/tests/test_func.py +48 -1
- warp/tests/test_generics.py +52 -0
- warp/tests/test_iter.py +68 -0
- warp/tests/test_mat_scalar_ops.py +1 -1
- warp/tests/test_mesh_query_point.py +5 -4
- warp/tests/test_module_hashing.py +23 -0
- warp/tests/test_paddle.py +27 -87
- warp/tests/test_print.py +191 -1
- warp/tests/test_spatial.py +1 -1
- warp/tests/test_tile.py +700 -0
- warp/tests/test_tile_mathdx.py +144 -0
- warp/tests/test_tile_mlp.py +383 -0
- warp/tests/test_tile_reduce.py +374 -0
- warp/tests/test_tile_shared_memory.py +190 -0
- warp/tests/test_vbd.py +12 -20
- warp/tests/test_volume.py +43 -0
- warp/tests/unittest_suites.py +23 -2
- warp/tests/unittest_utils.py +4 -0
- warp/types.py +339 -73
- warp/utils.py +22 -1
- {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/METADATA +33 -7
- {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/RECORD +159 -132
- {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/WHEEL +1 -1
- warp/fem/field/test.py +0 -180
- warp/fem/field/trial.py +0 -183
- warp/fem/space/collocated_function_space.py +0 -102
- warp/fem/space/quadmesh_2d_function_space.py +0 -261
- warp/fem/space/trimesh_2d_function_space.py +0 -153
- {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.4.1.dist-info → warp_lang-1.5.0.dist-info}/top_level.txt +0 -0
|
Binary file
|
|
@@ -0,0 +1,86 @@
|
|
|
1
|
+
# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
|
3
|
+
# and proprietary rights in and to this software, related documentation
|
|
4
|
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
|
5
|
+
# distribution of this software and related documentation without an express
|
|
6
|
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
|
7
|
+
|
|
8
|
+
import paddle
|
|
9
|
+
|
|
10
|
+
|
|
11
|
+
def eval_springs(x, v, indices, rest, ke, kd, f):
    """Accumulate damped spring forces into ``f`` in place.

    Column 0 and column 1 of ``indices`` select the two rows of ``x``, ``v``
    and ``f`` joined by each spring. Per spring, ``ke * (length - rest)`` and
    ``kd * d(length)/dt`` are summed and applied along the unit spring
    direction: subtracted from the first endpoint's row of ``f`` and added to
    the second endpoint's row.

    Nothing is returned; ``f`` is updated through ``index_add_``.
    """
    first = indices[:, 0]
    second = indices[:, 1]

    # position and velocity of the first endpoint relative to the second
    delta_x = x[first] - x[second]
    delta_v = v[first] - v[second]

    length = paddle.linalg.norm(delta_x, axis=1)
    inv_length = 1.0 / length

    # normalized spring direction
    direction = delta_x * inv_length[:, None]

    stretch = length - rest
    stretch_rate = paddle.sum(direction * delta_v, axis=1)

    # elastic term plus damping based on relative velocity
    magnitude = ke * stretch + kd * stretch_rate
    spring_force = direction * magnitude[:, None]

    f.index_add_(axis=0, index=first, value=-spring_force)
    f.index_add_(axis=0, index=second, value=spring_force)
|
|
38
|
+
|
|
39
|
+
|
|
40
|
+
def integrate_particles(x, v, f, g, w, dt):
    """Advance ``v`` and ``x`` by one semi-implicit Euler step, in place.

    ``f`` is scaled row-wise by ``w`` and ``g`` is applied only to rows whose
    ``w`` is strictly positive. After the update ``f`` is reset to zero by
    in-place multiplication. Nothing is returned.
    """
    # 1 where w > 0, 0 elsewhere, in the dtype of g so the product keeps g's type
    active = (w > 0.0)[:, None].astype(g.dtype)
    external_accel = g * active

    # simple semi-implicit Euler. v1 = v0 + a dt, x1 = x0 + v1 dt
    force_accel = f * w[:, None]
    v += (force_accel + external_accel) * dt
    x += v * dt

    # clear forces
    f *= 0.0
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
class TrIntegrator:
    """Spring-cloth integrator that holds its simulation state as Paddle tensors."""

    def __init__(self, cloth, device):
        """Copy the arrays of ``cloth`` into Paddle tensors placed on ``device``."""
        self.cloth = cloth

        def on_device(values, **kwargs):
            # every tensor built from the cloth description lives on `device`
            return paddle.to_tensor(values, place=device, **kwargs)

        self.positions = on_device(cloth.positions)
        self.velocities = on_device(cloth.velocities)
        self.inv_mass = on_device(cloth.inv_masses)

        self.spring_indices = on_device(cloth.spring_indices, dtype=paddle.int64)
        self.spring_lengths = on_device(cloth.spring_lengths)
        self.spring_stiffness = on_device(cloth.spring_stiffness)
        self.spring_damping = on_device(cloth.spring_damping)

        self.forces = paddle.zeros((cloth.num_particles, 3), dtype=paddle.float32).to(device=device)
        self.gravity = on_device((0.0, -9.8, 0.0), dtype=paddle.float32)

    def simulate(self, dt, substeps):
        """Run ``substeps`` sub-steps of size ``dt / substeps``.

        Returns the final positions copied to the host as a NumPy array.
        """
        sim_dt = dt / substeps

        for _ in range(substeps):
            # eval_springs expects one (i, j) pair per row
            spring_pairs = self.spring_indices.reshape((self.cloth.num_springs, 2))
            eval_springs(
                self.positions,
                self.velocities,
                spring_pairs,
                self.spring_lengths,
                self.spring_stiffness,
                self.spring_damping,
                self.forces,
            )

            # integrate
            integrate_particles(
                self.positions,
                self.velocities,
                self.forces,
                self.gravity,
                self.inv_mass,
                sim_dt,
            )

        return self.positions.cpu().numpy()
|
|
@@ -0,0 +1,121 @@
|
|
|
1
|
+
from itertools import product
|
|
2
|
+
|
|
3
|
+
import numpy as np
|
|
4
|
+
import torch as tc
|
|
5
|
+
|
|
6
|
+
import warp as wp
|
|
7
|
+
|
|
8
|
+
# Keep the PyTorch baseline in full FP32: disable TF32 for both plain matrix
# multiplications and cuDNN operations.
tc.backends.cuda.matmul.allow_tf32 = False
tc.backends.cudnn.allow_tf32 = False

# Initialize Warp, clear its kernel cache, and set the module options used by
# every kernel defined in this file.
wp.init()
wp.clear_kernel_cache()
wp.set_module_options({"fast_math": True, "enable_backward": False})
|
|
14
|
+
|
|
15
|
+
|
|
16
|
+
def create_mlp_kernel(m, n, k):
    """Return a Warp kernel that multiplies ``x`` by ``weights_wp`` tile by tile.

    ``m``, ``n`` and ``k`` are captured as the tile sizes of the generated
    kernel. Each kernel instance accumulates ``n_k`` products of an
    ``m`` x ``k`` tile of ``x`` with a ``k`` x ``n`` tile of ``weights_wp`` and
    stores the resulting ``m`` x ``n`` tile into ``output``.
    """
    TILE_M = m
    TILE_N = n
    TILE_K = k

    @wp.kernel
    def mlp(x: wp.array2d(dtype=float), weights_wp: wp.array2d(dtype=float), n_k: int, output: wp.array2d(dtype=float)):
        # indices of the output tile handled by this instance
        i_m, i_n = wp.tid()

        acc = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)

        for step in range(n_k):
            feat = wp.tile_load(x, i_m, step, TILE_M, TILE_K)
            weight = wp.tile_load(weights_wp, step, i_n, TILE_K, TILE_N)
            # acc += feat * weight
            wp.tile_matmul(feat, weight, acc)

        wp.tile_store(output, i_m, i_n, acc)

    return mlp
|
|
33
|
+
|
|
34
|
+
|
|
35
|
+
def benchmark_torch(A, B, warm_up, iterations):
    """Time ``iterations`` calls of ``tc.matmul(A, B)`` after ``warm_up`` untimed calls.

    Returns the first value that ``wp.ScopedTimer`` recorded under the
    ``"torch"`` key.
    """
    # warm-up
    for _ in range(warm_up):
        tc.matmul(A, B)

    # finish the warm-up work before the timed region starts
    tc.cuda.synchronize()

    timers = {}
    with wp.ScopedTimer("torch", print=False, dict=timers, synchronize=True):
        for _ in range(iterations):
            tc.matmul(A, B)

        tc.cuda.synchronize()

    return timers["torch"][0]
|
|
50
|
+
|
|
51
|
+
|
|
52
|
+
def benchmark_warp(A, B, config, warm_up, iterations):
    """Time the tiled Warp matmul kernel for one tile/block configuration.

    ``config`` is indexed as ``(TILE_M, TILE_N, TILE_K, BLOCK_DIM)``. After
    ``warm_up`` untimed launches (when ``warm_up > 0``) the output is compared
    against ``A.numpy() @ B.numpy()``; then ``iterations`` launches are timed.

    Returns the first value that ``wp.ScopedTimer`` recorded under the
    ``"warp"`` key.
    """
    TILE_M, TILE_N, TILE_K, BLOCK_DIM = config[0], config[1], config[2], config[3]

    mlp = create_mlp_kernel(TILE_M, TILE_N, TILE_K)

    M, K = A.shape[0], A.shape[1]
    N = B.shape[1]

    # launch grid (one entry per output tile) and number of K-tiles per output tile
    grid_m = M // TILE_M
    grid_n = N // TILE_N
    k_tiles = K // TILE_K

    output = wp.zeros((M, N), dtype=float)

    # warm-up
    for _ in range(warm_up):
        wp.launch_tiled(kernel=mlp, dim=[grid_m, grid_n], inputs=[A, B, k_tiles, output], block_dim=BLOCK_DIM)

    # check output
    if warm_up > 0:
        assert np.allclose(output.numpy(), A.numpy() @ B.numpy(), atol=1e-3, rtol=1e-3)

    # benchmark
    timers = {}
    with wp.ScopedTimer("warp", print=False, dict=timers, synchronize=True):
        for _ in range(iterations):
            wp.launch_tiled(kernel=mlp, dim=[grid_m, grid_n], inputs=[A, B, k_tiles, output], block_dim=BLOCK_DIM)

    return timers["warp"][0]
|
|
85
|
+
|
|
86
|
+
|
|
87
|
+
# Candidate tile sizes and block sizes; every combination is benchmarked.
tile_m = [8, 16, 32, 64]
tile_n = [8, 16, 32, 64]
tile_k = [8, 16, 64]
block = [32, 64, 128]

# Problem size: (M x K) @ (K x N)
M = 1024
N = 1024
K = 1024

A = tc.randn(M, K).cuda()
B = tc.randn(K, N).cuda()

iterations = 1000
warm_up = 10

# PyTorch baseline that the Warp timings are reported relative to
time_torch = benchmark_torch(A, B, warm_up, iterations)
print(f"Torch: {time_torch}")

configs = list(product(tile_m, tile_n, tile_k, block))

wp.config.quiet = True

# six left-aligned columns, each 12 characters wide
ROW_TEMPLATE = "{:<12} {:<12} {:<12} {:<12} {:<12} {:<12}"

# header
print(ROW_TEMPLATE.format("TILE_M", "TILE_N", "TILE_K", "BLOCK", "Time", "Relative"))

for c in configs:
    time_warp = benchmark_warp(wp.from_torch(A), wp.from_torch(B), c, warm_up, iterations)
    print(ROW_TEMPLATE.format(c[0], c[1], c[2], c[3], time_warp, time_warp / time_torch))
|
|
@@ -0,0 +1,158 @@
|
|
|
1
|
+
# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
|
3
|
+
# and proprietary rights in and to this software, related documentation
|
|
4
|
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
|
5
|
+
# distribution of this software and related documentation without an express
|
|
6
|
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
|
7
|
+
|
|
8
|
+
import time
|
|
9
|
+
|
|
10
|
+
import paddle
|
|
11
|
+
|
|
12
|
+
import warp as wp
|
|
13
|
+
|
|
14
|
+
|
|
15
|
+
def create_simple_kernel(dtype):
    """Return a ``wp.Kernel`` with an empty body taking five arrays of ``dtype``."""

    def simple_kernel(
        a: wp.array(dtype=dtype),
        b: wp.array(dtype=dtype),
        c: wp.array(dtype=dtype),
        d: wp.array(dtype=dtype),
        e: wp.array(dtype=dtype),
    ):
        pass

    kernel = wp.Kernel(simple_kernel)
    return kernel
|
|
26
|
+
|
|
27
|
+
|
|
28
|
+
def test_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
    """Time ``num_iters`` launches that wrap five Paddle tensors with ``wp.from_paddle``.

    The tensors are zero-filled and created on the Paddle device that
    corresponds to ``device``. When ``warp_dtype`` has a ``_shape_`` attribute
    that shape is appended to ``array_size`` and the scalar type is used for
    the Paddle dtype; when ``warp_dtype`` is ``None`` the tensors are float32.
    The elapsed wall-clock time is printed in milliseconds.
    """
    paddle_device = wp.device_to_paddle(wp.get_device(device))

    if hasattr(warp_dtype, "_shape_"):
        paddle_shape = (array_size, *warp_dtype._shape_)
        paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
    elif warp_dtype is None:
        paddle_shape = (array_size,)
        paddle_dtype = paddle.float32
    else:
        paddle_shape = (array_size,)
        paddle_dtype = wp.dtype_to_paddle(warp_dtype)

    tensors = [paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device) for _ in range(5)]

    wp.synchronize()

    start_ns = time.time_ns()

    for _ in range(num_iters):
        arrays = [wp.from_paddle(tensor, dtype=warp_dtype) for tensor in tensors]
        wp.launch(kernel, dim=array_size, inputs=arrays)

    stop_ns = time.time_ns()

    elapsed_ms = (stop_ns - start_ns) / 1_000_000
    print(f"{elapsed_ms:8.0f} ms from_paddle(...)")
|
|
65
|
+
|
|
66
|
+
|
|
67
|
+
def test_array_ctype_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
    """Time ``num_iters`` launches using ``wp.from_paddle(..., return_ctype=True)``.

    Five zero-filled Paddle tensors are created on the Paddle device that
    corresponds to ``device``; shape and dtype are derived from ``warp_dtype``
    (float32 when it is ``None``). Each iteration converts all five tensors
    with ``return_ctype=True`` and launches ``kernel`` on them. The elapsed
    wall-clock time is printed in milliseconds.
    """
    paddle_device = wp.device_to_paddle(wp.get_device(device))

    if hasattr(warp_dtype, "_shape_"):
        paddle_shape = (array_size, *warp_dtype._shape_)
        paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
    elif warp_dtype is None:
        paddle_shape = (array_size,)
        paddle_dtype = paddle.float32
    else:
        paddle_shape = (array_size,)
        paddle_dtype = wp.dtype_to_paddle(warp_dtype)

    tensors = [paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device) for _ in range(5)]

    wp.synchronize()

    start_ns = time.time_ns()

    for _ in range(num_iters):
        ctypes_arrays = [wp.from_paddle(tensor, dtype=warp_dtype, return_ctype=True) for tensor in tensors]
        wp.launch(kernel, dim=array_size, inputs=ctypes_arrays)

    stop_ns = time.time_ns()

    elapsed_ms = (stop_ns - start_ns) / 1_000_000
    print(f"{elapsed_ms:8.0f} ms from_paddle(..., return_ctype=True)")
|
|
104
|
+
|
|
105
|
+
|
|
106
|
+
def test_direct_from_paddle(kernel, num_iters, array_size, device, warp_dtype=None):
    """Time ``num_iters`` launches that pass Paddle tensors straight to ``wp.launch``.

    Five zero-filled Paddle tensors are created on the Paddle device that
    corresponds to ``device``; shape and dtype are derived from ``warp_dtype``
    (float32 when it is ``None``). No explicit ``wp.from_paddle`` call is made
    inside the timed loop. The elapsed wall-clock time is printed in
    milliseconds.
    """
    paddle_device = wp.device_to_paddle(wp.get_device(device))

    if hasattr(warp_dtype, "_shape_"):
        paddle_shape = (array_size, *warp_dtype._shape_)
        paddle_dtype = wp.dtype_to_paddle(warp_dtype._wp_scalar_type_)
    elif warp_dtype is None:
        paddle_shape = (array_size,)
        paddle_dtype = paddle.float32
    else:
        paddle_shape = (array_size,)
        paddle_dtype = wp.dtype_to_paddle(warp_dtype)

    tensors = [paddle.zeros(paddle_shape, dtype=paddle_dtype).to(device=paddle_device) for _ in range(5)]

    wp.synchronize()

    start_ns = time.time_ns()

    for _ in range(num_iters):
        # a fresh list per launch, holding the same five tensors
        wp.launch(kernel, dim=array_size, inputs=list(tensors))

    stop_ns = time.time_ns()

    elapsed_ms = (stop_ns - start_ns) / 1_000_000
    print(f"{elapsed_ms:8.0f} ms direct from paddle")
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
wp.init()

# (warp_dtype arg, kernel): the first entry passes no dtype and reuses the
# float32 kernel; the rest pass the same dtype the kernel was built for.
params = [(None, create_simple_kernel(wp.float32))]
params += [(dtype, create_simple_kernel(dtype)) for dtype in (wp.float32, wp.vec3f, wp.mat22f)]

wp.load_module()

num_iters = 100000

# the three interop paths, run in this order for every dtype
benchmarks = (test_from_paddle, test_array_ctype_from_paddle, test_direct_from_paddle)

for warp_dtype, kernel in params:
    print(f"\ndtype={wp.context.type_str(warp_dtype)}")
    for run_benchmark in benchmarks:
        run_benchmark(kernel, num_iters, 10, "cuda:0", warp_dtype=warp_dtype)
|
|
@@ -0,0 +1,179 @@
|
|
|
1
|
+
# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
|
3
|
+
# and proprietary rights in and to this software, related documentation
|
|
4
|
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
|
5
|
+
# distribution of this software and related documentation without an express
|
|
6
|
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
|
7
|
+
|
|
8
|
+
import numpy as np
|
|
9
|
+
import torch
|
|
10
|
+
|
|
11
|
+
import warp as wp
|
|
12
|
+
|
|
13
|
+
# Initialize Warp, set the module options used by the kernels below, and make
# "cuda:0" the default device.
wp.init()
wp.set_module_options({"enable_backward": False, "fast_math": True})
wp.set_device("cuda:0")

# Clear the kernel cache before the kernels in this file are used.
wp.build.clear_kernel_cache()
|
|
18
|
+
|
|
19
|
+
|
|
20
|
+
# Matrix multiply with one kernel instance per output element:
# C[i, j] = sum over k of A[i, k] * B[k, j].
@wp.kernel
def gemm(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
    # output index
    row, col = wp.tid()

    acc = float(0.0)

    for k in range(A.shape[1]):
        acc += A[row, k] * B[k, col]

    C[row, col] = acc
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
# Tile sizes used by gemm_tiled: each output tile is TILE_M x TILE_N and the
# inner dimension is walked in steps of TILE_K.
TILE_M = wp.constant(64)
TILE_N = wp.constant(64)
TILE_K = wp.constant(8)
|
|
36
|
+
|
|
37
|
+
|
|
38
|
+
# Tiled matrix multiply: each kernel instance produces one TILE_M x TILE_N tile
# of C by accumulating products of TILE_M x TILE_K and TILE_K x TILE_N tiles.
@wp.kernel
def gemm_tiled(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float), C: wp.array2d(dtype=float)):
    # output tile index
    i, j = wp.tid()

    acc = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)

    K = A.shape[1]

    # TODO: code-gen bug if you use a constant before passing it to a kwd arg (in this case TILE_K)
    # the literal 8 below therefore has to be kept equal to TILE_K by hand
    count = int(K / 8)

    for k in range(count):
        a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
        b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)

        # acc += a*b
        wp.tile_matmul(a, b, acc)

    wp.tile_store(C, i, j, acc)
|
|
59
|
+
|
|
60
|
+
|
|
61
|
+
def benchmark_numpy(A, B, C):
    """Time 10 evaluations of ``A @ B`` after 10 untimed warm-up evaluations.

    ``C`` is accepted for signature parity with the other benchmarks and is
    not used. Returns the smallest value that ``wp.ScopedTimer`` recorded
    under the ``"NumPy"`` key.
    """
    iters = 10
    timers = {}

    # warm up
    for _ in range(10):
        _product = A @ B

    with wp.ScopedTimer("NumPy", dict=timers):
        for _ in range(iters):
            _product = A @ B

    return min(timers["NumPy"])
|
|
74
|
+
|
|
75
|
+
|
|
76
|
+
def benchmark_warp_simt(A, B, C):
    """Time 10 launches of the per-element ``gemm`` kernel after 10 warm-up launches.

    ``A``, ``B`` and ``C`` are copied into Warp arrays. The launch grid has one
    kernel instance per element of ``C`` and is taken from ``C.shape``, so the
    function does not depend on module-level ``M``/``N`` being defined and in
    sync with the arguments.

    Returns the smallest value that ``wp.ScopedTimer`` recorded under the
    ``"Warp (SIMT)"`` key.
    """
    timers = {}
    iters = 10

    A_wp = wp.array(A)
    B_wp = wp.array(B)
    C_wp = wp.array(C)

    # one kernel instance per output element
    launch_dim = (C.shape[0], C.shape[1])

    # warm up
    for _ in range(10):
        wp.launch(gemm, dim=launch_dim, inputs=[A_wp, B_wp, C_wp])

    with wp.ScopedTimer("Warp (SIMT)", dict=timers, print=False, synchronize=True):
        for _ in range(iters):
            wp.launch(gemm, dim=launch_dim, inputs=[A_wp, B_wp, C_wp])

    return min(timers["Warp (SIMT)"])
|
|
93
|
+
|
|
94
|
+
|
|
95
|
+
def benchmark_warp_tiled(A, B, C):
    """Time a captured CUDA graph holding 10 launches of ``gemm_tiled``.

    ``A``, ``B`` and ``C`` are copied into Warp arrays. The launches are
    recorded into a graph first, and only the replay of that graph is timed.

    The tile grid is taken from ``C.shape`` (one ``TILE_M`` x ``TILE_N`` tile
    per block) rather than from module-level ``M``/``N``. The kernel is
    launched through ``wp.launch_tiled(..., block_dim=...)``, the same form
    benchmark_gemm.py uses for its tile kernel.

    Returns the smallest value that ``wp.ScopedTimer`` recorded under the
    ``"Warp (Tiled)"`` key.
    """
    timers = {}
    iters = 10

    # must match with the tile_matmul() partition size
    SUB_TILE_M = 4
    SUB_TILE_N = 4

    # threads per block cooperating on one output tile
    num_threads = int(TILE_M / SUB_TILE_M) * int(TILE_N / SUB_TILE_N)

    A_wp = wp.array(A)
    B_wp = wp.array(B)
    C_wp = wp.array(C)

    # number of output tiles along each dimension of C
    tiles_m = int(C.shape[0] / TILE_M)
    tiles_n = int(C.shape[1] / TILE_N)

    # Record the launches into a graph; ScopedCapture ends the capture even if
    # a launch raises, so the device is never left in capture mode.
    with wp.ScopedCapture() as capture:
        for _ in range(iters):
            wp.launch_tiled(
                gemm_tiled,
                dim=[tiles_m, tiles_n],
                inputs=[A_wp, B_wp, C_wp],
                block_dim=num_threads,
            )

    # time a single replay of the graph, i.e. `iters` kernel launches
    with wp.ScopedTimer("Warp (Tiled)", dict=timers, print=False, synchronize=True):
        wp.capture_launch(capture.graph)

    return min(timers["Warp (Tiled)"])
|
|
122
|
+
|
|
123
|
+
|
|
124
|
+
def benchmark_torch(A, B, C):
    """Time 10 calls of ``torch.matmul`` on ``"cuda:0"`` after 10 warm-up calls.

    The NumPy inputs are copied to ``"cuda:0"``. The warm-up calls write into
    the copy of ``C``; the timed calls do not pass ``out``. Returns the
    smallest value that ``wp.ScopedTimer`` recorded under the ``"Torch"`` key.
    """
    cuda_device = "cuda:0"
    A_tc = torch.from_numpy(A).to(cuda_device)
    B_tc = torch.from_numpy(B).to(cuda_device)
    C_tc = torch.from_numpy(C).to(cuda_device)

    # warm-up
    for _ in range(10):
        torch.matmul(A_tc, B_tc, out=C_tc)

    iters = 10
    timers = {}

    # finish the warm-up work before the timed region starts
    torch.cuda.synchronize()

    with wp.ScopedTimer("Torch", dict=timers, print=False):
        for _ in range(iters):
            torch.matmul(A_tc, B_tc)

        # wait for the queued work so it is included in the timing
        torch.cuda.synchronize()

    return min(timers["Torch"])
|
|
145
|
+
|
|
146
|
+
|
|
147
|
+
results_torch = []
results_warp_simt = []
results_warp_tiled = []

HEADER_TEMPLATE = "{:>8s} {:>8s} {:>8s} {:>8s} {:>8s} {:>8s}"
ROW_TEMPLATE = "{:>8d} {:>8d} {:>8d} {:>8f} {:>8f} {:>8f}"

print(HEADER_TEMPLATE.format("M", "N", "K", "Torch", "Warp (SIMT)", "Warp (Tiled)"))
print("--------------------------------------------------------")

# square problems from 256 to 4096 in steps of 128
for i in range(2, 33):
    # M, N and K stay module-level names: the Warp benchmark functions read them
    M = N = K = i * 128

    # re-seeded every iteration, so each size starts from the same generator state
    rng = np.random.default_rng(42)

    A = rng.random((M, K), dtype=np.float32)
    B = rng.random((K, N), dtype=np.float32)
    C = np.zeros((M, N), dtype=np.float32)

    results_torch.append(benchmark_torch(A, B, C))
    # the SIMT benchmark is not run; 0.0 is recorded as a placeholder
    results_warp_simt.append(0.0)
    results_warp_tiled.append(benchmark_warp_tiled(A, B, C))

    print(ROW_TEMPLATE.format(M, N, K, results_torch[-1], results_warp_simt[-1], results_warp_tiled[-1]))
|
|
@@ -199,9 +199,10 @@ class Example:
|
|
|
199
199
|
|
|
200
200
|
# creates a grid of particles
|
|
201
201
|
def particle_grid(self, dim_x, dim_y, dim_z, lower, radius, jitter):
|
|
202
|
+
rng = np.random.default_rng(42)
|
|
202
203
|
points = np.meshgrid(np.linspace(0, dim_x, dim_x), np.linspace(0, dim_y, dim_y), np.linspace(0, dim_z, dim_z))
|
|
203
204
|
points_t = np.array((points[0], points[1], points[2])).T * radius * 2.0 + np.array(lower)
|
|
204
|
-
points_t = points_t +
|
|
205
|
+
points_t = points_t + rng.random(size=points_t.shape) * radius * jitter
|
|
205
206
|
|
|
206
207
|
return points_t.reshape((-1, 3))
|
|
207
208
|
|
|
@@ -98,11 +98,11 @@ class Example:
|
|
|
98
98
|
|
|
99
99
|
for _ in range(self.query_count):
|
|
100
100
|
# random offset
|
|
101
|
-
p = wp.vec3(rng.random(3) * 0.5 - 0.5) * 5.0
|
|
101
|
+
p = wp.vec3(rng.random(size=3) * 0.5 - 0.5) * 5.0
|
|
102
102
|
|
|
103
103
|
# random orientation
|
|
104
|
-
axis = wp.normalize(wp.vec3(rng.random(3) * 0.5 - 0.5))
|
|
105
|
-
angle =
|
|
104
|
+
axis = wp.normalize(wp.vec3(rng.random(size=3) * 0.5 - 0.5))
|
|
105
|
+
angle = rng.random()
|
|
106
106
|
|
|
107
107
|
q = wp.quat_from_axis_angle(wp.normalize(axis), angle)
|
|
108
108
|
|
|
@@ -56,7 +56,7 @@ def mass_form(
|
|
|
56
56
|
u: fem.Field,
|
|
57
57
|
v: fem.Field,
|
|
58
58
|
):
|
|
59
|
-
return u(s)
|
|
59
|
+
return fem.linalg.generalized_inner(u(s), v(s))
|
|
60
60
|
|
|
61
61
|
|
|
62
62
|
@fem.integrand
|
|
@@ -86,9 +86,12 @@ def pressure_anomaly_field(s: fem.Sample, domain: fem.Domain, pressure: fem.Fiel
|
|
|
86
86
|
|
|
87
87
|
|
|
88
88
|
class Example:
|
|
89
|
-
def __init__(
|
|
89
|
+
def __init__(
|
|
90
|
+
self, quiet=False, degree=2, div_conforming=False, base_resolution=8, level_count=4, headless: bool = False
|
|
91
|
+
):
|
|
90
92
|
self._quiet = quiet
|
|
91
93
|
self._degree = degree
|
|
94
|
+
self._div_conforming = div_conforming
|
|
92
95
|
|
|
93
96
|
# Start from a coarse, dense grid
|
|
94
97
|
res = wp.vec3i(2 * base_resolution, base_resolution // 2, base_resolution)
|
|
@@ -110,9 +113,13 @@ class Example:
|
|
|
110
113
|
sim_vol, level_count, refinement_field=refinement, grading="face"
|
|
111
114
|
)
|
|
112
115
|
|
|
113
|
-
# Function spaces for velocity,
|
|
114
|
-
|
|
115
|
-
|
|
116
|
+
# Function spaces for velocity, pressure (RTk / Pk-1 or Pk / Pk-1)
|
|
117
|
+
u_space = fem.make_polynomial_space(
|
|
118
|
+
geo=self._geo,
|
|
119
|
+
element_basis=fem.ElementBasis.RAVIART_THOMAS if div_conforming else None,
|
|
120
|
+
degree=self._degree,
|
|
121
|
+
dtype=wp.vec3,
|
|
122
|
+
)
|
|
116
123
|
p_space = fem.make_polynomial_space(geo=self._geo, degree=self._degree - 1, dtype=float)
|
|
117
124
|
|
|
118
125
|
self.pressure_field = p_space.make_field()
|
|
@@ -137,7 +144,17 @@ class Example:
|
|
|
137
144
|
def render(self):
|
|
138
145
|
# self.renderer.add_field("solution", self.pressure_field)
|
|
139
146
|
self.plot.add_field("pressure_anomaly", self.pressure_anomaly_field)
|
|
140
|
-
|
|
147
|
+
|
|
148
|
+
if self._div_conforming:
|
|
149
|
+
# If using H(div)-conforming elements, interpolate to continuous space
|
|
150
|
+
velocity_field_lagrange = fem.make_polynomial_space(
|
|
151
|
+
self.velocity_field.geometry, dtype=wp.vec3, degree=self._degree
|
|
152
|
+
).make_field()
|
|
153
|
+
fem.interpolate(self.velocity_field, dest=velocity_field_lagrange)
|
|
154
|
+
else:
|
|
155
|
+
velocity_field_lagrange = self.velocity_field
|
|
156
|
+
|
|
157
|
+
self.plot.add_field("velocity", velocity_field_lagrange)
|
|
141
158
|
|
|
142
159
|
def step(self):
|
|
143
160
|
u_space = self.velocity_field.space
|
|
@@ -153,9 +170,14 @@ class Example:
|
|
|
153
170
|
fem.normalize_dirichlet_projector(dirichlet_projector)
|
|
154
171
|
|
|
155
172
|
# (Diagonal) mass matrix
|
|
156
|
-
|
|
157
|
-
|
|
158
|
-
|
|
173
|
+
if self._div_conforming:
|
|
174
|
+
rho_test = fem.make_test(u_space)
|
|
175
|
+
rho_trial = fem.make_trial(u_space)
|
|
176
|
+
else:
|
|
177
|
+
rho_space = fem.make_polynomial_space(geo=u_space.geometry, degree=self._degree)
|
|
178
|
+
rho_test = fem.make_test(rho_space)
|
|
179
|
+
rho_trial = fem.make_trial(rho_space)
|
|
180
|
+
|
|
159
181
|
inv_mass_matrix = fem.integrate(
|
|
160
182
|
mass_form, fields={"u": rho_trial, "v": rho_test}, nodal=True, output_dtype=float
|
|
161
183
|
)
|
|
@@ -177,6 +199,7 @@ class Example:
|
|
|
177
199
|
side_divergence_form,
|
|
178
200
|
fields={"u": u_side_trial, "psi": p_side_test},
|
|
179
201
|
output_dtype=float,
|
|
202
|
+
assembly="generic", # not required, for test coverage purposes
|
|
180
203
|
)
|
|
181
204
|
|
|
182
205
|
# Solve incompressibility
|
|
@@ -204,7 +227,10 @@ if __name__ == "__main__":
|
|
|
204
227
|
parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
|
|
205
228
|
parser.add_argument("--device", type=str, default=None, help="Override the default Warp device.")
|
|
206
229
|
parser.add_argument("--resolution", type=int, default=8, help="Grid resolution.")
|
|
207
|
-
parser.add_argument("--degree", type=int, default=
|
|
230
|
+
parser.add_argument("--degree", type=int, default=1, help="Polynomial degree of shape functions.")
|
|
231
|
+
parser.add_argument(
|
|
232
|
+
"--div_conforming", action="store_true", default=False, help="Use H(div)-conforming function space"
|
|
233
|
+
)
|
|
208
234
|
parser.add_argument("--level_count", type=int, default=4, help="Number of refinement levels.")
|
|
209
235
|
parser.add_argument(
|
|
210
236
|
"--headless",
|
|
@@ -219,6 +245,7 @@ if __name__ == "__main__":
|
|
|
219
245
|
example = Example(
|
|
220
246
|
quiet=args.quiet,
|
|
221
247
|
degree=args.degree,
|
|
248
|
+
div_conforming=args.div_conforming,
|
|
222
249
|
base_resolution=args.resolution,
|
|
223
250
|
level_count=args.level_count,
|
|
224
251
|
headless=args.headless,
|