warp-lang 1.7.0__py3-none-macosx_10_13_universal2.whl → 1.7.2rc1__py3-none-macosx_10_13_universal2.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/autograd.py +12 -2
- warp/bin/libwarp-clang.dylib +0 -0
- warp/bin/libwarp.dylib +0 -0
- warp/build.py +1 -1
- warp/builtins.py +103 -66
- warp/codegen.py +48 -27
- warp/config.py +1 -1
- warp/context.py +112 -49
- warp/examples/benchmarks/benchmark_cloth.py +1 -1
- warp/examples/distributed/example_jacobi_mpi.py +507 -0
- warp/fem/cache.py +1 -1
- warp/fem/field/field.py +11 -1
- warp/fem/field/nodal_field.py +36 -22
- warp/fem/geometry/adaptive_nanogrid.py +7 -3
- warp/fem/geometry/trimesh.py +4 -12
- warp/jax_experimental/custom_call.py +14 -2
- warp/jax_experimental/ffi.py +100 -67
- warp/native/builtin.h +91 -65
- warp/native/svd.h +59 -49
- warp/native/tile.h +55 -26
- warp/native/volume.cpp +2 -2
- warp/native/volume_builder.cu +33 -22
- warp/native/warp.cu +1 -1
- warp/render/render_opengl.py +41 -34
- warp/render/render_usd.py +96 -6
- warp/sim/collide.py +11 -9
- warp/sim/inertia.py +189 -156
- warp/sim/integrator_euler.py +3 -0
- warp/sim/integrator_xpbd.py +3 -0
- warp/sim/model.py +56 -31
- warp/sim/render.py +4 -0
- warp/sparse.py +1 -1
- warp/stubs.py +73 -25
- warp/tests/assets/torus.usda +1 -1
- warp/tests/cuda/test_streams.py +1 -1
- warp/tests/sim/test_collision.py +237 -206
- warp/tests/sim/test_inertia.py +161 -0
- warp/tests/sim/test_model.py +5 -3
- warp/tests/sim/{flaky_test_sim_grad.py → test_sim_grad.py} +1 -4
- warp/tests/sim/test_xpbd.py +399 -0
- warp/tests/test_array.py +8 -7
- warp/tests/test_atomic.py +181 -2
- warp/tests/test_builtins_resolution.py +38 -38
- warp/tests/test_codegen.py +24 -3
- warp/tests/test_examples.py +16 -6
- warp/tests/test_fem.py +93 -14
- warp/tests/test_func.py +1 -1
- warp/tests/test_mat.py +416 -119
- warp/tests/test_quat.py +321 -137
- warp/tests/test_struct.py +116 -0
- warp/tests/test_vec.py +320 -174
- warp/tests/tile/test_tile.py +27 -0
- warp/tests/tile/test_tile_load.py +124 -0
- warp/tests/unittest_suites.py +2 -5
- warp/types.py +107 -9
- {warp_lang-1.7.0.dist-info → warp_lang-1.7.2rc1.dist-info}/METADATA +41 -19
- {warp_lang-1.7.0.dist-info → warp_lang-1.7.2rc1.dist-info}/RECORD +60 -57
- {warp_lang-1.7.0.dist-info → warp_lang-1.7.2rc1.dist-info}/WHEEL +1 -1
- {warp_lang-1.7.0.dist-info → warp_lang-1.7.2rc1.dist-info}/licenses/LICENSE.md +0 -26
- {warp_lang-1.7.0.dist-info → warp_lang-1.7.2rc1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,507 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
"""An example implementation of a distributed Jacobi solver using MPI.
|
|
16
|
+
|
|
17
|
+
This example shows how to solve the Laplace equation using Jacobi iteration on
|
|
18
|
+
multiple GPUs using Warp and mpi4py. This example is based on the basic "mpi"
|
|
19
|
+
example from the Multi GPU Programming Models repository.
|
|
20
|
+
|
|
21
|
+
This example requires mpi4py and a CUDA-aware MPI implementation. We suggest
|
|
22
|
+
downloading and installing NVIDIA HPC-X, followed by installing mpi4py from its
|
|
23
|
+
source distribution: python -m pip install mpi4py
|
|
24
|
+
|
|
25
|
+
Usage:
|
|
26
|
+
mpirun -n 2 python example_jacobi_mpi.py
|
|
27
|
+
|
|
28
|
+
References:
|
|
29
|
+
https://github.com/NVIDIA/multi-gpu-programming-models
|
|
30
|
+
https://developer.nvidia.com/networking/hpc-x
|
|
31
|
+
https://github.com/mpi4py/mpi4py
|
|
32
|
+
"""
|
|
33
|
+
|
|
34
|
+
import math
|
|
35
|
+
import sys
|
|
36
|
+
from typing import Tuple
|
|
37
|
+
|
|
38
|
+
import numpy as np
|
|
39
|
+
from mpi4py import MPI
|
|
40
|
+
|
|
41
|
+
import warp as wp
|
|
42
|
+
import warp.context
|
|
43
|
+
from warp.types import warp_type_to_np_dtype
|
|
44
|
+
|
|
45
|
+
wp.config.quiet = True  # Suppress wp.init() output

# Convergence threshold: the solver loops keep iterating while the L2 norm of the
# update is above this value; it is also passed as the comparison tolerance
# when the multi-GPU result is checked against the single-GPU reference.
tol = 1e-8
wptype = wp.float32  # Global precision setting, can set wp.float64 here for double precision
# Pi converted once to the selected precision type; read by the boundary-initialization kernel.
# NOTE(review): the issue reference below presumably explains why the conversion happens here — TODO confirm.
pi = wptype(math.pi)  # GitHub #485
|
|
51
|
+
|
|
52
|
+
|
|
53
|
+
def calc_default_device(mpi_comm: "MPI.Comm") -> warp.context.Device:
    """Pick the Warp device that the calling MPI rank should run on.

    Ranks that share a node are spread over that node's CUDA devices by
    indexing the devices with the rank's node-local rank. When at most one
    CUDA device is visible, Warp's current default device is returned instead.

    Args:
        mpi_comm: The MPI communicator.

    Returns:
        The Warp device that should be used for the current rank.

    Raises:
        RuntimeError: If more than one device is visible but there are fewer
            visible devices than ranks on the node.
    """
    # Sub-communicator used to obtain the node-local size and rank
    node_comm = mpi_comm.Split_type(MPI.COMM_TYPE_SHARED)
    ranks_on_node = node_comm.Get_size()
    rank_on_node = node_comm.Get_rank()

    device_count = warp.get_cuda_device_count()

    # Zero or one CUDA device: nothing to distribute, use the default device
    if device_count <= 1:
        return warp.get_device()

    if device_count < ranks_on_node:
        raise RuntimeError(
            f"Number of visible devices ({device_count}) is less than number of ranks on the node ({ranks_on_node})"
        )

    # One device per rank, selected by the node-local rank
    return warp.get_cuda_device(rank_on_node)
|
|
87
|
+
|
|
88
|
+
|
|
89
|
+
def calc_decomp_1d(total_points: int, rank: int, total_ranks: int) -> Tuple[int, int]:
    """Split ``total_points`` into ``total_ranks`` contiguous 1-D domains.

    The result is a ``(start_index, num_points)`` tuple describing the domain
    that belongs to ``rank``.

    When ``total_points`` is not a multiple of ``total_ranks``, the first
    ``total_points % total_ranks`` domains each receive one extra point.
    """
    base_size, num_larger_domains = divmod(total_points, total_ranks)

    if rank < num_larger_domains:
        # Every domain before this one is also a larger one
        domain_size = base_size + 1
        return (rank * domain_size, domain_size)

    # Count backwards from the end: this domain and all later ones have the base size
    trailing_domains = total_ranks - rank
    return (total_points - trailing_domains * base_size, base_size)
|
|
108
|
+
|
|
109
|
+
|
|
110
|
+
@wp.kernel
def jacobi_update(
    a: wp.array2d(dtype=wptype),
    iy_start: int,
    iy_end: int,
    nx: int,
    calculate_norm: bool,
    a_new: wp.array2d(dtype=wptype),
    l2_norm: wp.array(dtype=wptype),
):
    # One Jacobi sweep: each in-range thread writes the average of the four
    # neighbors of one point of ``a`` into ``a_new``. When ``calculate_norm`` is
    # set, the squared change of each updated point is summed and added
    # atomically into ``l2_norm``.
    i, j = wp.tid()

    # Convert from local thread indices to the indices used to access the arrays

    iy = i + iy_start  # Row: thread row 0 maps to the first row to update
    ix = j + 1  # Column: shifted by one so that column 0 is never written

    # Threads that fail the bounds check below contribute zero to the reduction
    local_l2_norm = wptype(0.0)

    if iy < iy_end and ix < nx - 1:
        new_val = wptype(0.25) * (a[iy - 1, ix] + a[iy + 1, ix] + a[iy, ix - 1] + a[iy, ix + 1])
        a_new[iy, ix] = new_val

        if calculate_norm:
            residue = new_val - a[iy, ix]
            local_l2_norm = residue * residue

    # NOTE(review): the tile reduction is placed outside the bounds check, presumably so
    # that every thread of the block takes part in it — confirm against the Warp tile docs.
    if calculate_norm:
        t = wp.tile(local_l2_norm)
        s = wp.tile_sum(t)
        wp.tile_atomic_add(l2_norm, s)
|
|
141
|
+
|
|
142
|
+
|
|
143
|
+
@wp.kernel
def initialize_boundaries(
    nx: int,
    ny: int,
    offset: int,
    a: wp.array2d(dtype=wptype),
    a_new: wp.array2d(dtype=wptype),
):
    # Writes the same sine profile into the first and last column of both
    # arrays. ``offset`` is added to the thread index before evaluating the
    # profile, and ``ny - 1`` is the period of the sine in rows.
    row = wp.tid()

    angle = wptype(2.0) * pi * wptype(row + offset) / wptype(ny - 1)
    edge_value = wp.sin(angle)

    last_col = nx - 1

    # Left and right columns of the current-iterate array
    a[row, 0] = edge_value
    a[row, last_col] = edge_value

    # Same columns of the next-iterate array
    a_new[row, 0] = edge_value
    a_new[row, last_col] = edge_value
|
|
159
|
+
|
|
160
|
+
|
|
161
|
+
def benchmark_single_gpu(nx: int, ny: int, iter_max: int, nccheck: int = 1, verbose: bool = False):
    """Compute the solution on a single GPU for performance and correctness comparisons.

    The loop stops as soon as the most recently computed L2 norm is no longer
    above the module-level ``tol`` or ``iter_max`` iterations have been run.

    Args:
        nx: The number of points in the x-direction.
        ny: The number of points in the y-direction.
        iter_max: The maximum number of Jacobi iterations.
        nccheck: The number of iterations between norm checks. Defaults to 1.
        verbose: Whether to print verbose output. Defaults to False.

    Returns:
        tuple: A tuple containing:
            - runtime (float): The execution time of the solution in seconds.
            - solution (warp.array2d): The solution as a Warp array on the host
              with dimensions ``(ny, nx)``.
    """

    # Current and next iterate
    a = wp.zeros((ny, nx), dtype=wptype)
    a_new = wp.zeros_like(a)

    # Squared-norm accumulator on the device and its pinned host-side copy
    l2_norm_d = wp.zeros((1,), dtype=wptype)
    l2_norm_h = wp.ones_like(l2_norm_d, device="cpu", pinned=True)

    # One stream for the Jacobi kernel and one for each of the two boundary-row copies
    compute_stream = wp.Stream()
    push_top_stream = wp.Stream()
    push_bottom_stream = wp.Stream()

    # Events used to order work between the three streams
    compute_done = wp.Event()
    push_top_done = wp.Event()
    push_bottom_done = wp.Event()

    # Rows 0 and ny - 1 are not updated by the kernel; they are filled by the copies below
    iy_start = 1
    iy_end = ny - 1
    update_shape = (iy_end - iy_start, nx - 2)

    wp.launch(initialize_boundaries, dim=(ny,), inputs=[nx, ny, 0], outputs=[a, a_new])

    if verbose:
        print(
            f"Single GPU jacobi relaxation: {iter_max} iterations on {ny} x {nx} mesh with norm check every {nccheck}"
            " iterations"
        )

    iter = 0
    l2_norm = 1.0

    start_time = MPI.Wtime()

    while l2_norm > tol and iter < iter_max:
        # The norm is computed every ``nccheck`` iterations and additionally every 100th iteration
        calculate_norm = (iter % nccheck == 0) or (iter % 100 == 0)

        with wp.ScopedStream(compute_stream):
            l2_norm_d.zero_()

            # Do not start the next sweep before the previous iteration's boundary-row copies are done
            compute_stream.wait_event(push_top_done)
            compute_stream.wait_event(push_bottom_done)

            wp.launch(
                jacobi_update,
                update_shape,
                inputs=[a, iy_start, iy_end, nx, calculate_norm],
                outputs=[a_new, l2_norm_d],
            )
            wp.record_event(compute_done)

        if calculate_norm:
            # Queued on the compute stream, i.e. after the kernel that produced the norm
            wp.copy(l2_norm_h, l2_norm_d, stream=compute_stream)

        # Apply periodic boundary conditions
        # Row 0 receives the last updated row; the copy waits for the kernel to finish
        push_top_stream.wait_event(compute_done)
        wp.copy(a_new[0], a_new[iy_end - 1], stream=push_top_stream)
        push_top_stream.record_event(push_top_done)

        # Row iy_end receives the first updated row
        push_bottom_stream.wait_event(compute_done)
        wp.copy(a_new[iy_end], a_new[iy_start], stream=push_bottom_stream)
        push_bottom_stream.record_event(push_bottom_done)

        if calculate_norm:
            # Wait for the compute stream so the host copy of the norm is ready to read
            wp.synchronize_stream(compute_stream)

            l2_norm = math.sqrt(l2_norm_h.numpy()[0])

            if verbose and iter % 100 == 0:
                print(f"{iter:5d}, {l2_norm:.6f}")

        # Swap arrays
        a, a_new = a_new, a

        iter += 1

    # Include all outstanding device work in the measured time
    wp.synchronize_device()
    stop_time = MPI.Wtime()

    # Copy the final iterate to a host array for later comparison
    a_ref_h = wp.empty((ny, nx), dtype=wptype, device="cpu")
    wp.copy(a_ref_h, a)

    return stop_time - start_time, a_ref_h
|
|
258
|
+
|
|
259
|
+
|
|
260
|
+
class Example:
    """Distributed Jacobi solver in which each MPI rank owns a contiguous band of rows.

    Every rank stores its owned rows plus one halo row above and one below.
    The halo rows are refreshed after each iteration by exchanging rows with the
    neighboring ranks; the neighbor ranks wrap around, which makes the
    y-boundaries periodic.
    """

    def __init__(
        self,
        nx: int = 16384,
        ny: int = 16384,
        iter_max: int = 1000,
        nccheck: int = 1,
        csv: bool = False,
    ):
        """Set up devices, compute the single-GPU reference and allocate the local arrays.

        Args:
            nx: Global number of points in the x-direction.
            ny: Global number of points in the y-direction.
            iter_max: The maximum number of Jacobi iterations.
            nccheck: The number of iterations between norm checks.
            csv: If ``True``, progress output is suppressed and the final
                timings are printed as a single CSV line.
        """
        self.iter_max = iter_max
        self.nx = nx  # Global resolution
        self.ny = ny  # Global resolution
        self.nccheck = nccheck
        self.csv = csv

        self.mpi_comm = MPI.COMM_WORLD
        self.mpi_rank = self.mpi_comm.Get_rank()
        self.mpi_size = self.mpi_comm.Get_size()

        # Set the default device on the current rank
        self.device = calc_default_device(self.mpi_comm)
        wp.set_device(self.device)

        # We need to disable memory pools for peer-to-peer transfers using MPI
        # NOTE(review): the call below is commented out — confirm whether it is still required.
        # wp.set_mempool_enabled(wp.get_cuda_device(), False)
        self.compute_stream = wp.Stream()
        self.compute_done = wp.Event()

        # Compute the solution on a single GPU for comparisons
        # (verbose output only on rank 0 and only when not in CSV mode)
        self.runtime_serial, self.a_ref_h = benchmark_single_gpu(
            self.nx, self.ny, self.iter_max, self.nccheck, not self.csv and self.mpi_rank == 0
        )

        # num_local_rows: Number of rows from the full (self.ny, self.nx) solution that
        #                 this rank will calculate (excludes halo regions)
        # iy_start_global: Allows us to go from a local index to a global index.

        # self.ny-2 rows are distributed among the ranks for comparison with single-GPU case,
        # which reserves the first and last rows for the boundary conditions
        iy_decomp_start, self.num_local_rows = calc_decomp_1d(self.ny - 2, self.mpi_rank, self.mpi_size)

        # Add 1 to get the global start index since the 1-D decomposition excludes the boundaries
        self.iy_start_global = iy_decomp_start + 1

        self.mpi_comm.Barrier()
        if not self.csv:
            print(
                f"Rank {self.mpi_rank} on device {wp.get_cuda_device().pci_bus_id}: "
                f"{self.num_local_rows} rows from y = {self.iy_start_global} to y = {self.iy_start_global + self.num_local_rows - 1}"
            )
        self.mpi_comm.Barrier()

        # Allocate local array (the +2 is for the halo layer on each side)
        self.a = wp.zeros((self.num_local_rows + 2, self.nx), dtype=wptype)
        self.a_new = wp.zeros_like(self.a)

        # Allocate host array for the final result
        self.a_h = wp.empty((self.ny, self.nx), dtype=wptype, device="cpu")

        self.l2_norm_d = wp.zeros((1,), dtype=wptype)
        self.l2_norm_h = wp.ones_like(self.l2_norm_d, device="cpu", pinned=True)

        # Boundary Conditions
        # - y-boundaries (iy=0 and iy=self.ny-1): Periodic
        # - x-boundaries (ix=0 and ix=self.nx-1): Dirichlet

        # Local Indices
        self.iy_start = 1
        self.iy_end = self.iy_start + self.num_local_rows  # Last owned row begins at [iy_end-1, 0]

        # Don't need to loop over the Dirichlet boundaries in the Jacobi iteration
        self.update_shape = (self.num_local_rows, self.nx - 2)

        # Used for inter-rank communication
        # lower_neighbor owns the rows after ours, upper_neighbor the rows before ours
        self.lower_neighbor = (self.mpi_rank + 1) % self.mpi_size
        self.upper_neighbor = self.mpi_rank - 1 if self.mpi_rank > 0 else self.mpi_size - 1

        # Apply Dirichlet boundary conditions to both a and a_new
        # (the offset maps local row 0, the top halo row, to its global row index)
        wp.launch(
            initialize_boundaries,
            dim=(self.num_local_rows + 2,),
            inputs=[self.nx, self.ny, self.iy_start_global - 1],
            outputs=[self.a, self.a_new],
        )

        # MPI Warmup
        wp.synchronize_device()

        for _mpi_warmup in range(10):
            self.apply_periodic_bc()
            self.a, self.a_new = self.a_new, self.a

        wp.synchronize_device()

        if not self.csv and self.mpi_rank == 0:
            print(
                f"Jacobi relaxation: {self.iter_max} iterations on {self.ny} x {self.nx} mesh with norm check "
                f"every {self.nccheck} iterations"
            )

    def apply_periodic_bc(self) -> None:
        """Exchange the halo rows of ``a_new`` with the neighboring ranks.

        Rows are distributed in order of increasing rank, so the halo row at
        local index ``0`` has to mirror the last owned row of the upper
        neighbor (the rank before this one) and the halo row at ``iy_end`` has
        to mirror the first owned row of the lower neighbor (the rank after
        this one). Each send is therefore paired with a receive from the
        opposite neighbor.
        """
        # Send the first row of owned data to the upper neighbor (it becomes that rank's
        # bottom halo) and receive our bottom halo from the lower neighbor's first owned row
        self.mpi_comm.Sendrecv(
            self.a_new[self.iy_start], self.upper_neighbor, 0, self.a_new[self.iy_end], self.lower_neighbor, 0
        )
        # Send the last row of owned data to the lower neighbor (it becomes that rank's
        # top halo) and receive our top halo from the upper neighbor's last owned row
        self.mpi_comm.Sendrecv(
            self.a_new[self.iy_end - 1], self.lower_neighbor, 0, self.a_new[0], self.upper_neighbor, 0
        )

    def step(self, calculate_norm: bool) -> None:
        """Perform a single Jacobi iteration step.

        The norm accumulator is reset, the update kernel is launched on the
        compute stream and ``compute_done`` is recorded after it.

        Args:
            calculate_norm: Whether the kernel should also accumulate the
                squared change into ``l2_norm_d``.
        """
        with wp.ScopedStream(self.compute_stream):
            self.l2_norm_d.zero_()
            wp.launch(
                jacobi_update,
                self.update_shape,
                inputs=[self.a, self.iy_start, self.iy_end, self.nx, calculate_norm],
                outputs=[self.a_new, self.l2_norm_d],
            )
            wp.record_event(self.compute_done)

    def run(self) -> None:
        """Run the Jacobi relaxation on multiple GPUs using MPI and compare with single-GPU results.

        The process exits with status 1 if any rank finds a mismatch;
        otherwise rank 0 prints the timings.
        """
        iteration = 0
        l2_norm = np.array([1.0], dtype=warp_type_to_np_dtype[wptype])

        start_time = MPI.Wtime()

        while l2_norm > tol and iteration < self.iter_max:
            calculate_norm = (iteration % self.nccheck == 0) or (not self.csv and iteration % 100 == 0)

            self.step(calculate_norm)

            if calculate_norm:
                # Queued on the compute stream, i.e. after the kernel that produced the norm
                wp.copy(self.l2_norm_h, self.l2_norm_d, stream=self.compute_stream)

            # The kernel must be finished before MPI reads rows of a_new
            wp.synchronize_event(self.compute_done)

            self.apply_periodic_bc()

            if calculate_norm:
                # Make sure the host copy of the local norm is ready
                wp.synchronize_stream(self.compute_stream)

                # Combine the squared norms of all ranks, then take the square root
                self.mpi_comm.Allreduce(self.l2_norm_h.numpy(), l2_norm)
                l2_norm = np.sqrt(l2_norm)

                if not self.csv and self.mpi_rank == 0 and iteration % 100 == 0:
                    print(f"{iteration:5d}, {l2_norm[0]:.6f}")

            # Swap arrays
            self.a, self.a_new = self.a_new, self.a

            iteration += 1

        wp.synchronize_device()
        stop_time = MPI.Wtime()
        runtime = stop_time - start_time

        # The run only counts as correct if every rank reports a correct result
        result_correct = self.check_results(tol)
        global_result_correct = self.mpi_comm.allreduce(result_correct, op=MPI.MIN)

        if not global_result_correct:
            sys.exit(1)

        if self.mpi_rank == 0:
            if self.csv:
                print(
                    f"mpi, {self.nx}, {self.ny}, {self.iter_max}, {self.nccheck}, {self.mpi_size}, 1, "
                    f"{runtime}, {self.runtime_serial}"
                )
            else:
                print(f"Num GPUs: {self.mpi_size}")
                print(
                    f"{self.ny}x{self.nx}: 1 GPU: {self.runtime_serial:8.4f} s, "
                    f"{self.mpi_size} GPUs {runtime:8.4f} s, "
                    f"speedup: {self.runtime_serial / runtime:8.2f}, "
                    f"efficiency: {self.runtime_serial / runtime / self.mpi_size * 100:8.2f}"
                )

    def check_results(self, tol: float = 1e-8) -> bool:
        """Returns ``True`` if multi-GPU result is within ``tol`` of the single-GPU result.

        Only the rows owned by this rank are compared, and only the columns
        between the two x-boundaries. Comparison is performed on the host, one
        row at a time; the first mismatch found is printed and ends the check.

        Args:
            tol: Largest allowed absolute difference between the two results.

        Returns:
            ``True`` if every compared value is within ``tol``, else ``False``.
        """
        # Copy the owned rows (skipping the top halo row) to their global position in the host array
        wp.copy(
            self.a_h,
            self.a,
            dest_offset=self.iy_start_global * self.nx,
            src_offset=self.nx,
            count=self.num_local_rows * self.nx,
        )

        a_ref_np = self.a_ref_h.numpy()
        a_np = self.a_h.numpy()

        interior = slice(1, self.nx - 1)

        for iy in range(self.iy_start_global, self.iy_start_global + self.num_local_rows):
            # The difference is taken in the array precision and then widened, so the
            # comparison against ``tol`` is carried out in double precision
            row_error = np.abs(a_ref_np[iy, interior] - a_np[iy, interior]).astype(np.float64)
            mismatches = np.flatnonzero(row_error > tol)

            if mismatches.size > 0:
                ix = int(mismatches[0]) + 1  # +1 because the interior slice starts at column 1
                print(
                    f"ERROR on rank {self.mpi_rank}: a[{iy},{ix}] = {a_np[iy, ix]} does not match "
                    f"{a_ref_np[iy, ix]} (reference)"
                )
                return False

        return True
|
|
474
|
+
|
|
475
|
+
|
|
476
|
+
if __name__ == "__main__":
    import argparse

    # Command-line options; default values are shown in the --help output
    cli = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)

    cli.add_argument("--itermax", type=int, default=1000, help="Maximum number of Jacobi iterations.")
    cli.add_argument("--nccheck", type=int, default=1, help="Check convergence every nccheck iterations.")
    cli.add_argument("--nx", type=int, default=16384, help="Total resolution in x.")
    cli.add_argument("--ny", type=int, default=16384, help="Total resolution in y.")
    cli.add_argument("-csv", action="store_true", help="Print results as CSV values.")
    cli.add_argument(
        "--visualize",
        action="store_true",
        help="Display the final solution in a graphical window using matplotlib.",
    )

    # Unrecognized arguments are ignored rather than treated as an error
    options, _unrecognized = cli.parse_known_args()

    example = Example(options.nx, options.ny, options.itermax, options.nccheck, options.csv)
    example.run()

    if options.visualize:
        # Imported here so that matplotlib is only needed when plotting is requested
        import matplotlib.pyplot as plt

        # Plot the final result (this rank's local array)
        local_solution = example.a.numpy()
        plt.imshow(local_solution, cmap="viridis", origin="lower", vmin=-1, vmax=1)
        plt.colorbar(label="Value")
        plt.title(f"Rank {example.mpi_rank} Jacobi Iteration Result")
        plt.xlabel("X-axis")
        plt.ylabel("Y-axis")
        plt.show()
|
warp/fem/cache.py
CHANGED
|
@@ -100,8 +100,8 @@ def get_struct(struct: type, suffix: str, use_qualified_name: bool = False):
|
|
|
100
100
|
if key not in _struct_cache:
|
|
101
101
|
module = wp.get_module(struct.__module__)
|
|
102
102
|
_struct_cache[key] = wp.codegen.Struct(
|
|
103
|
-
cls=struct,
|
|
104
103
|
key=key,
|
|
104
|
+
cls=struct,
|
|
105
105
|
module=module,
|
|
106
106
|
)
|
|
107
107
|
|
warp/fem/field/field.py
CHANGED
|
@@ -180,7 +180,7 @@ class SpaceField(GeometryField):
|
|
|
180
180
|
|
|
181
181
|
@property
|
|
182
182
|
def gradient_dtype(self):
|
|
183
|
-
"""Return type of the gradient operator. Assumes self.gradient_valid()"""
|
|
183
|
+
"""Return type of the (world space) gradient operator. Assumes self.gradient_valid()"""
|
|
184
184
|
if wp.types.type_is_vector(self.dtype):
|
|
185
185
|
return cache.cached_mat_type(
|
|
186
186
|
shape=(wp.types.type_length(self.dtype), self.geometry.dimension),
|
|
@@ -188,6 +188,16 @@ class SpaceField(GeometryField):
|
|
|
188
188
|
)
|
|
189
189
|
return cache.cached_vec_type(length=self.geometry.dimension, dtype=wp.types.type_scalar_type(self.dtype))
|
|
190
190
|
|
|
191
|
+
@property
|
|
192
|
+
def reference_gradient_dtype(self):
|
|
193
|
+
"""Return type of the reference space gradient operator. Assumes self.gradient_valid()"""
|
|
194
|
+
if wp.types.type_is_vector(self.dtype):
|
|
195
|
+
return cache.cached_mat_type(
|
|
196
|
+
shape=(wp.types.type_length(self.dtype), self.geometry.cell_dimension),
|
|
197
|
+
dtype=wp.types.type_scalar_type(self.dtype),
|
|
198
|
+
)
|
|
199
|
+
return cache.cached_vec_type(length=self.geometry.cell_dimension, dtype=wp.types.type_scalar_type(self.dtype))
|
|
200
|
+
|
|
191
201
|
@property
|
|
192
202
|
def divergence_dtype(self):
|
|
193
203
|
"""Return type of the divergence operator. Assumes self.gradient_valid()"""
|
warp/fem/field/nodal_field.py
CHANGED
|
@@ -103,14 +103,16 @@ class NodalFieldBase(DiscreteField):
|
|
|
103
103
|
if not self.space.gradient_valid():
|
|
104
104
|
return None
|
|
105
105
|
|
|
106
|
-
|
|
106
|
+
gradient_dtype = self.gradient_dtype if world_space else self.reference_gradient_dtype
|
|
107
|
+
|
|
108
|
+
@cache.dynamic_func(suffix=f"{self.name}{world_space}")
|
|
107
109
|
def eval_grad_inner(args: self.ElementEvalArg, s: Sample, grad_transform: Any):
|
|
108
110
|
local_value_map = self.space.local_value_map_inner(args.elt_arg, s.element_index, s.element_coords)
|
|
109
111
|
node_count = self.space.topology.element_node_count(
|
|
110
112
|
args.elt_arg, args.eval_arg.topology_arg, s.element_index
|
|
111
113
|
)
|
|
112
114
|
|
|
113
|
-
res =
|
|
115
|
+
res = gradient_dtype(0.0)
|
|
114
116
|
for k in range(node_count):
|
|
115
117
|
res += self.space.space_gradient(
|
|
116
118
|
self._read_node_value(args, s.element_index, k),
|
|
@@ -122,17 +124,22 @@ class NodalFieldBase(DiscreteField):
|
|
|
122
124
|
)
|
|
123
125
|
return res
|
|
124
126
|
|
|
125
|
-
|
|
126
|
-
def eval_grad_inner_ref_space(args: self.ElementEvalArg, s: Sample):
|
|
127
|
-
grad_transform = 1.0
|
|
128
|
-
return eval_grad_inner(args, s, grad_transform)
|
|
127
|
+
if world_space:
|
|
129
128
|
|
|
130
|
-
|
|
131
|
-
|
|
132
|
-
|
|
133
|
-
|
|
129
|
+
@cache.dynamic_func(suffix=self.name)
|
|
130
|
+
def eval_grad_inner_world_space(args: self.ElementEvalArg, s: Sample):
|
|
131
|
+
grad_transform = self.space.element_inner_reference_gradient_transform(args.elt_arg, s)
|
|
132
|
+
return eval_grad_inner(args, s, grad_transform)
|
|
133
|
+
|
|
134
|
+
return eval_grad_inner_world_space
|
|
135
|
+
else:
|
|
136
|
+
|
|
137
|
+
@cache.dynamic_func(suffix=self.name)
|
|
138
|
+
def eval_grad_inner_ref_space(args: self.ElementEvalArg, s: Sample):
|
|
139
|
+
grad_transform = 1.0
|
|
140
|
+
return eval_grad_inner(args, s, grad_transform)
|
|
134
141
|
|
|
135
|
-
|
|
142
|
+
return eval_grad_inner_ref_space
|
|
136
143
|
|
|
137
144
|
def _make_eval_div_inner(self):
|
|
138
145
|
if not self.divergence_valid():
|
|
@@ -185,14 +192,16 @@ class NodalFieldBase(DiscreteField):
|
|
|
185
192
|
if not self.space.gradient_valid():
|
|
186
193
|
return None
|
|
187
194
|
|
|
188
|
-
|
|
195
|
+
gradient_dtype = self.gradient_dtype if world_space else self.reference_gradient_dtype
|
|
196
|
+
|
|
197
|
+
@cache.dynamic_func(suffix=f"{self.name}{world_space}")
|
|
189
198
|
def eval_grad_outer(args: self.ElementEvalArg, s: Sample, grad_transform: Any):
|
|
190
199
|
local_value_map = self.space.local_value_map_outer(args.elt_arg, s.element_index, s.element_coords)
|
|
191
200
|
node_count = self.space.topology.element_node_count(
|
|
192
201
|
args.elt_arg, args.eval_arg.topology_arg, s.element_index
|
|
193
202
|
)
|
|
194
203
|
|
|
195
|
-
res =
|
|
204
|
+
res = gradient_dtype(0.0)
|
|
196
205
|
for k in range(node_count):
|
|
197
206
|
res += self.space.space_gradient(
|
|
198
207
|
self._read_node_value(args, s.element_index, k),
|
|
@@ -204,17 +213,22 @@ class NodalFieldBase(DiscreteField):
|
|
|
204
213
|
)
|
|
205
214
|
return res
|
|
206
215
|
|
|
207
|
-
|
|
208
|
-
def eval_grad_outer_ref_space(args: self.ElementEvalArg, s: Sample):
|
|
209
|
-
grad_transform = 1.0
|
|
210
|
-
return eval_grad_outer_ref_space(args, s, grad_transform)
|
|
216
|
+
if world_space:
|
|
211
217
|
|
|
212
|
-
|
|
213
|
-
|
|
214
|
-
|
|
215
|
-
|
|
218
|
+
@cache.dynamic_func(suffix=self.name)
|
|
219
|
+
def eval_grad_outer_world_space(args: self.ElementEvalArg, s: Sample):
|
|
220
|
+
grad_transform = self.space.element_outer_reference_gradient_transform(args.elt_arg, s)
|
|
221
|
+
return eval_grad_outer_ref_space(args, s, grad_transform)
|
|
222
|
+
|
|
223
|
+
return eval_grad_outer_world_space
|
|
224
|
+
else:
|
|
225
|
+
|
|
226
|
+
@cache.dynamic_func(suffix=self.name)
|
|
227
|
+
def eval_grad_outer_ref_space(args: self.ElementEvalArg, s: Sample):
|
|
228
|
+
grad_transform = 1.0
|
|
229
|
+
return eval_grad_outer_ref_space(args, s, grad_transform)
|
|
216
230
|
|
|
217
|
-
|
|
231
|
+
return eval_grad_outer_ref_space
|
|
218
232
|
|
|
219
233
|
def _make_eval_div_outer(self):
|
|
220
234
|
if not self.divergence_valid():
|
|
@@ -184,9 +184,13 @@ class AdaptiveNanogrid(Geometry):
|
|
|
184
184
|
|
|
185
185
|
@wp.func
|
|
186
186
|
def cell_position(args: CellArg, s: Sample):
|
|
187
|
-
|
|
188
|
-
|
|
189
|
-
|
|
187
|
+
cell_idx = s.element_index
|
|
188
|
+
scale = AdaptiveNanogrid._cell_scale(args, cell_idx)
|
|
189
|
+
cell_coords = s.element_coords
|
|
190
|
+
cell_ijk = args.cell_ijk[cell_idx]
|
|
191
|
+
uvw = wp.vec3(cell_ijk) + cell_coords * scale
|
|
192
|
+
grid_id = args.cell_grid
|
|
193
|
+
return wp.volume_index_to_world(grid_id, uvw - wp.vec3(0.5))
|
|
190
194
|
|
|
191
195
|
@wp.func
|
|
192
196
|
def cell_deformation_gradient(args: CellArg, s: Sample):
|