warp-lang 1.6.2-py3-none-macosx_10_13_universal2.whl → 1.7.1-py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (191)
  1. warp/__init__.py +7 -1
  2. warp/autograd.py +12 -2
  3. warp/bin/libwarp-clang.dylib +0 -0
  4. warp/bin/libwarp.dylib +0 -0
  5. warp/build.py +410 -0
  6. warp/build_dll.py +6 -14
  7. warp/builtins.py +463 -372
  8. warp/codegen.py +196 -124
  9. warp/config.py +42 -6
  10. warp/context.py +496 -271
  11. warp/dlpack.py +8 -6
  12. warp/examples/assets/nonuniform.usd +0 -0
  13. warp/examples/assets/nvidia_logo.png +0 -0
  14. warp/examples/benchmarks/benchmark_cloth.py +1 -1
  15. warp/examples/benchmarks/benchmark_tile_load_store.py +103 -0
  16. warp/examples/core/example_sample_mesh.py +300 -0
  17. warp/examples/distributed/example_jacobi_mpi.py +507 -0
  18. warp/examples/fem/example_apic_fluid.py +1 -1
  19. warp/examples/fem/example_burgers.py +2 -2
  20. warp/examples/fem/example_deformed_geometry.py +1 -1
  21. warp/examples/fem/example_distortion_energy.py +1 -1
  22. warp/examples/fem/example_magnetostatics.py +6 -6
  23. warp/examples/fem/utils.py +9 -3
  24. warp/examples/interop/example_jax_callable.py +116 -0
  25. warp/examples/interop/example_jax_ffi_callback.py +132 -0
  26. warp/examples/interop/example_jax_kernel.py +205 -0
  27. warp/examples/optim/example_fluid_checkpoint.py +497 -0
  28. warp/examples/tile/example_tile_matmul.py +2 -4
  29. warp/fem/__init__.py +11 -1
  30. warp/fem/adaptivity.py +4 -4
  31. warp/fem/field/field.py +11 -1
  32. warp/fem/field/nodal_field.py +56 -88
  33. warp/fem/field/virtual.py +62 -23
  34. warp/fem/geometry/adaptive_nanogrid.py +16 -13
  35. warp/fem/geometry/closest_point.py +1 -1
  36. warp/fem/geometry/deformed_geometry.py +5 -2
  37. warp/fem/geometry/geometry.py +5 -0
  38. warp/fem/geometry/grid_2d.py +12 -12
  39. warp/fem/geometry/grid_3d.py +12 -15
  40. warp/fem/geometry/hexmesh.py +5 -7
  41. warp/fem/geometry/nanogrid.py +9 -11
  42. warp/fem/geometry/quadmesh.py +13 -13
  43. warp/fem/geometry/tetmesh.py +3 -4
  44. warp/fem/geometry/trimesh.py +7 -20
  45. warp/fem/integrate.py +262 -93
  46. warp/fem/linalg.py +5 -5
  47. warp/fem/quadrature/pic_quadrature.py +37 -22
  48. warp/fem/quadrature/quadrature.py +194 -25
  49. warp/fem/space/__init__.py +1 -1
  50. warp/fem/space/basis_function_space.py +4 -2
  51. warp/fem/space/basis_space.py +25 -18
  52. warp/fem/space/hexmesh_function_space.py +2 -2
  53. warp/fem/space/partition.py +6 -2
  54. warp/fem/space/quadmesh_function_space.py +8 -8
  55. warp/fem/space/shape/cube_shape_function.py +23 -23
  56. warp/fem/space/shape/square_shape_function.py +12 -12
  57. warp/fem/space/shape/triangle_shape_function.py +1 -1
  58. warp/fem/space/tetmesh_function_space.py +3 -3
  59. warp/fem/space/trimesh_function_space.py +2 -2
  60. warp/fem/utils.py +12 -6
  61. warp/jax.py +14 -1
  62. warp/jax_experimental/__init__.py +16 -0
  63. warp/{jax_experimental.py → jax_experimental/custom_call.py} +28 -29
  64. warp/jax_experimental/ffi.py +702 -0
  65. warp/jax_experimental/xla_ffi.py +602 -0
  66. warp/math.py +89 -0
  67. warp/native/array.h +13 -0
  68. warp/native/builtin.h +29 -3
  69. warp/native/bvh.cpp +3 -1
  70. warp/native/bvh.cu +42 -14
  71. warp/native/bvh.h +2 -1
  72. warp/native/clang/clang.cpp +30 -3
  73. warp/native/cuda_util.cpp +14 -0
  74. warp/native/cuda_util.h +2 -0
  75. warp/native/exports.h +68 -63
  76. warp/native/intersect.h +26 -26
  77. warp/native/intersect_adj.h +33 -33
  78. warp/native/marching.cu +1 -1
  79. warp/native/mat.h +513 -9
  80. warp/native/mesh.h +10 -10
  81. warp/native/quat.h +99 -11
  82. warp/native/rand.h +6 -0
  83. warp/native/sort.cpp +122 -59
  84. warp/native/sort.cu +152 -15
  85. warp/native/sort.h +8 -1
  86. warp/native/sparse.cpp +43 -22
  87. warp/native/sparse.cu +52 -17
  88. warp/native/svd.h +116 -0
  89. warp/native/tile.h +312 -116
  90. warp/native/tile_reduce.h +46 -3
  91. warp/native/vec.h +68 -7
  92. warp/native/volume.cpp +85 -113
  93. warp/native/volume_builder.cu +25 -10
  94. warp/native/volume_builder.h +6 -0
  95. warp/native/warp.cpp +5 -6
  96. warp/native/warp.cu +100 -11
  97. warp/native/warp.h +19 -10
  98. warp/optim/linear.py +10 -10
  99. warp/render/render_opengl.py +19 -17
  100. warp/render/render_usd.py +93 -3
  101. warp/sim/articulation.py +4 -4
  102. warp/sim/collide.py +32 -19
  103. warp/sim/import_mjcf.py +449 -155
  104. warp/sim/import_urdf.py +32 -12
  105. warp/sim/inertia.py +189 -156
  106. warp/sim/integrator_euler.py +8 -5
  107. warp/sim/integrator_featherstone.py +3 -10
  108. warp/sim/integrator_vbd.py +207 -2
  109. warp/sim/integrator_xpbd.py +8 -5
  110. warp/sim/model.py +71 -25
  111. warp/sim/render.py +4 -0
  112. warp/sim/utils.py +2 -2
  113. warp/sparse.py +642 -555
  114. warp/stubs.py +217 -20
  115. warp/tests/__main__.py +0 -15
  116. warp/tests/assets/torus.usda +1 -1
  117. warp/tests/cuda/__init__.py +0 -0
  118. warp/tests/{test_mempool.py → cuda/test_mempool.py} +39 -0
  119. warp/tests/{test_streams.py → cuda/test_streams.py} +71 -0
  120. warp/tests/geometry/__init__.py +0 -0
  121. warp/tests/{test_mesh_query_point.py → geometry/test_mesh_query_point.py} +66 -63
  122. warp/tests/{test_mesh_query_ray.py → geometry/test_mesh_query_ray.py} +1 -1
  123. warp/tests/{test_volume.py → geometry/test_volume.py} +41 -6
  124. warp/tests/interop/__init__.py +0 -0
  125. warp/tests/{test_dlpack.py → interop/test_dlpack.py} +28 -5
  126. warp/tests/sim/__init__.py +0 -0
  127. warp/tests/{disabled_kinematics.py → sim/disabled_kinematics.py} +9 -10
  128. warp/tests/{test_collision.py → sim/test_collision.py} +236 -205
  129. warp/tests/sim/test_inertia.py +161 -0
  130. warp/tests/{test_model.py → sim/test_model.py} +40 -0
  131. warp/tests/{flaky_test_sim_grad.py → sim/test_sim_grad.py} +4 -0
  132. warp/tests/{test_sim_kinematics.py → sim/test_sim_kinematics.py} +2 -1
  133. warp/tests/sim/test_vbd.py +597 -0
  134. warp/tests/sim/test_xpbd.py +399 -0
  135. warp/tests/test_bool.py +1 -1
  136. warp/tests/test_codegen.py +24 -3
  137. warp/tests/test_examples.py +40 -38
  138. warp/tests/test_fem.py +98 -14
  139. warp/tests/test_linear_solvers.py +0 -11
  140. warp/tests/test_mat.py +577 -156
  141. warp/tests/test_mat_scalar_ops.py +4 -4
  142. warp/tests/test_overwrite.py +0 -60
  143. warp/tests/test_quat.py +356 -151
  144. warp/tests/test_rand.py +44 -37
  145. warp/tests/test_sparse.py +47 -6
  146. warp/tests/test_spatial.py +75 -0
  147. warp/tests/test_static.py +1 -1
  148. warp/tests/test_utils.py +84 -4
  149. warp/tests/test_vec.py +336 -178
  150. warp/tests/tile/__init__.py +0 -0
  151. warp/tests/{test_tile.py → tile/test_tile.py} +136 -51
  152. warp/tests/{test_tile_load.py → tile/test_tile_load.py} +98 -1
  153. warp/tests/{test_tile_mathdx.py → tile/test_tile_mathdx.py} +9 -6
  154. warp/tests/{test_tile_mlp.py → tile/test_tile_mlp.py} +25 -14
  155. warp/tests/{test_tile_reduce.py → tile/test_tile_reduce.py} +60 -1
  156. warp/tests/{test_tile_view.py → tile/test_tile_view.py} +1 -1
  157. warp/tests/unittest_serial.py +1 -0
  158. warp/tests/unittest_suites.py +45 -62
  159. warp/tests/unittest_utils.py +2 -1
  160. warp/thirdparty/unittest_parallel.py +3 -1
  161. warp/types.py +175 -666
  162. warp/utils.py +137 -72
  163. {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/METADATA +46 -12
  164. {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/RECORD +184 -171
  165. {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/WHEEL +1 -1
  166. {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info/licenses}/LICENSE.md +0 -26
  167. warp/examples/optim/example_walker.py +0 -317
  168. warp/native/cutlass_gemm.cpp +0 -43
  169. warp/native/cutlass_gemm.cu +0 -382
  170. warp/tests/test_matmul.py +0 -511
  171. warp/tests/test_matmul_lite.py +0 -411
  172. warp/tests/test_vbd.py +0 -386
  173. warp/tests/unused_test_misc.py +0 -77
  174. /warp/tests/{test_async.py → cuda/test_async.py} +0 -0
  175. /warp/tests/{test_ipc.py → cuda/test_ipc.py} +0 -0
  176. /warp/tests/{test_multigpu.py → cuda/test_multigpu.py} +0 -0
  177. /warp/tests/{test_peer.py → cuda/test_peer.py} +0 -0
  178. /warp/tests/{test_pinned.py → cuda/test_pinned.py} +0 -0
  179. /warp/tests/{test_bvh.py → geometry/test_bvh.py} +0 -0
  180. /warp/tests/{test_hash_grid.py → geometry/test_hash_grid.py} +0 -0
  181. /warp/tests/{test_marching_cubes.py → geometry/test_marching_cubes.py} +0 -0
  182. /warp/tests/{test_mesh.py → geometry/test_mesh.py} +0 -0
  183. /warp/tests/{test_mesh_query_aabb.py → geometry/test_mesh_query_aabb.py} +0 -0
  184. /warp/tests/{test_volume_write.py → geometry/test_volume_write.py} +0 -0
  185. /warp/tests/{test_jax.py → interop/test_jax.py} +0 -0
  186. /warp/tests/{test_paddle.py → interop/test_paddle.py} +0 -0
  187. /warp/tests/{test_torch.py → interop/test_torch.py} +0 -0
  188. /warp/tests/{test_coloring.py → sim/test_coloring.py} +0 -0
  189. /warp/tests/{test_sim_grad_bounce_linear.py → sim/test_sim_grad_bounce_linear.py} +0 -0
  190. /warp/tests/{test_tile_shared_memory.py → tile/test_tile_shared_memory.py} +0 -0
  191. {warp_lang-1.6.2.dist-info → warp_lang-1.7.1.dist-info}/top_level.txt +0 -0
warp/examples/distributed/example_jacobi_mpi.py
@@ -0,0 +1,507 @@
+ # SPDX-FileCopyrightText: Copyright (c) 2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+ # SPDX-License-Identifier: Apache-2.0
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+ """An example implementation of a distributed Jacobi solver using MPI.
+
+ This example shows how to solve the Laplace equation using Jacobi iteration on
+ multiple GPUs using Warp and mpi4py. This example is based on the basic "mpi"
+ example from the Multi GPU Programming Models repository.
+
+ This example requires mpi4py and a CUDA-aware MPI implementation. We suggest
+ downloading and installing NVIDIA HPC-X, followed by installing mpi4py from its
+ source distribution: python -m pip install mpi4py
+
+ Usage:
+     mpirun -n 2 python example_jacobi_mpi.py
+
+ References:
+     https://github.com/NVIDIA/multi-gpu-programming-models
+     https://developer.nvidia.com/networking/hpc-x
+     https://github.com/mpi4py/mpi4py
+ """
+
+ import math
+ import sys
+ from typing import Tuple
+
+ import numpy as np
+ from mpi4py import MPI
+
+ import warp as wp
+ import warp.context
+ from warp.types import warp_type_to_np_dtype
+
+ wp.config.quiet = True  # Suppress wp.init() output
+
+
+ tol = 1e-8
+ wptype = wp.float32  # Global precision setting, can set wp.float64 here for double precision
+ pi = wptype(math.pi)  # GitHub #485
+
+
+ def calc_default_device(mpi_comm: "MPI.Comm") -> warp.context.Device:
+     """Return the device that should be used for the current rank.
+
+     This function is used to ensure that multiple MPI ranks running on the same
+     node are assigned to different GPUs.
+
+     Args:
+         mpi_comm: The MPI communicator.
+
+     Returns:
+         The Warp device that should be used for the current rank.
+
+     Raises:
+         RuntimeError: If the number of visible devices is less than the number of ranks on the node.
+     """
+
+     # Find the local rank and size
+     local_mpi_comm = mpi_comm.Split_type(MPI.COMM_TYPE_SHARED)
+
+     local_size = local_mpi_comm.Get_size()
+     local_rank = local_mpi_comm.Get_rank()
+
+     num_cuda_devices = warp.get_cuda_device_count()
+
+     if 1 < num_cuda_devices < local_size:
+         raise RuntimeError(
+             f"Number of visible devices ({num_cuda_devices}) is less than number of ranks on the node ({local_size})"
+         )
+
+     if 1 < num_cuda_devices:
+         # Get the device based on local_rank
+         return warp.get_cuda_device(local_rank)
+     else:
+         return warp.get_device()
+
+
+ def calc_decomp_1d(total_points: int, rank: int, total_ranks: int) -> Tuple[int, int]:
+     """Calculate a 1-D decomposition to divide ``total_points`` among ``total_ranks`` domains.
+
+     Returns a tuple containing the starting index of the decomposition followed
+     by number of points in the domain.
+
+     If ``total_points`` can not be evenly divided among ``total_ranks``,
+     the first ``total_points % total_ranks`` domains will contain one additional
+     point.
+     """
+
+     if rank < total_points % total_ranks:
+         num_domain_points = total_points // total_ranks + 1
+         start_index = rank * num_domain_points
+     else:
+         num_domain_points = total_points // total_ranks
+         start_index = total_points - (total_ranks - rank) * num_domain_points
+
+     return (start_index, num_domain_points)
+
+
+ @wp.kernel
+ def jacobi_update(
+     a: wp.array2d(dtype=wptype),
+     iy_start: int,
+     iy_end: int,
+     nx: int,
+     calculate_norm: bool,
+     a_new: wp.array2d(dtype=wptype),
+     l2_norm: wp.array(dtype=wptype),
+ ):
+     i, j = wp.tid()
+
+     # Convert from local thread indices to the indices used to access the arrays
+
+     iy = i + iy_start
+     ix = j + 1
+
+     local_l2_norm = wptype(0.0)
+
+     if iy < iy_end and ix < nx - 1:
+         new_val = wptype(0.25) * (a[iy - 1, ix] + a[iy + 1, ix] + a[iy, ix - 1] + a[iy, ix + 1])
+         a_new[iy, ix] = new_val
+
+         if calculate_norm:
+             residue = new_val - a[iy, ix]
+             local_l2_norm = residue * residue
+
+     if calculate_norm:
+         t = wp.tile(local_l2_norm)
+         s = wp.tile_sum(t)
+         wp.tile_atomic_add(l2_norm, s)
+
+
+ @wp.kernel
+ def initialize_boundaries(
+     nx: int,
+     ny: int,
+     offset: int,
+     a: wp.array2d(dtype=wptype),
+     a_new: wp.array2d(dtype=wptype),
+ ):
+     i = wp.tid()
+
+     boundary_val = wp.sin(wptype(2.0) * pi * wptype(i + offset) / wptype(ny - 1))
+
+     a[i, 0] = boundary_val
+     a[i, nx - 1] = boundary_val
+     a_new[i, 0] = boundary_val
+     a_new[i, nx - 1] = boundary_val
+
+
+ def benchmark_single_gpu(nx: int, ny: int, iter_max: int, nccheck: int = 1, verbose: bool = False):
+     """Compute the solution on a single GPU for performance and correctness comparisons.
+
+     Args:
+         nx: The number of points in the x-direction.
+         ny: The number of points in the y-direction.
+         iter_max: The maximum number of Jacobi iterations.
+         nccheck: The number of iterations between norm checks. Defaults to 1.
+         verbose: Whether to print verbose output. Defaults to False.
+
+     Returns:
+         tuple: A tuple containing:
+             - runtime (float): The execution time of the solution in seconds.
+             - solution (warp.array2d): The solution as a Warp array on the host
+               with dimensions ``(ny, nx)``.
+     """
+
+     a = wp.zeros((ny, nx), dtype=wptype)
+     a_new = wp.zeros_like(a)
+
+     l2_norm_d = wp.zeros((1,), dtype=wptype)
+     l2_norm_h = wp.ones_like(l2_norm_d, device="cpu", pinned=True)
+
+     compute_stream = wp.Stream()
+     push_top_stream = wp.Stream()
+     push_bottom_stream = wp.Stream()
+
+     compute_done = wp.Event()
+     push_top_done = wp.Event()
+     push_bottom_done = wp.Event()
+
+     iy_start = 1
+     iy_end = ny - 1
+     update_shape = (iy_end - iy_start, nx - 2)
+
+     wp.launch(initialize_boundaries, dim=(ny,), inputs=[nx, ny, 0], outputs=[a, a_new])
+
+     if verbose:
+         print(
+             f"Single GPU jacobi relaxation: {iter_max} iterations on {ny} x {nx} mesh with norm check every {nccheck}"
+             " iterations"
+         )
+
+     iter = 0
+     l2_norm = 1.0
+
+     start_time = MPI.Wtime()
+
+     while l2_norm > tol and iter < iter_max:
+         calculate_norm = (iter % nccheck == 0) or (iter % 100 == 0)
+
+         with wp.ScopedStream(compute_stream):
+             l2_norm_d.zero_()
+
+             compute_stream.wait_event(push_top_done)
+             compute_stream.wait_event(push_bottom_done)
+
+             wp.launch(
+                 jacobi_update,
+                 update_shape,
+                 inputs=[a, iy_start, iy_end, nx, calculate_norm],
+                 outputs=[a_new, l2_norm_d],
+             )
+             wp.record_event(compute_done)
+
+             if calculate_norm:
+                 wp.copy(l2_norm_h, l2_norm_d, stream=compute_stream)
+
+         # Apply periodic boundary conditions
+         push_top_stream.wait_event(compute_done)
+         wp.copy(a_new[0], a_new[iy_end - 1], stream=push_top_stream)
+         push_top_stream.record_event(push_top_done)
+
+         push_bottom_stream.wait_event(compute_done)
+         wp.copy(a_new[iy_end], a_new[iy_start], stream=push_bottom_stream)
+         push_bottom_stream.record_event(push_bottom_done)
+
+         if calculate_norm:
+             wp.synchronize_stream(compute_stream)
+
+             l2_norm = math.sqrt(l2_norm_h.numpy()[0])
+
+             if verbose and iter % 100 == 0:
+                 print(f"{iter:5d}, {l2_norm:.6f}")
+
+         # Swap arrays
+         a, a_new = a_new, a
+
+         iter += 1
+
+     wp.synchronize_device()
+     stop_time = MPI.Wtime()
+
+     a_ref_h = wp.empty((ny, nx), dtype=wptype, device="cpu")
+     wp.copy(a_ref_h, a)
+
+     return stop_time - start_time, a_ref_h
+
+
+ class Example:
+     def __init__(
+         self,
+         nx: int = 16384,
+         ny: int = 16384,
+         iter_max: int = 1000,
+         nccheck: int = 1,
+         csv: bool = False,
+     ):
+         self.iter_max = iter_max
+         self.nx = nx  # Global resolution
+         self.ny = ny  # Global resolution
+         self.nccheck = nccheck
+         self.csv = csv
+
+         self.mpi_comm = MPI.COMM_WORLD
+         self.mpi_rank = self.mpi_comm.Get_rank()
+         self.mpi_size = self.mpi_comm.Get_size()
+
+         # Set the default device on the current rank
+         self.device = calc_default_device(self.mpi_comm)
+         wp.set_device(self.device)
+
+         # We need to disable memory pools for peer-to-peer transfers using MPI
+         # wp.set_mempool_enabled(wp.get_cuda_device(), False)
+         self.compute_stream = wp.Stream()
+         self.compute_done = wp.Event()
+
+         # Compute the solution on a single GPU for comparisons
+         self.runtime_serial, self.a_ref_h = benchmark_single_gpu(
+             self.nx, self.ny, self.iter_max, self.nccheck, not self.csv and self.mpi_rank == 0
+         )
+
+         # num_local_rows: Number of rows from the full (self.ny, self.nx) solution that
+         #     this rank will calculate (excludes halo regions)
+         # iy_start_global: Allows us to go from a local index to a global index.
+
+         # self.ny-2 rows are distributed among the ranks for comparison with single-GPU case,
+         # which reserves the first and last rows for the boundary conditions
+         iy_decomp_start, self.num_local_rows = calc_decomp_1d(self.ny - 2, self.mpi_rank, self.mpi_size)
+
+         # Add 1 to get the global start index since the 1-D decomposition excludes the boundaries
+         self.iy_start_global = iy_decomp_start + 1
+
+         self.mpi_comm.Barrier()
+         if not self.csv:
+             print(
+                 f"Rank {self.mpi_rank} on device {wp.get_cuda_device().pci_bus_id}: "
+                 f"{self.num_local_rows} rows from y = {self.iy_start_global} to y = {self.iy_start_global + self.num_local_rows - 1}"
+             )
+         self.mpi_comm.Barrier()
+
+         # Allocate local array (the +2 is for the halo layer on each side)
+         self.a = wp.zeros((self.num_local_rows + 2, self.nx), dtype=wptype)
+         self.a_new = wp.zeros_like(self.a)
+
+         # Allocate host array for the final result
+         self.a_h = wp.empty((self.ny, self.nx), dtype=wptype, device="cpu")
+
+         self.l2_norm_d = wp.zeros((1,), dtype=wptype)
+         self.l2_norm_h = wp.ones_like(self.l2_norm_d, device="cpu", pinned=True)
+
+         # Boundary Conditions
+         # - y-boundaries (iy=0 and iy=self.ny-1): Periodic
+         # - x-boundaries (ix=0 and ix=self.nx-1): Dirichlet
+
+         # Local Indices
+         self.iy_start = 1
+         self.iy_end = self.iy_start + self.num_local_rows  # Last owned row begins at [iy_end-1, 0]
+
+         # Don't need to loop over the Dirichlet boundaries in the Jacobi iteration
+         self.update_shape = (self.num_local_rows, self.nx - 2)
+
+         # Used for inter-rank communication
+         self.lower_neighbor = (self.mpi_rank + 1) % self.mpi_size
+         self.upper_neighbor = self.mpi_rank - 1 if self.mpi_rank > 0 else self.mpi_size - 1
+
+         # Apply Dirichlet boundary conditions to both a and a_new
+         wp.launch(
+             initialize_boundaries,
+             dim=(self.num_local_rows + 2,),
+             inputs=[self.nx, self.ny, self.iy_start_global - 1],
+             outputs=[self.a, self.a_new],
+         )
+
+         # MPI Warmup
+         wp.synchronize_device()
+
+         for _mpi_warmup in range(10):
+             self.apply_periodic_bc()
+             self.a, self.a_new = self.a_new, self.a
+
+         wp.synchronize_device()
+
+         if not self.csv and self.mpi_rank == 0:
+             print(
+                 f"Jacobi relaxation: {self.iter_max} iterations on {self.ny} x {self.nx} mesh with norm check "
+                 f"every {self.nccheck} iterations"
+             )
+
+     def apply_periodic_bc(self) -> None:
+         """Apply periodic boundary conditions to the array.
+
+         This function sends the first row of owned data to the lower neighbor
+         and the last row of owned data to the upper neighbor.
+         """
+         # Send the first row of owned data to the lower neighbor
+         self.mpi_comm.Sendrecv(
+             self.a_new[self.iy_start], self.lower_neighbor, 0, self.a_new[self.iy_end], self.upper_neighbor, 0
+         )
+         # Send the last row of owned data to the upper neighbor
+         self.mpi_comm.Sendrecv(
+             self.a_new[self.iy_end - 1], self.upper_neighbor, 0, self.a_new[0], self.lower_neighbor, 0
+         )
+
+     def step(self, calculate_norm: bool) -> None:
+         """Perform a single Jacobi iteration step."""
+         with wp.ScopedStream(self.compute_stream):
+             self.l2_norm_d.zero_()
+             wp.launch(
+                 jacobi_update,
+                 self.update_shape,
+                 inputs=[self.a, self.iy_start, self.iy_end, self.nx, calculate_norm],
+                 outputs=[self.a_new, self.l2_norm_d],
+             )
+             wp.record_event(self.compute_done)
+
+     def run(self) -> None:
+         """Run the Jacobi relaxation on multiple GPUs using MPI and compare with single-GPU results."""
+         iter = 0
+         l2_norm = np.array([1.0], dtype=warp_type_to_np_dtype[wptype])
+
+         start_time = MPI.Wtime()
+
+         while l2_norm > tol and iter < self.iter_max:
+             calculate_norm = (iter % self.nccheck == 0) or (not self.csv and iter % 100 == 0)
+
+             self.step(calculate_norm)
+
+             if calculate_norm:
+                 wp.copy(self.l2_norm_h, self.l2_norm_d, stream=self.compute_stream)
+
+             wp.synchronize_event(self.compute_done)
+
+             self.apply_periodic_bc()
+
+             if calculate_norm:
+                 wp.synchronize_stream(self.compute_stream)
+
+                 self.mpi_comm.Allreduce(self.l2_norm_h.numpy(), l2_norm)
+                 l2_norm = np.sqrt(l2_norm)
+
+                 if not self.csv and self.mpi_rank == 0 and iter % 100 == 0:
+                     print(f"{iter:5d}, {l2_norm[0]:.6f}")
+
+             # Swap arrays
+             self.a, self.a_new = self.a_new, self.a
+
+             iter += 1
+
+         wp.synchronize_device()
+         stop_time = MPI.Wtime()
+
+         result_correct = self.check_results(tol)
+         global_result_correct = self.mpi_comm.allreduce(result_correct, op=MPI.MIN)
+
+         if not global_result_correct:
+             sys.exit(1)
+         elif global_result_correct and self.mpi_rank == 0:
+             if self.csv:
+                 print(
+                     f"mpi, {self.nx}, {self.ny}, {self.iter_max}, {self.nccheck}, {self.mpi_size}, 1, "
+                     f"{stop_time - start_time}, {self.runtime_serial}"
+                 )
+             else:
+                 print(f"Num GPUs: {self.mpi_size}")
+                 print(
+                     f"{self.ny}x{self.nx}: 1 GPU: {self.runtime_serial:8.4f} s, "
+                     f"{self.mpi_size} GPUs {stop_time - start_time:8.4f} s, "
+                     f"speedup: {self.runtime_serial / (stop_time - start_time):8.2f}, "
+                     f"efficiency: {self.runtime_serial / (stop_time - start_time) / self.mpi_size * 100:8.2f}"
+                 )
+
+     def check_results(self, tol: float = 1e-8) -> bool:
+         """Returns ``True`` if multi-GPU result is within ``tol`` of the single-GPU result.
+
+         Comparison is performed on the host in a serial manner.
+         """
+         result_correct = True
+
+         wp.copy(
+             self.a_h,
+             self.a,
+             dest_offset=self.iy_start_global * self.nx,
+             src_offset=self.nx,
+             count=self.num_local_rows * self.nx,
+         )
+
+         a_ref_np = self.a_ref_h.numpy()
+         a_np = self.a_h.numpy()
+
+         for iy in range(self.iy_start_global, self.iy_start_global + self.num_local_rows):
+             if not result_correct:
+                 break
+             for ix in range(1, self.nx - 1):
+                 if math.fabs(a_ref_np[iy, ix] - a_np[iy, ix]) > tol:
+                     result_correct = False
+                     print(
+                         f"ERROR on rank {self.mpi_rank}: a[{iy},{ix}] = {a_np[iy, ix]} does not match "
+                         f"{a_ref_np[iy, ix]} (reference)"
+                     )
+                     break
+
+         return result_correct
+
+
+ if __name__ == "__main__":
+     import argparse
+
+     parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+
+     parser.add_argument("--itermax", type=int, default=1000, help="Maximum number of Jacobi iterations.")
+     parser.add_argument("--nccheck", type=int, default=1, help="Check convergence every nccheck iterations.")
+     parser.add_argument("--nx", type=int, default=16384, help="Total resolution in x.")
+     parser.add_argument("--ny", type=int, default=16384, help="Total resolution in y.")
+     parser.add_argument("-csv", action="store_true", help="Print results as CSV values.")
+     parser.add_argument(
+         "--visualize",
+         action="store_true",
+         help="Display the final solution in a graphical window using matplotlib.",
+     )
+
+     args = parser.parse_known_args()[0]
+
+     example = Example(args.nx, args.ny, args.itermax, args.nccheck, args.csv)
+
+     example.run()
+
+     if args.visualize:
+         import matplotlib.pyplot as plt
+
+         # Plot the final result
+         plt.imshow(example.a.numpy(), cmap="viridis", origin="lower", vmin=-1, vmax=1)
+         plt.colorbar(label="Value")
+         plt.title(f"Rank {example.mpi_rank} Jacobi Iteration Result")
+         plt.xlabel("X-axis")
+         plt.ylabel("Y-axis")
+         plt.show()
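The new example combines node-local rank-to-GPU binding with halo exchange performed directly on GPU buffers. A minimal sketch of that pattern, assuming mpi4py built against a CUDA-aware MPI; shapes and names below are illustrative, not from the package:

    from mpi4py import MPI

    import warp as wp

    comm = MPI.COMM_WORLD

    # Bind each rank on a node to its own GPU, as calc_default_device() does above
    local_rank = comm.Split_type(MPI.COMM_TYPE_SHARED).Get_rank()
    if wp.get_cuda_device_count() > 1:
        wp.set_device(wp.get_cuda_device(local_rank))

    # Two interior rows plus one halo row on each side (toy size)
    a = wp.zeros((4, 8), dtype=wp.float32)

    # Ring exchange of halo rows; with a CUDA-aware MPI, mpi4py can consume the
    # device buffers directly through the __cuda_array_interface__ protocol
    up = (comm.Get_rank() - 1) % comm.Get_size()
    down = (comm.Get_rank() + 1) % comm.Get_size()
    comm.Sendrecv(a[1], dest=up, recvbuf=a[3], source=down)

Launched the same way as the example above: mpirun -n 2 python <script>.py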
warp/examples/fem/example_apic_fluid.py
@@ -117,7 +117,7 @@ def divergence_form(s: Sample, domain: Domain, u: Field, psi: Field):
  def invert_volume_kernel(values: wp.array(dtype=float)):
      i = wp.tid()
      m = values[i]
-     values[i] = wp.select(m == 0.0, 1.0 / m, 0.0)
+     values[i] = wp.where(m == 0.0, 0.0, 1.0 / m)


  @wp.kernel
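The wp.select removals in this and the following FEM example hunks are all the same 1.7 migration: wp.select(cond, value_if_false, value_if_true) is superseded by wp.where(cond, value_if_true, value_if_false), so the two value arguments swap places. A minimal sketch of the rewrite (the kernel and names are illustrative, not from the package):

    import warp as wp


    @wp.kernel
    def clamp_negative(values: wp.array(dtype=float)):
        i = wp.tid()
        v = values[i]
        # 1.6 style: values[i] = wp.select(v < 0.0, v, 0.0)
        # 1.7 style: condition first, then the value returned when it is True
        values[i] = wp.where(v < 0.0, 0.0, v)


    data = wp.array([-1.0, 2.0, -3.0], dtype=float)
    wp.launch(clamp_negative, dim=data.shape, inputs=[data])
    print(data.numpy())  # [0. 2. 0.]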
warp/examples/fem/example_burgers.py
@@ -75,7 +75,7 @@ def cell_transport_form(s: fem.Sample, domain: fem.Domain, u: fem.Field, v: fem.
  def initial_condition(s: fem.Sample, domain: fem.Domain):
      x = domain(s)[0] * 2.0
      wave = wp.sin(x * wp.pi)
-     return wp.vec2(wp.select(x <= 1.0, 0.0, wave), 0.0)
+     return wp.vec2(wp.where(x <= 1.0, wave, 0.0), 0.0)


  @fem.integrand
@@ -87,7 +87,7 @@ def velocity_norm(s: fem.Sample, u: fem.Field):
  def minmod(a: float, b: float):
      sa = wp.sign(a)
      sb = wp.sign(b)
-     return wp.select(sa == sb, 0.0, sa * wp.min(wp.abs(a), wp.abs(b)))
+     return wp.where(sa == sb, sa * wp.min(wp.abs(a), wp.abs(b)), 0.0)


  @fem.integrand
warp/examples/fem/example_deformed_geometry.py
@@ -57,7 +57,7 @@ def boundary_projector_form(
      Bilinear boundary condition projector form, non-zero on radial edges
      """
      nor = fem.normal(domain, s)
-     active = wp.select(nor[0] < -0.9999 or nor[1] < -0.9999, 0.0, 1.0)
+     active = wp.where(nor[0] < -0.9999 or nor[1] < -0.9999, 1.0, 0.0)
      return active * u(s) * v(s)


warp/examples/fem/example_distortion_energy.py
@@ -82,7 +82,7 @@ def boundary_projector_form(
  ):
      # Fix a single point
      # (underconstrained, solution up to a rotation in UV space)
-     w = wp.select(s.qp_index == 0, 0.0, 1.0)
+     w = wp.where(s.qp_index == 0, 1.0, 0.0)
      return w * wp.dot(u(s), v(s))


warp/examples/fem/example_magnetostatics.py
@@ -60,8 +60,8 @@ def cube_to_cylinder_grad(x: wp.vec3):
      dir_grad = (wp.identity(n=3, dtype=float) - wp.outer(dir_xz, dir_xz)) / wp.length(pos_xz)

      abs_xz = wp.abs(pos_xz)
-     xinf_grad = wp.select(
-         abs_xz[0] > abs_xz[2], wp.vec3(0.0, 0.0, wp.sign(pos_xz[2])), wp.vec(wp.sign(pos_xz[0]), 0.0, 0.0)
+     xinf_grad = wp.where(
+         abs_xz[0] > abs_xz[2], wp.vec(wp.sign(pos_xz[0]), 0.0, 0.0), wp.vec3(0.0, 0.0, wp.sign(pos_xz[2]))
      )
      grad = dir_grad * wp.max(abs_xz) + wp.outer(dir_xz, xinf_grad)

@@ -85,10 +85,10 @@ def permeability_field(
      r = wp.sqrt(x * x + z * z)

      if r <= core_radius:
-         return wp.select(y < core_height, MU_0, MU_i)
+         return wp.where(y < core_height, MU_i, MU_0)

      if r >= coil_internal_radius and r <= coil_external_radius:
-         return wp.select(y < coil_height, MU_0, MU_c)
+         return wp.where(y < coil_height, MU_c, MU_0)

      return MU_0

@@ -107,10 +107,10 @@ def current_field(

      r = wp.sqrt(x * x + z * z)

-     return wp.select(
+     return wp.where(
          y < coil_height and r >= coil_internal_radius and r <= coil_external_radius,
-         wp.vec3(0.0),
          wp.vec3(z, 0.0, -x) * current / r,
+         wp.vec3(0.0),
      )

warp/examples/fem/utils.py
@@ -34,6 +34,9 @@ __all__ = [
      "Plot",
  ]

+ # matrix inversion routines contain nested loops,
+ # default unrolling leads to code explosion
+ wp.set_module_options({"max_unroll": 6})

  #
  # Mesh utilities
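The wp.set_module_options call added above caps loop unrolling for every kernel defined in the module: a fixed-trip-count loop with more than max_unroll iterations is emitted as a real loop instead of being fully unrolled, which keeps the nested loops in the matrix-inversion routines from exploding the generated code. A standalone sketch of the same mechanism (the kernel is illustrative, not from the package):

    import warp as wp

    # Applies to every kernel defined in this module from here on
    wp.set_module_options({"max_unroll": 6})


    @wp.kernel
    def row_reduce(v: wp.array2d(dtype=float), out: wp.array(dtype=float)):
        i = wp.tid()
        s = float(0.0)
        # 16 iterations > max_unroll, so this compiles to a loop rather than
        # 16 unrolled statements
        for j in range(16):
            s += v[i, j]
        out[i] = s


    v = wp.ones((2, 16), dtype=float)
    out = wp.zeros(2, dtype=float)
    wp.launch(row_reduce, dim=2, inputs=[v, out])
    print(out.numpy())  # [16. 16.]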
@@ -225,6 +228,7 @@ def bsr_cg(
      mv_routine=None,
      quiet=False,
      method: str = "cg",
+     M: BsrMatrix = None,
  ) -> Tuple[float, int]:
      """Solves the linear system A x = b using an iterative solver, optionally with diagonal preconditioning

@@ -245,7 +249,9 @@


      """

-     if mv_routine is None:
+     if M is not None:
+         M = aslinearoperator(M)
+     elif mv_routine is None:
          M = preconditioner(A, "diag") if use_diag_precond else None
      else:
          A = LinearOperator(A.shape, A.dtype, A.device, matvec=mv_routine)
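With the change above, bsr_cg accepts a prebuilt preconditioner through the new M argument (wrapped with aslinearoperator() before use) instead of always deriving a diagonal preconditioner from A. A hedged usage sketch; the toy system and the use of warp.sparse.bsr_diag here are assumptions, not taken from the package:

    import warp as wp
    from warp.examples.fem.utils import bsr_cg
    from warp.sparse import bsr_diag

    n = 16
    # Toy diagonal system A x = b, preconditioned with the exact inverse of A
    A = bsr_diag(wp.full(n, 2.0, dtype=wp.float32))
    M = bsr_diag(wp.full(n, 0.5, dtype=wp.float32))

    b = wp.ones(n, dtype=wp.float32)
    x = wp.zeros_like(b)

    err, iters = bsr_cg(A, b, x, M=M, quiet=True)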
@@ -458,7 +464,7 @@ def bsr_solve_saddle(
      return err, end_iter


- @wp.kernel
+ @wp.kernel(enable_backward=False)
  def _compute_schur_inverse_diagonal(
      B_offsets: wp.array(dtype=int),
      B_indices: wp.array(dtype=int),

@@ -500,7 +506,7 @@ def invert_diagonal_bsr_matrix(A: BsrMatrix):
      )


- @wp.kernel
+ @wp.kernel(enable_backward=False)
  def _block_diagonal_invert(values: wp.array(dtype=Any)):
      i = wp.tid()
      values[i] = fem.utils.inverse_qr(values[i])
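These two helper kernels now opt out of adjoint code generation: a kernel declared with enable_backward=False gets no backward pass, which shortens compilation and shrinks the generated module for kernels that never participate in differentiation, as these solver-internal ones do not. A minimal sketch:

    import warp as wp


    # No adjoint is generated for this kernel, so it cannot be differentiated,
    # but it compiles faster and emits less code
    @wp.kernel(enable_backward=False)
    def scale_inplace(values: wp.array(dtype=float), s: float):
        i = wp.tid()
        values[i] = values[i] * s


    data = wp.full(8, 3.0, dtype=float)
    wp.launch(scale_inplace, dim=data.shape, inputs=[data, 0.5])
    print(data.numpy())  # eight values of 1.5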