warp_lang-0.11.0-py3-none-manylinux2014_x86_64.whl → warp_lang-1.0.0-py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Files changed (170)
  1. warp/__init__.py +8 -0
  2. warp/bin/warp-clang.so +0 -0
  3. warp/bin/warp.so +0 -0
  4. warp/build.py +7 -6
  5. warp/build_dll.py +70 -79
  6. warp/builtins.py +10 -6
  7. warp/codegen.py +51 -19
  8. warp/config.py +7 -8
  9. warp/constants.py +3 -0
  10. warp/context.py +948 -245
  11. warp/dlpack.py +198 -113
  12. warp/examples/assets/bunny.usd +0 -0
  13. warp/examples/assets/cartpole.urdf +110 -0
  14. warp/examples/assets/crazyflie.usd +0 -0
  15. warp/examples/assets/cube.usda +42 -0
  16. warp/examples/assets/nv_ant.xml +92 -0
  17. warp/examples/assets/nv_humanoid.xml +183 -0
  18. warp/examples/assets/quadruped.urdf +268 -0
  19. warp/examples/assets/rocks.nvdb +0 -0
  20. warp/examples/assets/rocks.usd +0 -0
  21. warp/examples/assets/sphere.usda +56 -0
  22. warp/examples/assets/torus.usda +105 -0
  23. warp/examples/benchmarks/benchmark_api.py +383 -0
  24. warp/examples/benchmarks/benchmark_cloth.py +279 -0
  25. warp/examples/benchmarks/benchmark_cloth_cupy.py +88 -0
  26. warp/examples/benchmarks/benchmark_cloth_jax.py +100 -0
  27. warp/examples/benchmarks/benchmark_cloth_numba.py +142 -0
  28. warp/examples/benchmarks/benchmark_cloth_numpy.py +77 -0
  29. warp/examples/benchmarks/benchmark_cloth_pytorch.py +86 -0
  30. warp/examples/benchmarks/benchmark_cloth_taichi.py +112 -0
  31. warp/examples/benchmarks/benchmark_cloth_warp.py +146 -0
  32. warp/examples/benchmarks/benchmark_launches.py +295 -0
  33. warp/examples/core/example_dem.py +221 -0
  34. warp/examples/core/example_fluid.py +267 -0
  35. warp/examples/core/example_graph_capture.py +129 -0
  36. warp/examples/core/example_marching_cubes.py +177 -0
  37. warp/examples/core/example_mesh.py +154 -0
  38. warp/examples/core/example_mesh_intersect.py +193 -0
  39. warp/examples/core/example_nvdb.py +169 -0
  40. warp/examples/core/example_raycast.py +89 -0
  41. warp/examples/core/example_raymarch.py +178 -0
  42. warp/examples/core/example_render_opengl.py +141 -0
  43. warp/examples/core/example_sph.py +389 -0
  44. warp/examples/core/example_torch.py +181 -0
  45. warp/examples/core/example_wave.py +249 -0
  46. warp/examples/fem/bsr_utils.py +380 -0
  47. warp/examples/fem/example_apic_fluid.py +391 -0
  48. warp/examples/fem/example_convection_diffusion.py +168 -0
  49. warp/examples/fem/example_convection_diffusion_dg.py +209 -0
  50. warp/examples/fem/example_convection_diffusion_dg0.py +194 -0
  51. warp/examples/fem/example_deformed_geometry.py +159 -0
  52. warp/examples/fem/example_diffusion.py +173 -0
  53. warp/examples/fem/example_diffusion_3d.py +152 -0
  54. warp/examples/fem/example_diffusion_mgpu.py +214 -0
  55. warp/examples/fem/example_mixed_elasticity.py +222 -0
  56. warp/examples/fem/example_navier_stokes.py +243 -0
  57. warp/examples/fem/example_stokes.py +192 -0
  58. warp/examples/fem/example_stokes_transfer.py +249 -0
  59. warp/examples/fem/mesh_utils.py +109 -0
  60. warp/examples/fem/plot_utils.py +287 -0
  61. warp/examples/optim/example_bounce.py +248 -0
  62. warp/examples/optim/example_cloth_throw.py +210 -0
  63. warp/examples/optim/example_diffray.py +535 -0
  64. warp/examples/optim/example_drone.py +850 -0
  65. warp/examples/optim/example_inverse_kinematics.py +169 -0
  66. warp/examples/optim/example_inverse_kinematics_torch.py +170 -0
  67. warp/examples/optim/example_spring_cage.py +234 -0
  68. warp/examples/optim/example_trajectory.py +201 -0
  69. warp/examples/sim/example_cartpole.py +128 -0
  70. warp/examples/sim/example_cloth.py +184 -0
  71. warp/examples/sim/example_granular.py +113 -0
  72. warp/examples/sim/example_granular_collision_sdf.py +185 -0
  73. warp/examples/sim/example_jacobian_ik.py +213 -0
  74. warp/examples/sim/example_particle_chain.py +106 -0
  75. warp/examples/sim/example_quadruped.py +179 -0
  76. warp/examples/sim/example_rigid_chain.py +191 -0
  77. warp/examples/sim/example_rigid_contact.py +176 -0
  78. warp/examples/sim/example_rigid_force.py +126 -0
  79. warp/examples/sim/example_rigid_gyroscopic.py +97 -0
  80. warp/examples/sim/example_rigid_soft_contact.py +124 -0
  81. warp/examples/sim/example_soft_body.py +178 -0
  82. warp/fabric.py +29 -20
  83. warp/fem/cache.py +0 -1
  84. warp/fem/dirichlet.py +0 -2
  85. warp/fem/integrate.py +0 -1
  86. warp/jax.py +45 -0
  87. warp/jax_experimental.py +339 -0
  88. warp/native/builtin.h +12 -0
  89. warp/native/bvh.cu +18 -18
  90. warp/native/clang/clang.cpp +8 -3
  91. warp/native/cuda_util.cpp +94 -5
  92. warp/native/cuda_util.h +35 -6
  93. warp/native/cutlass_gemm.cpp +1 -1
  94. warp/native/cutlass_gemm.cu +4 -1
  95. warp/native/error.cpp +66 -0
  96. warp/native/error.h +27 -0
  97. warp/native/mesh.cu +2 -2
  98. warp/native/reduce.cu +4 -4
  99. warp/native/runlength_encode.cu +2 -2
  100. warp/native/scan.cu +2 -2
  101. warp/native/sparse.cu +0 -1
  102. warp/native/temp_buffer.h +2 -2
  103. warp/native/warp.cpp +95 -60
  104. warp/native/warp.cu +1053 -218
  105. warp/native/warp.h +49 -32
  106. warp/optim/linear.py +33 -16
  107. warp/render/render_opengl.py +202 -101
  108. warp/render/render_usd.py +82 -40
  109. warp/sim/__init__.py +13 -4
  110. warp/sim/articulation.py +4 -5
  111. warp/sim/collide.py +320 -175
  112. warp/sim/import_mjcf.py +25 -30
  113. warp/sim/import_urdf.py +94 -63
  114. warp/sim/import_usd.py +51 -36
  115. warp/sim/inertia.py +3 -2
  116. warp/sim/integrator.py +233 -0
  117. warp/sim/integrator_euler.py +447 -469
  118. warp/sim/integrator_featherstone.py +1991 -0
  119. warp/sim/integrator_xpbd.py +1420 -640
  120. warp/sim/model.py +765 -487
  121. warp/sim/particles.py +2 -1
  122. warp/sim/render.py +35 -13
  123. warp/sim/utils.py +222 -11
  124. warp/stubs.py +8 -0
  125. warp/tape.py +16 -1
  126. warp/tests/aux_test_grad_customs.py +23 -0
  127. warp/tests/test_array.py +190 -1
  128. warp/tests/test_async.py +656 -0
  129. warp/tests/test_bool.py +50 -0
  130. warp/tests/test_dlpack.py +164 -11
  131. warp/tests/test_examples.py +166 -74
  132. warp/tests/test_fem.py +8 -1
  133. warp/tests/test_generics.py +15 -5
  134. warp/tests/test_grad.py +1 -1
  135. warp/tests/test_grad_customs.py +172 -12
  136. warp/tests/test_jax.py +254 -0
  137. warp/tests/test_large.py +29 -6
  138. warp/tests/test_launch.py +25 -0
  139. warp/tests/test_linear_solvers.py +20 -3
  140. warp/tests/test_matmul.py +61 -16
  141. warp/tests/test_matmul_lite.py +13 -13
  142. warp/tests/test_mempool.py +186 -0
  143. warp/tests/test_multigpu.py +3 -0
  144. warp/tests/test_options.py +16 -2
  145. warp/tests/test_peer.py +137 -0
  146. warp/tests/test_print.py +3 -1
  147. warp/tests/test_quat.py +23 -0
  148. warp/tests/test_sim_kinematics.py +97 -0
  149. warp/tests/test_snippet.py +126 -3
  150. warp/tests/test_streams.py +108 -79
  151. warp/tests/test_torch.py +16 -8
  152. warp/tests/test_utils.py +32 -27
  153. warp/tests/test_verify_fp.py +65 -0
  154. warp/tests/test_volume.py +1 -1
  155. warp/tests/unittest_serial.py +2 -0
  156. warp/tests/unittest_suites.py +12 -0
  157. warp/tests/unittest_utils.py +14 -7
  158. warp/thirdparty/unittest_parallel.py +15 -3
  159. warp/torch.py +10 -8
  160. warp/types.py +363 -246
  161. warp/utils.py +143 -19
  162. warp_lang-1.0.0.dist-info/LICENSE.md +126 -0
  163. warp_lang-1.0.0.dist-info/METADATA +394 -0
  164. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/RECORD +167 -86
  165. warp/sim/optimizer.py +0 -138
  166. warp_lang-0.11.0.dist-info/LICENSE.md +0 -36
  167. warp_lang-0.11.0.dist-info/METADATA +0 -238
  168. /warp/tests/{walkthough_debug.py → walkthrough_debug.py} +0 -0
  169. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/WHEEL +0 -0
  170. {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/top_level.txt +0 -0
warp/tests/test_sim_kinematics.py ADDED
@@ -0,0 +1,97 @@
+ # Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
+ # and proprietary rights in and to this software, related documentation
+ # and any modifications thereto. Any use, reproduction, disclosure or
+ # distribution of this software and related documentation without an express
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+ import unittest
+
+ import warp as wp
+ from warp.tests.unittest_utils import *
+
+ import math
+ import os
+
+ import numpy as np
+
+ import warp as wp
+ import warp.sim
+
+ wp.init()
+
+
+ def test_fk_ik(test, device):
+
+     builder = wp.sim.ModelBuilder()
+
+     num_envs = 1
+
+     for i in range(num_envs):
+         wp.sim.parse_mjcf(
+             os.path.join(os.path.dirname(__file__), "../examples/assets/nv_ant.xml"),
+             builder,
+             stiffness=0.0,
+             damping=1.0,
+             armature=0.1,
+             contact_ke=1.0e4,
+             contact_kd=1.0e2,
+             contact_kf=1.0e2,
+             contact_mu=0.75,
+             limit_ke=1.0e3,
+             limit_kd=1.0e1,
+             up_axis="y",
+         )
+
+         coord_count = 15
+         dof_count = 14
+
+         coord_start = i * coord_count
+         dof_start = i * dof_count
+
+         # base
+         builder.joint_q[coord_start : coord_start + 3] = [i * 2.0, 0.70, 0.0]
+         builder.joint_q[coord_start + 3 : coord_start + 7] = wp.quat_from_axis_angle(
+             wp.vec3(1.0, 0.0, 0.0), -math.pi * 0.5
+         )
+
+         # joints
+         builder.joint_q[coord_start + 7 : coord_start + coord_count] = [0.0, 1.0, 0.0, -1.0, 0.0, -1.0, 0.0, 1.0]
+         builder.joint_qd[dof_start + 6 : dof_start + dof_count] = [1.0, 1.0, 1.0, -1.0, 1.0, -1.0, 1.0, 1.0]
+
+     # finalize model
+     model = builder.finalize()
+     model.ground = True
+     model.joint_attach_ke *= 16.0
+     model.joint_attach_kd *= 4.0
+
+     state = model.state()
+
+     # save a copy of joint values
+     q_fk = model.joint_q.numpy()
+     qd_fk = model.joint_qd.numpy()
+
+     wp.sim.eval_fk(model, model.joint_q, model.joint_qd, None, state)
+
+     q_ik = wp.zeros_like(model.joint_q)
+     qd_ik = wp.zeros_like(model.joint_qd)
+
+     wp.sim.eval_ik(model, state, q_ik, qd_ik)
+
+     assert_np_equal(q_fk, q_ik.numpy(), tol=1e-6)
+     assert_np_equal(qd_fk, qd_ik.numpy(), tol=1e-6)
+
+
+ devices = get_test_devices()
+
+
+ class TestSimKinematics(unittest.TestCase):
+     pass
+
+
+ add_function_test(TestSimKinematics, "test_fk_ik", test_fk_ik, devices=devices)
+
+
+ if __name__ == "__main__":
+     wp.build.clear_kernel_cache()
+     unittest.main(verbosity=2, failfast=True)
warp/tests/test_snippet.py CHANGED
@@ -13,9 +13,9 @@ def test_basic(test, device):
      out[tid] = a * x[tid] + y[tid];
      """
      adj_snippet = """
-     adj_a = x[tid] * adj_out[tid];
-     adj_x[tid] = a * adj_out[tid];
-     adj_y[tid] = adj_out[tid];
+     adj_a += x[tid] * adj_out[tid];
+     adj_x[tid] += a * adj_out[tid];
+     adj_y[tid] += adj_out[tid];
      """

      @wp.func_native(snippet, adj_snippet)
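
The fix above replaces plain assignment with `+=` in the adjoint snippet. Adjoints must accumulate: a value that feeds several downstream expressions receives one gradient contribution per use, and assignment would silently keep only the last one. A minimal sketch (not part of the diff; kernel and array names are illustrative) showing the accumulation at work:

import numpy as np
import warp as wp

wp.init()

@wp.kernel
def two_uses(a: wp.array(dtype=float), out: wp.array(dtype=float)):
    tid = wp.tid()
    # a[0] is used twice, so d(out)/d(a[0]) = 2.0 only if both
    # backward contributions are accumulated into adj_a[0]
    out[tid] = a[0] + a[0]

a = wp.array(np.ones(1, dtype=np.float32), requires_grad=True)
out = wp.zeros(1, dtype=float, requires_grad=True)

tape = wp.Tape()
with tape:
    wp.launch(two_uses, dim=1, inputs=[a], outputs=[out])

tape.backward(grads={out: wp.array(np.ones(1, dtype=np.float32))})
print(a.grad.numpy())  # [2.0]; an overwriting adjoint would report 1.0
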
@@ -86,6 +86,7 @@ def test_shared_memory(test, device):

      @wp.func_native(snippet)
      def reverse(d: wp.array(dtype=int), N: int, tid: int):
+         """Reverse the array d in place using shared memory."""
          return

      @wp.kernel
@@ -100,6 +101,7 @@ def test_shared_memory(test, device):
      wp.launch(kernel=reverse_kernel, dim=N, inputs=[x, N], device=device)

      assert_np_equal(x.numpy(), y)
+     assert reverse.__doc__ == "Reverse the array d in place using shared memory."


  def test_cpu_snippet(test, device):
@@ -130,6 +132,124 @@ def test_cpu_snippet(test, device):
      assert_np_equal(out.numpy(), np.arange(1, N + 1, 1, dtype=np.int32))


+ def test_custom_replay_grad(test, device):
+     num_threads = 16
+     counter = wp.zeros(1, dtype=wp.int32, device=device)
+     thread_ids = wp.zeros(num_threads, dtype=wp.int32, device=device)
+     inputs = wp.array(np.arange(num_threads, dtype=np.float32), device=device, requires_grad=True)
+     outputs = wp.zeros_like(inputs)
+
+     snippet = """
+     int next_index = atomicAdd(counter, 1);
+     thread_values[tid] = next_index;
+     """
+     replay_snippet = ""
+
+     @wp.func_native(snippet, replay_snippet=replay_snippet)
+     def reversible_increment(
+         counter: wp.array(dtype=int), thread_values: wp.array(dtype=int), tid: int
+     ):
+         ...
+
+     @wp.kernel
+     def run_atomic_add(
+         input: wp.array(dtype=float),
+         counter: wp.array(dtype=int),
+         thread_values: wp.array(dtype=int),
+         output: wp.array(dtype=float),
+     ):
+         tid = wp.tid()
+         reversible_increment(counter, thread_values, tid)
+         idx = thread_values[tid]
+         output[idx] = input[idx] ** 2.0
+
+     tape = wp.Tape()
+     with tape:
+         wp.launch(
+             run_atomic_add, dim=num_threads, inputs=[inputs, counter, thread_ids], outputs=[outputs], device=device
+         )
+
+     tape.backward(grads={outputs: wp.array(np.ones(num_threads, dtype=np.float32), device=device)})
+     assert_np_equal(inputs.grad.numpy(), 2.0 * inputs.numpy(), tol=1e-4)
+
+
+ def test_replay_simplification(test, device):
+     num_threads = 8
+     x = wp.array(1.0 + np.arange(num_threads, dtype=np.float32), device=device, requires_grad=True)
+     y = wp.zeros_like(x)
+     z = wp.zeros_like(x)
+
+     snippet = "y[tid] = powf(x[tid], 2.0);"
+     replay_snippet = "y[tid] = x[tid];"
+     adj_snippet = "adj_x[tid] += 2.0 * adj_y[tid];"
+
+     @wp.func_native(snippet, adj_snippet=adj_snippet, replay_snippet=replay_snippet)
+     def square(x: wp.array(dtype=float), y: wp.array(dtype=float), tid: int):
+         ...
+
+     @wp.kernel
+     def log_square_kernel(
+         x: wp.array(dtype=float),
+         y: wp.array(dtype=float),
+         z: wp.array(dtype=float)
+     ):
+         tid = wp.tid()
+         square(x, y, tid)
+         z[tid] = wp.log(y[tid])
+
+     tape = wp.Tape()
+     with tape:
+         wp.launch(log_square_kernel, dim=num_threads, inputs=[x, y], outputs=[z], device=device)
+
+     tape.backward(grads={z: wp.array(np.ones(num_threads, dtype=np.float32), device=device)})
+     assert_np_equal(x.grad.numpy(), 2.0 / (1.0 + np.arange(num_threads)), tol=1e-6)
+
+
+ def test_recompile_snippet(test, device):
+     snippet = """
+     int inc = 1;
+     out[tid] = x[tid] + inc;
+     """
+
+     @wp.func_native(snippet)
+     def increment_snippet(
+         x: wp.array(dtype=wp.int32),
+         out: wp.array(dtype=wp.int32),
+         tid: int,
+     ):
+         ...
+
+     @wp.kernel
+     def increment(x: wp.array(dtype=wp.int32), out: wp.array(dtype=wp.int32)):
+         tid = wp.tid()
+         increment_snippet(x, out, tid)
+
+     N = 128
+     x = wp.array(np.arange(N, dtype=np.int32), dtype=wp.int32, device=device)
+     out = wp.zeros(N, dtype=wp.int32, device=device)
+
+     wp.launch(kernel=increment, dim=N, inputs=[x], outputs=[out], device=device)
+
+     assert_np_equal(out.numpy(), np.arange(1, N + 1, 1, dtype=np.int32))
+
+     snippet = """
+     int inc = 2;
+     out[tid] = x[tid] + inc;
+     """
+
+     @wp.func_native(snippet)
+     def increment_snippet(
+         x: wp.array(dtype=wp.int32),
+         out: wp.array(dtype=wp.int32),
+         tid: int,
+     ):
+         ...
+
+     wp.launch(kernel=increment, dim=N, inputs=[x], outputs=[out], device=device)
+
+     assert_np_equal(out.numpy(), 1 + np.arange(1, N + 1, 1, dtype=np.int32))
+
+
  class TestSnippets(unittest.TestCase):
      pass

@@ -137,6 +257,9 @@ class TestSnippets(unittest.TestCase):
  add_function_test(TestSnippets, "test_basic", test_basic, devices=get_unique_cuda_test_devices())
  add_function_test(TestSnippets, "test_shared_memory", test_shared_memory, devices=get_unique_cuda_test_devices())
  add_function_test(TestSnippets, "test_cpu_snippet", test_cpu_snippet, devices=["cpu"])
+ add_function_test(TestSnippets, "test_custom_replay_grad", test_custom_replay_grad, devices=get_unique_cuda_test_devices())
+ add_function_test(TestSnippets, "test_replay_simplification", test_replay_simplification, devices=get_unique_cuda_test_devices())
+ add_function_test(TestSnippets, "test_recompile_snippet", test_recompile_snippet, devices=get_unique_cuda_test_devices())


  if __name__ == "__main__":
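
These new tests pin down `replay_snippet`, the third code slot on `wp.func_native`: during `tape.backward()`, Warp re-runs the forward computation to restore intermediate values before the adjoint pass, and a replay snippet substitutes a custom body for that re-run. `test_custom_replay_grad` supplies an empty replay so the `atomicAdd` is not executed a second time (the `thread_values` written in the forward pass are simply reused), while `test_replay_simplification` replays a cheaper stand-in. The three roles side by side, condensed from the tests above:

import warp as wp

wp.init()

snippet = "y[tid] = powf(x[tid], 2.0);"          # forward pass, recorded on the tape
replay_snippet = "y[tid] = x[tid];"              # re-run during tape.backward()
adj_snippet = "adj_x[tid] += 2.0 * adj_y[tid];"  # adjoint; note the accumulation

@wp.func_native(snippet, adj_snippet=adj_snippet, replay_snippet=replay_snippet)
def square(x: wp.array(dtype=float), y: wp.array(dtype=float), tid: int):
    ...
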
warp/tests/test_streams.py CHANGED
@@ -10,6 +10,7 @@ import unittest
  import numpy as np

  import warp as wp
+ from warp.utils import check_iommu
  from warp.tests.unittest_utils import *

  wp.init()
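
`check_iommu()` comes from `warp/utils.py` (see the files list) and gates the multi-GPU graph tests below: it returns `True` when peer-to-peer transfers look usable and `False` when the IOMMU appears to be enabled, a configuration known to interfere with CUDA peer-to-peer access on Linux. The skip pattern in isolation (the class and method names here are hypothetical):

import unittest

from warp.utils import check_iommu

class MultiGpuTests(unittest.TestCase):  # hypothetical example class
    # skipped when check_iommu() returns False, i.e. when the IOMMU
    # appears enabled and peer-to-peer transfers may fail
    @unittest.skipUnless(check_iommu(), "IOMMU seems enabled")
    def test_requires_p2p(self):
        pass
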
@@ -37,19 +38,34 @@ def sum(a: wp.array(dtype=float), b: wp.array(dtype=float), c: wp.array(dtype=float)):
  N = 10 * 1024 * 1024


- def test_stream_arg_implicit_sync(test, device):
-     # wp.zeros() and array.numpy() should not require explicit sync
+ def test_stream_set(test, device):

+     device = wp.get_device(device)
+
+     old_stream = device.stream
+     new_stream = wp.Stream(device)
+
+     try:
+         wp.set_stream(new_stream, device)
+
+         test.assertTrue(device.has_stream)
+         test.assertEqual(device.stream, new_stream)
+
+     finally:
+         # restore original stream
+         wp.set_stream(old_stream, device)
+
+
+ def test_stream_arg_explicit_sync(test, device):
      a = wp.zeros(N, dtype=float, device=device)
-     b = wp.empty(N, dtype=float, device=device)
+     b = wp.full(N, 42, dtype=float, device=device)
      c = wp.empty(N, dtype=float, device=device)

+     old_stream = wp.get_stream(device)
      new_stream = wp.Stream(device)

-     # Exercise code path
-     wp.set_stream(new_stream, device)
-
-     test.assertTrue(wp.get_device(device).has_stream)
+     # allocations need to be explicitly synced before launching work using stream arguments
+     new_stream.wait_stream(old_stream)

      # launch work on new stream
      wp.launch(inc, dim=a.size, inputs=[a], stream=new_stream)
@@ -64,17 +80,17 @@ def test_stream_arg_implicit_sync(test, device):


  def test_stream_scope_implicit_sync(test, device):
-     # wp.zeros() and array.numpy() should not require explicit sync

      with wp.ScopedDevice(device):
          a = wp.zeros(N, dtype=float)
-         b = wp.empty(N, dtype=float)
+         b = wp.full(N, 42, dtype=float)
          c = wp.empty(N, dtype=float)

          old_stream = wp.get_stream()
          new_stream = wp.Stream()

          # launch work on new stream
+         # allocations are implicitly synced when entering wp.ScopedStream
          with wp.ScopedStream(new_stream):
              assert wp.get_stream() == new_stream

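Taken together, the renamed tests pin down the 1.0 ordering rules: a launch that passes an explicit `stream=` argument does not wait automatically for allocations queued on the device's current stream, so the caller adds the dependency with `wait_stream`, whereas entering `wp.ScopedStream` adds it implicitly. A condensed sketch of the explicit case (assuming a CUDA device; the `inc` kernel mirrors the one defined at the top of this test file):

import warp as wp

wp.init()

@wp.kernel
def inc(a: wp.array(dtype=float)):
    tid = wp.tid()
    a[tid] = a[tid] + 1.0

device = "cuda:0"
a = wp.zeros(1024, dtype=float, device=device)  # queued on the device's current stream

s = wp.Stream(device)
s.wait_stream(wp.get_stream(device))  # order the allocation before the launch
wp.launch(inc, dim=a.size, inputs=[a], stream=s)
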
@@ -309,103 +325,116 @@ class TestStreams(unittest.TestCase):
          cpu_stream = cpu_device.stream  # noqa: F841

      @unittest.skipUnless(len(wp.get_cuda_devices()) > 1, "Requires at least two CUDA devices")
+     @unittest.skipUnless(check_iommu(), "IOMMU seems enabled")
      def test_stream_arg_graph_mgpu(self):
          wp.load_module(device="cuda:0")
          wp.load_module(device="cuda:1")

-         # resources on GPU 0
-         stream0 = wp.get_stream("cuda:0")
-         a0 = wp.zeros(N, dtype=float, device="cuda:0")
-         b0 = wp.empty(N, dtype=float, device="cuda:0")
-         c0 = wp.empty(N, dtype=float, device="cuda:0")
+         # Peer-to-peer copies are not possible during graph capture if the arrays were
+         # allocated using pooled allocators and mempool access is not enabled.
+         # Here, we force default CUDA allocators and pre-allocate the memory.
+         with wp.ScopedMempool("cuda:0", False), wp.ScopedMempool("cuda:1", False):

-         # resources on GPU 1
-         stream1 = wp.get_stream("cuda:1")
-         a1 = wp.zeros(N, dtype=float, device="cuda:1")
+             # resources on GPU 0
+             stream0 = wp.get_stream("cuda:0")
+             a0 = wp.zeros(N, dtype=float, device="cuda:0")
+             b0 = wp.empty(N, dtype=float, device="cuda:0")
+             c0 = wp.empty(N, dtype=float, device="cuda:0")

-         # start recording on stream0
-         wp.capture_begin(stream=stream0, force_module_load=False)
-         try:
-             # branch into stream1
-             stream1.wait_stream(stream0)
+             # resources on GPU 1
+             stream1 = wp.get_stream("cuda:1")
+             a1 = wp.zeros(N, dtype=float, device="cuda:1")

-             # launch concurrent kernels on each stream
-             wp.launch(inc, dim=N, inputs=[a0], stream=stream0)
-             wp.launch(inc, dim=N, inputs=[a1], stream=stream1)
+             # start recording on stream0
+             wp.capture_begin(stream=stream0, force_module_load=False)
+             try:
+                 # branch into stream1
+                 stream1.wait_stream(stream0)

-             # wait for stream1 to finish
-             stream0.wait_stream(stream1)
+                 # launch concurrent kernels on each stream
+                 wp.launch(inc, dim=N, inputs=[a0], stream=stream0)
+                 wp.launch(inc, dim=N, inputs=[a1], stream=stream1)

-             # copy values from stream1
-             wp.copy(b0, a1, stream=stream0)
+                 # wait for stream1 to finish
+                 stream0.wait_stream(stream1)
+
+                 # copy values from stream1
+                 wp.copy(b0, a1, stream=stream0)

-             # compute sum
-             wp.launch(sum, dim=N, inputs=[a0, b0, c0], stream=stream0)
-         finally:
-             # finish recording on stream0
-             g = wp.capture_end(stream=stream0)
+                 # compute sum
+                 wp.launch(sum, dim=N, inputs=[a0, b0, c0], stream=stream0)
+             finally:
+                 # finish recording on stream0
+                 g = wp.capture_end(stream=stream0)

-         # replay
-         num_iters = 10
-         for _ in range(num_iters):
-             wp.capture_launch(g, stream=stream0)
+             # replay
+             num_iters = 10
+             for _ in range(num_iters):
+                 wp.capture_launch(g, stream=stream0)

-         # check results
-         assert_np_equal(c0.numpy(), np.full(N, fill_value=2 * num_iters))
+             # check results
+             assert_np_equal(c0.numpy(), np.full(N, fill_value=2 * num_iters))

      @unittest.skipUnless(len(wp.get_cuda_devices()) > 1, "Requires at least two CUDA devices")
+     @unittest.skipUnless(check_iommu(), "IOMMU seems enabled")
      def test_stream_scope_graph_mgpu(self):
          wp.load_module(device="cuda:0")
          wp.load_module(device="cuda:1")

-         # resources on GPU 0
-         with wp.ScopedDevice("cuda:0"):
-             stream0 = wp.get_stream()
-             a0 = wp.zeros(N, dtype=float)
-             b0 = wp.empty(N, dtype=float)
-             c0 = wp.empty(N, dtype=float)
-
-         # resources on GPU 1
-         with wp.ScopedDevice("cuda:1"):
-             stream1 = wp.get_stream()
-             a1 = wp.zeros(N, dtype=float)
-
-         # capture graph
-         with wp.ScopedDevice("cuda:0"):
-             # start recording
-             wp.capture_begin(force_module_load=False)
-             try:
-                 with wp.ScopedDevice("cuda:1"):
-                     # branch into stream1
-                     wp.wait_stream(stream0)
+         # Peer-to-peer copies are not possible during graph capture if the arrays were
+         # allocated using pooled allocators and mempool access is not enabled.
+         # Here, we force default CUDA allocators and pre-allocate the memory.
+         with wp.ScopedMempool("cuda:0", False), wp.ScopedMempool("cuda:1", False):

-                     wp.launch(inc, dim=N, inputs=[a1])
+             # resources on GPU 0
+             with wp.ScopedDevice("cuda:0"):
+                 stream0 = wp.get_stream()
+                 a0 = wp.zeros(N, dtype=float)
+                 b0 = wp.empty(N, dtype=float)
+                 c0 = wp.empty(N, dtype=float)

-                 wp.launch(inc, dim=N, inputs=[a0])
+             # resources on GPU 1
+             with wp.ScopedDevice("cuda:1"):
+                 stream1 = wp.get_stream()
+                 a1 = wp.zeros(N, dtype=float)

-                 # wait for stream1 to finish
-                 wp.wait_stream(stream1)
+             # capture graph
+             with wp.ScopedDevice("cuda:0"):
+                 # start recording
+                 wp.capture_begin(force_module_load=False)
+                 try:
+                     with wp.ScopedDevice("cuda:1"):
+                         # branch into stream1
+                         wp.wait_stream(stream0)

-                 # copy values from stream1
-                 wp.copy(b0, a1)
+                         wp.launch(inc, dim=N, inputs=[a1])

-                 # compute sum
-                 wp.launch(sum, dim=N, inputs=[a0, b0, c0])
-             finally:
-                 # finish recording
-                 g = wp.capture_end()
+                     wp.launch(inc, dim=N, inputs=[a0])

-             # replay
-             with wp.ScopedDevice("cuda:0"):
-                 num_iters = 10
-                 for _ in range(num_iters):
-                     wp.capture_launch(g)
+                     # wait for stream1 to finish
+                     wp.wait_stream(stream1)
+
+                     # copy values from stream1
+                     wp.copy(b0, a1)
+
+                     # compute sum
+                     wp.launch(sum, dim=N, inputs=[a0, b0, c0])
+                 finally:
+                     # finish recording
+                     g = wp.capture_end()
+
+             # replay
+             with wp.ScopedDevice("cuda:0"):
+                 num_iters = 10
+                 for _ in range(num_iters):
+                     wp.capture_launch(g)

-         # check results
-         assert_np_equal(c0.numpy(), np.full(N, fill_value=2 * num_iters))
+             # check results
+             assert_np_equal(c0.numpy(), np.full(N, fill_value=2 * num_iters))


- add_function_test(TestStreams, "test_stream_arg_implicit_sync", test_stream_arg_implicit_sync, devices=devices)
+ add_function_test(TestStreams, "test_stream_set", test_stream_set, devices=devices)
+ add_function_test(TestStreams, "test_stream_arg_explicit_sync", test_stream_arg_explicit_sync, devices=devices)
  add_function_test(TestStreams, "test_stream_scope_implicit_sync", test_stream_scope_implicit_sync, devices=devices)

  add_function_test(TestStreams, "test_stream_arg_synchronize", test_stream_arg_synchronize, devices=devices)
warp/tests/test_torch.py CHANGED
@@ -490,10 +490,14 @@ def test_torch_graph_torch_stream(test, device):

      # capture graph
      with wp.ScopedStream(warp_stream), torch.cuda.graph(g, stream=torch_stream):
-         t += 1.0
-         wp.launch(inc, dim=n, inputs=[a])
-         t += 1.0
-         wp.launch(inc, dim=n, inputs=[a])
+         wp.capture_begin(force_module_load=False, external=True)
+         try:
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+         finally:
+             wp.capture_end()

      # replay graph
      num_iters = 10
@@ -522,10 +526,14 @@ def test_torch_graph_warp_stream(test, device):

      # capture graph
      with wp.ScopedDevice(device), torch.cuda.graph(g, stream=torch_stream):
-         t += 1.0
-         wp.launch(inc, dim=n, inputs=[a])
-         t += 1.0
-         wp.launch(inc, dim=n, inputs=[a])
+         wp.capture_begin(force_module_load=False, external=True)
+         try:
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+             t += 1.0
+             wp.launch(inc, dim=n, inputs=[a])
+         finally:
+             wp.capture_end()

      # replay graph
      num_iters = 10
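
Both fixes wrap the Warp launches in `wp.capture_begin(..., external=True)` / `wp.capture_end()`: the capture is owned by `torch.cuda.graph`, and the `external=True` flag tells Warp to record into it rather than start (and later own) its own capture. A condensed sketch of the pattern; the stream pairing via `wp.stream_from_torch` is an assumption here, since the real tests set up their streams outside the visible hunks:

import torch
import warp as wp

wp.init()

@wp.kernel
def inc(a: wp.array(dtype=float)):
    tid = wp.tid()
    a[tid] = a[tid] + 1.0

n = 1024
a = wp.zeros(n, dtype=float, device="cuda:0")
wp.load_module(device="cuda:0")   # modules must be loaded before capture
wp.synchronize_device("cuda:0")   # finish pending work before capturing

torch_stream = torch.cuda.Stream()
warp_stream = wp.stream_from_torch(torch_stream)  # assumed interop helper

g = torch.cuda.CUDAGraph()
with wp.ScopedStream(warp_stream), torch.cuda.graph(g, stream=torch_stream):
    # torch owns the capture; external=True makes Warp record into it
    wp.capture_begin(force_module_load=False, external=True)
    try:
        wp.launch(inc, dim=n, inputs=[a])
    finally:
        # closes Warp's bookkeeping without ending torch's capture
        wp.capture_end()

g.replay()  # the captured launch runs only on replay
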
warp/tests/test_utils.py CHANGED
@@ -267,55 +267,60 @@ class TestUtils(unittest.TestCase):
      def test_warn(self):
          # Multiple warnings get printed out each time.
          with contextlib.redirect_stdout(io.StringIO()) as f:
-             frame_info = inspect.getframeinfo(inspect.currentframe())
              wp.utils.warn("hello, world!")
              wp.utils.warn("hello, world!")

          expected = (
-             "{}:{}: {}\n"
-             "{}:{}: {}\n"
-         ).format(
-             frame_info.filename,
-             frame_info.lineno + 1,
-             "UserWarning: hello, world!\n wp.utils.warn(\"hello, world!\")",
-             frame_info.filename,
-             frame_info.lineno + 2,
-             "UserWarning: hello, world!\n wp.utils.warn(\"hello, world!\")",
+             "Warp UserWarning: hello, world!\n"
+             "Warp UserWarning: hello, world!\n"
          )
+
          self.assertEqual(f.getvalue(), expected)

+         # Test verbose warnings
+         saved_verbosity = wp.config.verbose_warnings
+         try:
+             wp.config.verbose_warnings = True
+             with contextlib.redirect_stdout(io.StringIO()) as f:
+                 frame_info = inspect.getframeinfo(inspect.currentframe())
+                 wp.utils.warn("hello, world!")
+                 wp.utils.warn("hello, world!")
+
+             expected = (
+                 f"Warp UserWarning: hello, world! ({frame_info.filename}:{frame_info.lineno + 1})\n"
+                 " wp.utils.warn(\"hello, world!\")\n"
+                 f"Warp UserWarning: hello, world! ({frame_info.filename}:{frame_info.lineno + 2})\n"
+                 " wp.utils.warn(\"hello, world!\")\n"
+             )
+
+             self.assertEqual(f.getvalue(), expected)
+
+         finally:
+             # make sure to restore warning verbosity
+             wp.config.verbose_warnings = saved_verbosity
+
+
          # Multiple similar deprecation warnings get printed out only once.
          with contextlib.redirect_stdout(io.StringIO()) as f:
-             frame_info = inspect.getframeinfo(inspect.currentframe())
              wp.utils.warn("hello, world!", category=DeprecationWarning)
              wp.utils.warn("hello, world!", category=DeprecationWarning)

          expected = (
-             "{}:{}: {}\n"
-         ).format(
-             frame_info.filename,
-             frame_info.lineno + 1,
-             "DeprecationWarning: hello, world!\n wp.utils.warn(\"hello, world!\", category=DeprecationWarning)",
+             "Warp DeprecationWarning: hello, world!\n"
          )
+
          self.assertEqual(f.getvalue(), expected)

          # Multiple different deprecation warnings get printed out each time.
          with contextlib.redirect_stdout(io.StringIO()) as f:
-             frame_info = inspect.getframeinfo(inspect.currentframe())
              wp.utils.warn("foo", category=DeprecationWarning)
              wp.utils.warn("bar", category=DeprecationWarning)

          expected = (
-             "{}:{}: {}\n"
-             "{}:{}: {}\n"
-         ).format(
-             frame_info.filename,
-             frame_info.lineno + 1,
-             "DeprecationWarning: foo\n wp.utils.warn(\"foo\", category=DeprecationWarning)",
-             frame_info.filename,
-             frame_info.lineno + 2,
-             "DeprecationWarning: bar\n wp.utils.warn(\"bar\", category=DeprecationWarning)",
+             "Warp DeprecationWarning: foo\n"
+             "Warp DeprecationWarning: bar\n"
          )
+
          self.assertEqual(f.getvalue(), expected)

      def test_transform_expand(self):
@@ -425,7 +430,7 @@ class TestUtils(unittest.TestCase):
          with wp.ScopedTimer("hello", detailed=True):
              pass

-         self.assertRegex(f.getvalue(), r"^ 2 function calls in \d+\.\d+ seconds")
+         self.assertRegex(f.getvalue(), r"^ 4 function calls in \d+\.\d+ seconds")
          self.assertRegex(f.getvalue(), r"hello took \d+\.\d+ ms$")
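
The rewritten assertions document the new warning format: by default, `wp.utils.warn` prints a single `Warp <Category>: <message>` line (with repeated deprecation warnings deduplicated), and the file, line number, and offending source line appear only when `wp.config.verbose_warnings` is enabled. Opting in looks like:

import warp as wp
import warp.utils

wp.config.verbose_warnings = True
wp.utils.warn("hello, world!")
# prints, e.g.:
#   Warp UserWarning: hello, world! (/path/to/script.py:5)
#     wp.utils.warn("hello, world!")
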