warp-lang 0.11.0-py3-none-manylinux2014_x86_64.whl → 1.0.0-py3-none-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
- warp/__init__.py +8 -0
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +7 -6
- warp/build_dll.py +70 -79
- warp/builtins.py +10 -6
- warp/codegen.py +51 -19
- warp/config.py +7 -8
- warp/constants.py +3 -0
- warp/context.py +948 -245
- warp/dlpack.py +198 -113
- warp/examples/assets/bunny.usd +0 -0
- warp/examples/assets/cartpole.urdf +110 -0
- warp/examples/assets/crazyflie.usd +0 -0
- warp/examples/assets/cube.usda +42 -0
- warp/examples/assets/nv_ant.xml +92 -0
- warp/examples/assets/nv_humanoid.xml +183 -0
- warp/examples/assets/quadruped.urdf +268 -0
- warp/examples/assets/rocks.nvdb +0 -0
- warp/examples/assets/rocks.usd +0 -0
- warp/examples/assets/sphere.usda +56 -0
- warp/examples/assets/torus.usda +105 -0
- warp/examples/benchmarks/benchmark_api.py +383 -0
- warp/examples/benchmarks/benchmark_cloth.py +279 -0
- warp/examples/benchmarks/benchmark_cloth_cupy.py +88 -0
- warp/examples/benchmarks/benchmark_cloth_jax.py +100 -0
- warp/examples/benchmarks/benchmark_cloth_numba.py +142 -0
- warp/examples/benchmarks/benchmark_cloth_numpy.py +77 -0
- warp/examples/benchmarks/benchmark_cloth_pytorch.py +86 -0
- warp/examples/benchmarks/benchmark_cloth_taichi.py +112 -0
- warp/examples/benchmarks/benchmark_cloth_warp.py +146 -0
- warp/examples/benchmarks/benchmark_launches.py +295 -0
- warp/examples/core/example_dem.py +221 -0
- warp/examples/core/example_fluid.py +267 -0
- warp/examples/core/example_graph_capture.py +129 -0
- warp/examples/core/example_marching_cubes.py +177 -0
- warp/examples/core/example_mesh.py +154 -0
- warp/examples/core/example_mesh_intersect.py +193 -0
- warp/examples/core/example_nvdb.py +169 -0
- warp/examples/core/example_raycast.py +89 -0
- warp/examples/core/example_raymarch.py +178 -0
- warp/examples/core/example_render_opengl.py +141 -0
- warp/examples/core/example_sph.py +389 -0
- warp/examples/core/example_torch.py +181 -0
- warp/examples/core/example_wave.py +249 -0
- warp/examples/fem/bsr_utils.py +380 -0
- warp/examples/fem/example_apic_fluid.py +391 -0
- warp/examples/fem/example_convection_diffusion.py +168 -0
- warp/examples/fem/example_convection_diffusion_dg.py +209 -0
- warp/examples/fem/example_convection_diffusion_dg0.py +194 -0
- warp/examples/fem/example_deformed_geometry.py +159 -0
- warp/examples/fem/example_diffusion.py +173 -0
- warp/examples/fem/example_diffusion_3d.py +152 -0
- warp/examples/fem/example_diffusion_mgpu.py +214 -0
- warp/examples/fem/example_mixed_elasticity.py +222 -0
- warp/examples/fem/example_navier_stokes.py +243 -0
- warp/examples/fem/example_stokes.py +192 -0
- warp/examples/fem/example_stokes_transfer.py +249 -0
- warp/examples/fem/mesh_utils.py +109 -0
- warp/examples/fem/plot_utils.py +287 -0
- warp/examples/optim/example_bounce.py +248 -0
- warp/examples/optim/example_cloth_throw.py +210 -0
- warp/examples/optim/example_diffray.py +535 -0
- warp/examples/optim/example_drone.py +850 -0
- warp/examples/optim/example_inverse_kinematics.py +169 -0
- warp/examples/optim/example_inverse_kinematics_torch.py +170 -0
- warp/examples/optim/example_spring_cage.py +234 -0
- warp/examples/optim/example_trajectory.py +201 -0
- warp/examples/sim/example_cartpole.py +128 -0
- warp/examples/sim/example_cloth.py +184 -0
- warp/examples/sim/example_granular.py +113 -0
- warp/examples/sim/example_granular_collision_sdf.py +185 -0
- warp/examples/sim/example_jacobian_ik.py +213 -0
- warp/examples/sim/example_particle_chain.py +106 -0
- warp/examples/sim/example_quadruped.py +179 -0
- warp/examples/sim/example_rigid_chain.py +191 -0
- warp/examples/sim/example_rigid_contact.py +176 -0
- warp/examples/sim/example_rigid_force.py +126 -0
- warp/examples/sim/example_rigid_gyroscopic.py +97 -0
- warp/examples/sim/example_rigid_soft_contact.py +124 -0
- warp/examples/sim/example_soft_body.py +178 -0
- warp/fabric.py +29 -20
- warp/fem/cache.py +0 -1
- warp/fem/dirichlet.py +0 -2
- warp/fem/integrate.py +0 -1
- warp/jax.py +45 -0
- warp/jax_experimental.py +339 -0
- warp/native/builtin.h +12 -0
- warp/native/bvh.cu +18 -18
- warp/native/clang/clang.cpp +8 -3
- warp/native/cuda_util.cpp +94 -5
- warp/native/cuda_util.h +35 -6
- warp/native/cutlass_gemm.cpp +1 -1
- warp/native/cutlass_gemm.cu +4 -1
- warp/native/error.cpp +66 -0
- warp/native/error.h +27 -0
- warp/native/mesh.cu +2 -2
- warp/native/reduce.cu +4 -4
- warp/native/runlength_encode.cu +2 -2
- warp/native/scan.cu +2 -2
- warp/native/sparse.cu +0 -1
- warp/native/temp_buffer.h +2 -2
- warp/native/warp.cpp +95 -60
- warp/native/warp.cu +1053 -218
- warp/native/warp.h +49 -32
- warp/optim/linear.py +33 -16
- warp/render/render_opengl.py +202 -101
- warp/render/render_usd.py +82 -40
- warp/sim/__init__.py +13 -4
- warp/sim/articulation.py +4 -5
- warp/sim/collide.py +320 -175
- warp/sim/import_mjcf.py +25 -30
- warp/sim/import_urdf.py +94 -63
- warp/sim/import_usd.py +51 -36
- warp/sim/inertia.py +3 -2
- warp/sim/integrator.py +233 -0
- warp/sim/integrator_euler.py +447 -469
- warp/sim/integrator_featherstone.py +1991 -0
- warp/sim/integrator_xpbd.py +1420 -640
- warp/sim/model.py +765 -487
- warp/sim/particles.py +2 -1
- warp/sim/render.py +35 -13
- warp/sim/utils.py +222 -11
- warp/stubs.py +8 -0
- warp/tape.py +16 -1
- warp/tests/aux_test_grad_customs.py +23 -0
- warp/tests/test_array.py +190 -1
- warp/tests/test_async.py +656 -0
- warp/tests/test_bool.py +50 -0
- warp/tests/test_dlpack.py +164 -11
- warp/tests/test_examples.py +166 -74
- warp/tests/test_fem.py +8 -1
- warp/tests/test_generics.py +15 -5
- warp/tests/test_grad.py +1 -1
- warp/tests/test_grad_customs.py +172 -12
- warp/tests/test_jax.py +254 -0
- warp/tests/test_large.py +29 -6
- warp/tests/test_launch.py +25 -0
- warp/tests/test_linear_solvers.py +20 -3
- warp/tests/test_matmul.py +61 -16
- warp/tests/test_matmul_lite.py +13 -13
- warp/tests/test_mempool.py +186 -0
- warp/tests/test_multigpu.py +3 -0
- warp/tests/test_options.py +16 -2
- warp/tests/test_peer.py +137 -0
- warp/tests/test_print.py +3 -1
- warp/tests/test_quat.py +23 -0
- warp/tests/test_sim_kinematics.py +97 -0
- warp/tests/test_snippet.py +126 -3
- warp/tests/test_streams.py +108 -79
- warp/tests/test_torch.py +16 -8
- warp/tests/test_utils.py +32 -27
- warp/tests/test_verify_fp.py +65 -0
- warp/tests/test_volume.py +1 -1
- warp/tests/unittest_serial.py +2 -0
- warp/tests/unittest_suites.py +12 -0
- warp/tests/unittest_utils.py +14 -7
- warp/thirdparty/unittest_parallel.py +15 -3
- warp/torch.py +10 -8
- warp/types.py +363 -246
- warp/utils.py +143 -19
- warp_lang-1.0.0.dist-info/LICENSE.md +126 -0
- warp_lang-1.0.0.dist-info/METADATA +394 -0
- {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/RECORD +167 -86
- warp/sim/optimizer.py +0 -138
- warp_lang-0.11.0.dist-info/LICENSE.md +0 -36
- warp_lang-0.11.0.dist-info/METADATA +0 -238
- /warp/tests/{walkthough_debug.py → walkthrough_debug.py} +0 -0
- {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/WHEEL +0 -0
- {warp_lang-0.11.0.dist-info → warp_lang-1.0.0.dist-info}/top_level.txt +0 -0
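Reviewer note: the file list above adds a shared integrator base (warp/sim/integrator.py) and a new Featherstone integrator (warp/sim/integrator_featherstone.py), heavily reworks integrator_euler.py and integrator_xpbd.py, and removes warp/sim/optimizer.py. Below is a minimal sketch of how a Warp 1.0 sim integrator is typically stepped; the class name wp.sim.FeatherstoneIntegrator and the simulate(model, state_in, state_out, dt) signature are inferred from these file names and Warp's sim examples, not confirmed by this diff.

# Hypothetical sketch: stepping a model with the new Featherstone integrator.
# Class and method names here are assumptions, not taken from this diff.
import warp as wp
import warp.sim

wp.init()

builder = wp.sim.ModelBuilder()
builder.add_particle(pos=(0.0, 1.0, 0.0), vel=(0.0, 0.0, 0.0), mass=1.0)
model = builder.finalize()

integrator = wp.sim.FeatherstoneIntegrator(model)  # assumed class name
state_0, state_1 = model.state(), model.state()

dt = 1.0 / 60.0
for _ in range(60):
    state_0.clear_forces()
    integrator.simulate(model, state_0, state_1, dt)  # assumed signature
    state_0, state_1 = state_1, state_0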
warp/tests/test_sim_kinematics.py
ADDED
@@ -0,0 +1,97 @@
+# Copyright (c) 2022 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import unittest
+
+import warp as wp
+from warp.tests.unittest_utils import *
+
+import math
+import os
+
+import numpy as np
+
+import warp as wp
+import warp.sim
+
+wp.init()
+
+
+def test_fk_ik(test, device):
+
+    builder = wp.sim.ModelBuilder()
+
+    num_envs = 1
+
+    for i in range(num_envs):
+        wp.sim.parse_mjcf(
+            os.path.join(os.path.dirname(__file__), "../examples/assets/nv_ant.xml"),
+            builder,
+            stiffness=0.0,
+            damping=1.0,
+            armature=0.1,
+            contact_ke=1.0e4,
+            contact_kd=1.0e2,
+            contact_kf=1.0e2,
+            contact_mu=0.75,
+            limit_ke=1.0e3,
+            limit_kd=1.0e1,
+            up_axis="y",
+        )
+
+        coord_count = 15
+        dof_count = 14
+
+        coord_start = i * coord_count
+        dof_start = i * dof_count
+
+        # base
+        builder.joint_q[coord_start : coord_start + 3] = [i * 2.0, 0.70, 0.0]
+        builder.joint_q[coord_start + 3 : coord_start + 7] = wp.quat_from_axis_angle(
+            wp.vec3(1.0, 0.0, 0.0), -math.pi * 0.5
+        )
+
+        # joints
+        builder.joint_q[coord_start + 7 : coord_start + coord_count] = [0.0, 1.0, 0.0, -1.0, 0.0, -1.0, 0.0, 1.0]
+        builder.joint_qd[dof_start + 6 : dof_start + dof_count] = [1.0, 1.0, 1.0, -1.0, 1.0, -1.0, 1.0, 1.0]
+
+    # finalize model
+    model = builder.finalize()
+    model.ground = True
+    model.joint_attach_ke *= 16.0
+    model.joint_attach_kd *= 4.0
+
+    state = model.state()
+
+    # save a copy of joint values
+    q_fk = model.joint_q.numpy()
+    qd_fk = model.joint_qd.numpy()
+
+    wp.sim.eval_fk(model, model.joint_q, model.joint_qd, None, state)
+
+    q_ik = wp.zeros_like(model.joint_q)
+    qd_ik = wp.zeros_like(model.joint_qd)
+
+    wp.sim.eval_ik(model, state, q_ik, qd_ik)
+
+    assert_np_equal(q_fk, q_ik.numpy(), tol=1e-6)
+    assert_np_equal(qd_fk, qd_ik.numpy(), tol=1e-6)
+
+
+devices = get_test_devices()
+
+
+class TestSimKinematics(unittest.TestCase):
+    pass
+
+
+add_function_test(TestSimKinematics, "test_fk_ik", test_fk_ik, devices=devices)
+
+
+if __name__ == "__main__":
+    wp.build.clear_kernel_cache()
+    unittest.main(verbosity=2, failfast=True)
warp/tests/test_snippet.py
CHANGED
@@ -13,9 +13,9 @@ def test_basic(test, device):
     out[tid] = a * x[tid] + y[tid];
     """
     adj_snippet = """
-    adj_a
-    adj_x[tid]
-    adj_y[tid]
+    adj_a += x[tid] * adj_out[tid];
+    adj_x[tid] += a * adj_out[tid];
+    adj_y[tid] += adj_out[tid];
     """
 
     @wp.func_native(snippet, adj_snippet)
@@ -86,6 +86,7 @@ def test_shared_memory(test, device):
 
     @wp.func_native(snippet)
     def reverse(d: wp.array(dtype=int), N: int, tid: int):
+        """Reverse the array d in place using shared memory."""
         return
 
     @wp.kernel
@@ -100,6 +101,7 @@ def test_shared_memory(test, device):
     wp.launch(kernel=reverse_kernel, dim=N, inputs=[x, N], device=device)
 
     assert_np_equal(x.numpy(), y)
+    assert reverse.__doc__ == "Reverse the array d in place using shared memory."
 
 
 def test_cpu_snippet(test, device):
@@ -130,6 +132,124 @@ def test_cpu_snippet(test, device):
     assert_np_equal(out.numpy(), np.arange(1, N + 1, 1, dtype=np.int32))
 
 
+def test_custom_replay_grad(test, device):
+    num_threads = 16
+    counter = wp.zeros(1, dtype=wp.int32, device=device)
+    thread_ids = wp.zeros(num_threads, dtype=wp.int32, device=device)
+    inputs = wp.array(np.arange(num_threads, dtype=np.float32), device=device, requires_grad=True)
+    outputs = wp.zeros_like(inputs)
+
+    snippet = """
+    int next_index = atomicAdd(counter, 1);
+    thread_values[tid] = next_index;
+    """
+    replay_snippet = ""
+
+    @wp.func_native(snippet, replay_snippet=replay_snippet)
+    def reversible_increment(
+        counter: wp.array(dtype=int), thread_values: wp.array(dtype=int), tid: int
+    ):
+        ...
+
+    @wp.kernel
+    def run_atomic_add(
+        input: wp.array(dtype=float),
+        counter: wp.array(dtype=int),
+        thread_values: wp.array(dtype=int),
+        output: wp.array(dtype=float),
+    ):
+        tid = wp.tid()
+        reversible_increment(counter, thread_values, tid)
+        idx = thread_values[tid]
+        output[idx] = input[idx] ** 2.0
+
+    tape = wp.Tape()
+    with tape:
+        wp.launch(
+            run_atomic_add, dim=num_threads, inputs=[inputs, counter, thread_ids], outputs=[outputs], device=device
+        )
+
+    tape.backward(grads={outputs: wp.array(np.ones(num_threads, dtype=np.float32), device=device)})
+    assert_np_equal(inputs.grad.numpy(), 2.0 * inputs.numpy(), tol=1e-4)
+
+
+def test_replay_simplification(test, device):
+    num_threads = 8
+    x = wp.array(1.0 + np.arange(num_threads, dtype=np.float32), device=device, requires_grad=True)
+    y = wp.zeros_like(x)
+    z = wp.zeros_like(x)
+
+    snippet = "y[tid] = powf(x[tid], 2.0);"
+    replay_snippet = "y[tid] = x[tid];"
+    adj_snippet = "adj_x[tid] += 2.0 * adj_y[tid];"
+
+    @wp.func_native(snippet, adj_snippet=adj_snippet, replay_snippet=replay_snippet)
+    def square(x: wp.array(dtype=float), y: wp.array(dtype=float), tid: int):
+        ...
+
+    @wp.kernel
+    def log_square_kernel(
+        x: wp.array(dtype=float),
+        y: wp.array(dtype=float),
+        z: wp.array(dtype=float)
+    ):
+        tid = wp.tid()
+        square(x, y, tid)
+        z[tid] = wp.log(y[tid])
+
+    tape = wp.Tape()
+    with tape:
+        wp.launch(log_square_kernel, dim=num_threads, inputs=[x, y], outputs=[z], device=device)
+
+    tape.backward(grads={z: wp.array(np.ones(num_threads, dtype=np.float32), device=device)})
+    assert_np_equal(x.grad.numpy(), 2.0 / (1.0 + np.arange(num_threads)), tol=1e-6)
+
+
+def test_recompile_snippet(test, device):
+    snippet = """
+    int inc = 1;
+    out[tid] = x[tid] + inc;
+    """
+
+    @wp.func_native(snippet)
+    def increment_snippet(
+        x: wp.array(dtype=wp.int32),
+        out: wp.array(dtype=wp.int32),
+        tid: int,
+    ):
+        ...
+
+    @wp.kernel
+    def increment(x: wp.array(dtype=wp.int32), out: wp.array(dtype=wp.int32)):
+        tid = wp.tid()
+        increment_snippet(x, out, tid)
+
+    N = 128
+    x = wp.array(np.arange(N, dtype=np.int32), dtype=wp.int32, device=device)
+    out = wp.zeros(N, dtype=wp.int32, device=device)
+
+    wp.launch(kernel=increment, dim=N, inputs=[x], outputs=[out], device=device)
+
+    assert_np_equal(out.numpy(), np.arange(1, N + 1, 1, dtype=np.int32))
+
+    snippet = """
+    int inc = 2;
+    out[tid] = x[tid] + inc;
+    """
+
+    @wp.func_native(snippet)
+    def increment_snippet(
+        x: wp.array(dtype=wp.int32),
+        out: wp.array(dtype=wp.int32),
+        tid: int,
+    ):
+        ...
+
+    wp.launch(kernel=increment, dim=N, inputs=[x], outputs=[out], device=device)
+
+    assert_np_equal(out.numpy(), 1 + np.arange(1, N + 1, 1, dtype=np.int32))
+
+
 class TestSnippets(unittest.TestCase):
     pass
 
@@ -137,6 +257,9 @@ class TestSnippets(unittest.TestCase):
 add_function_test(TestSnippets, "test_basic", test_basic, devices=get_unique_cuda_test_devices())
 add_function_test(TestSnippets, "test_shared_memory", test_shared_memory, devices=get_unique_cuda_test_devices())
 add_function_test(TestSnippets, "test_cpu_snippet", test_cpu_snippet, devices=["cpu"])
+add_function_test(TestSnippets, "test_custom_replay_grad", test_custom_replay_grad, devices=get_unique_cuda_test_devices())
+add_function_test(TestSnippets, "test_replay_simplification", test_replay_simplification, devices=get_unique_cuda_test_devices())
+add_function_test(TestSnippets, "test_recompile_snippet", test_recompile_snippet, devices=get_unique_cuda_test_devices())
 
 
 if __name__ == "__main__":
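Reviewer note: for orientation, here is a self-contained sketch of the @wp.func_native pattern these tests exercise, with a CUDA snippet for the forward pass and an adj_ snippet for the custom adjoint. It assumes a CUDA device is available (the tests above are registered only on CUDA devices).

import numpy as np
import warp as wp

wp.init()

snippet = "out[tid] = a * x[tid];"
# the adjoint accumulates into the adj_ counterpart of each argument
adj_snippet = """
adj_a += x[tid] * adj_out[tid];
adj_x[tid] += a * adj_out[tid];
"""

@wp.func_native(snippet, adj_snippet)
def scale(a: float, x: wp.array(dtype=float), out: wp.array(dtype=float), tid: int):
    ...

@wp.kernel
def scale_kernel(a: float, x: wp.array(dtype=float), out: wp.array(dtype=float)):
    tid = wp.tid()
    scale(a, x, out, tid)

device = "cuda:0"
x = wp.array(np.arange(4, dtype=np.float32), device=device, requires_grad=True)
out = wp.zeros_like(x)

tape = wp.Tape()
with tape:
    wp.launch(scale_kernel, dim=4, inputs=[2.0, x], outputs=[out], device=device)

tape.backward(grads={out: wp.full(4, 1.0, dtype=float, device=device)})
print(x.grad.numpy())  # expected: [2. 2. 2. 2.]

A replay_snippet, as tested above, substitutes a cheaper or deterministic forward body when the tape re-runs the forward pass during backward().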
warp/tests/test_streams.py
CHANGED
@@ -10,6 +10,7 @@ import unittest
 import numpy as np
 
 import warp as wp
+from warp.utils import check_iommu
 from warp.tests.unittest_utils import *
 
 wp.init()
@@ -37,19 +38,34 @@ def sum(a: wp.array(dtype=float), b: wp.array(dtype=float), c: wp.array(dtype=fl
 N = 10 * 1024 * 1024
 
 
-def
-    # wp.zeros() and array.numpy() should not require explicit sync
+def test_stream_set(test, device):
 
+    device = wp.get_device(device)
+
+    old_stream = device.stream
+    new_stream = wp.Stream(device)
+
+    try:
+        wp.set_stream(new_stream, device)
+
+        test.assertTrue(device.has_stream)
+        test.assertEqual(device.stream, new_stream)
+
+    finally:
+        # restore original stream
+        wp.set_stream(old_stream, device)
+
+
+def test_stream_arg_explicit_sync(test, device):
     a = wp.zeros(N, dtype=float, device=device)
-    b = wp.
+    b = wp.full(N, 42, dtype=float, device=device)
     c = wp.empty(N, dtype=float, device=device)
 
+    old_stream = wp.get_stream(device)
     new_stream = wp.Stream(device)
 
-    #
-
-
-    test.assertTrue(wp.get_device(device).has_stream)
+    # allocations need to be explicitly synced before launching work using stream arguments
+    new_stream.wait_stream(old_stream)
 
     # launch work on new stream
     wp.launch(inc, dim=a.size, inputs=[a], stream=new_stream)
@@ -64,17 +80,17 @@ def test_stream_arg_implicit_sync(test, device):
 
 
 def test_stream_scope_implicit_sync(test, device):
-    # wp.zeros() and array.numpy() should not require explicit sync
 
     with wp.ScopedDevice(device):
         a = wp.zeros(N, dtype=float)
-        b = wp.
+        b = wp.full(N, 42, dtype=float)
        c = wp.empty(N, dtype=float)
 
         old_stream = wp.get_stream()
         new_stream = wp.Stream()
 
         # launch work on new stream
+        # allocations are implicitly synced when entering wp.ScopedStream
         with wp.ScopedStream(new_stream):
             assert wp.get_stream() == new_stream
 
@@ -309,103 +325,116 @@ class TestStreams(unittest.TestCase):
         cpu_stream = cpu_device.stream  # noqa: F841
 
     @unittest.skipUnless(len(wp.get_cuda_devices()) > 1, "Requires at least two CUDA devices")
+    @unittest.skipUnless(check_iommu(), "IOMMU seems enabled")
     def test_stream_arg_graph_mgpu(self):
         wp.load_module(device="cuda:0")
         wp.load_module(device="cuda:1")
 
-        #
-        c0 = wp.empty(N, dtype=float, device="cuda:0")
-        # branch into stream1
-        stream1.wait_stream(stream0)
-        #
-        wp.
+        # Peer-to-peer copies are not possible during graph capture if the arrays were
+        # allocated using pooled allocators and mempool access is not enabled.
+        # Here, we force default CUDA allocators and pre-allocate the memory.
+        with wp.ScopedMempool("cuda:0", False), wp.ScopedMempool("cuda:1", False):
+            # resources on GPU 0
+            stream0 = wp.get_stream("cuda:0")
+            a0 = wp.zeros(N, dtype=float, device="cuda:0")
+            b0 = wp.empty(N, dtype=float, device="cuda:0")
+            c0 = wp.empty(N, dtype=float, device="cuda:0")
 
+            # resources on GPU 1
+            stream1 = wp.get_stream("cuda:1")
+            a1 = wp.zeros(N, dtype=float, device="cuda:1")
 
+            # start recording on stream0
+            wp.capture_begin(stream=stream0, force_module_load=False)
+            try:
+                # branch into stream1
+                stream1.wait_stream(stream0)
 
+                # launch concurrent kernels on each stream
+                wp.launch(inc, dim=N, inputs=[a0], stream=stream0)
+                wp.launch(inc, dim=N, inputs=[a1], stream=stream1)
 
+                # wait for stream1 to finish
+                stream0.wait_stream(stream1)
+
+                # copy values from stream1
+                wp.copy(b0, a1, stream=stream0)
 
+                # compute sum
+                wp.launch(sum, dim=N, inputs=[a0, b0, c0], stream=stream0)
+            finally:
+                # finish recording on stream0
+                g = wp.capture_end(stream=stream0)
 
+            # replay
+            num_iters = 10
+            for _ in range(num_iters):
+                wp.capture_launch(g, stream=stream0)
 
+            # check results
+            assert_np_equal(c0.numpy(), np.full(N, fill_value=2 * num_iters))
 
     @unittest.skipUnless(len(wp.get_cuda_devices()) > 1, "Requires at least two CUDA devices")
+    @unittest.skipUnless(check_iommu(), "IOMMU seems enabled")
     def test_stream_scope_graph_mgpu(self):
         wp.load_module(device="cuda:0")
         wp.load_module(device="cuda:1")
 
-        #
-        b0 = wp.empty(N, dtype=float)
-        c0 = wp.empty(N, dtype=float)
-
-        # resources on GPU 1
-        with wp.ScopedDevice("cuda:1"):
-            stream1 = wp.get_stream()
-            a1 = wp.zeros(N, dtype=float)
-
-        # capture graph
-        with wp.ScopedDevice("cuda:0"):
-            # start recording
-            wp.capture_begin(force_module_load=False)
-            try:
-                with wp.ScopedDevice("cuda:1"):
-                    # branch into stream1
-                    wp.wait_stream(stream0)
-
-            wp.copy(b0, a1)
-
-            wp.launch(sum, dim=N, inputs=[a0, b0, c0])
-        finally:
-            # finish recording
-            g = wp.capture_end()
+        # Peer-to-peer copies are not possible during graph capture if the arrays were
+        # allocated using pooled allocators and mempool access is not enabled.
+        # Here, we force default CUDA allocators and pre-allocate the memory.
+        with wp.ScopedMempool("cuda:0", False), wp.ScopedMempool("cuda:1", False):
+            # resources on GPU 0
+            with wp.ScopedDevice("cuda:0"):
+                stream0 = wp.get_stream()
+                a0 = wp.zeros(N, dtype=float)
+                b0 = wp.empty(N, dtype=float)
+                c0 = wp.empty(N, dtype=float)
 
+            # resources on GPU 1
+            with wp.ScopedDevice("cuda:1"):
+                stream1 = wp.get_stream()
+                a1 = wp.zeros(N, dtype=float)
 
+            # capture graph
+            with wp.ScopedDevice("cuda:0"):
+                # start recording
+                wp.capture_begin(force_module_load=False)
+                try:
+                    with wp.ScopedDevice("cuda:1"):
+                        # branch into stream1
+                        wp.wait_stream(stream0)
 
+                        wp.launch(inc, dim=N, inputs=[a1])
 
+                    wp.launch(inc, dim=N, inputs=[a0])
 
+                    # wait for stream1 to finish
+                    wp.wait_stream(stream1)
+
+                    # copy values from stream1
+                    wp.copy(b0, a1)
+
+                    # compute sum
+                    wp.launch(sum, dim=N, inputs=[a0, b0, c0])
+                finally:
+                    # finish recording
+                    g = wp.capture_end()
+
+            # replay
+            with wp.ScopedDevice("cuda:0"):
+                num_iters = 10
+                for _ in range(num_iters):
+                    wp.capture_launch(g)
 
+            # check results
+            assert_np_equal(c0.numpy(), np.full(N, fill_value=2 * num_iters))
 
 
-add_function_test(TestStreams, "
+add_function_test(TestStreams, "test_stream_set", test_stream_set, devices=devices)
+add_function_test(TestStreams, "test_stream_arg_explicit_sync", test_stream_arg_explicit_sync, devices=devices)
 add_function_test(TestStreams, "test_stream_scope_implicit_sync", test_stream_scope_implicit_sync, devices=devices)
 
 add_function_test(TestStreams, "test_stream_arg_synchronize", test_stream_arg_synchronize, devices=devices)
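Reviewer note: the pattern these stream tests codify, in isolation: arrays are allocated and filled on the device's current stream, so a second stream must wait on it before touching them, and the original stream must wait again before results are read back. A minimal sketch assuming a CUDA device:

import warp as wp

wp.init()

@wp.kernel
def inc(a: wp.array(dtype=float)):
    tid = wp.tid()
    a[tid] = a[tid] + 1.0

device = "cuda:0"
a = wp.zeros(1024, dtype=float, device=device)

main_stream = wp.get_stream(device)
side_stream = wp.Stream(device)

# explicit sync: side_stream waits for the allocation queued on main_stream
side_stream.wait_stream(main_stream)

wp.launch(inc, dim=a.size, inputs=[a], stream=side_stream)

# rejoin before reading results back on the device's own stream
main_stream.wait_stream(side_stream)
print(a.numpy()[:4])  # expected: [1. 1. 1. 1.]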
warp/tests/test_torch.py
CHANGED
@@ -490,10 +490,14 @@ def test_torch_graph_torch_stream(test, device):
 
     # capture graph
     with wp.ScopedStream(warp_stream), torch.cuda.graph(g, stream=torch_stream):
-
-
-
-
+        wp.capture_begin(force_module_load=False, external=True)
+        try:
+            t += 1.0
+            wp.launch(inc, dim=n, inputs=[a])
+            t += 1.0
+            wp.launch(inc, dim=n, inputs=[a])
+        finally:
+            wp.capture_end()
 
     # replay graph
     num_iters = 10
@@ -522,10 +526,14 @@ def test_torch_graph_warp_stream(test, device):
 
     # capture graph
     with wp.ScopedDevice(device), torch.cuda.graph(g, stream=torch_stream):
-
-
-
-
+        wp.capture_begin(force_module_load=False, external=True)
+        try:
+            t += 1.0
+            wp.launch(inc, dim=n, inputs=[a])
+            t += 1.0
+            wp.launch(inc, dim=n, inputs=[a])
+        finally:
+            wp.capture_end()
 
     # replay graph
     num_iters = 10
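Reviewer note: the change above moves the Torch-interop tests to the external capture API: Torch starts the CUDA graph capture and Warp joins it with external=True, always closing with wp.capture_end() in a finally block. A condensed sketch follows; the wp.stream_to_torch helper used here to hand Warp's stream to Torch is an assumption, it does not appear in this diff.

import torch
import warp as wp

wp.init()

device = "cuda:0"
n = 1024
a = wp.zeros(n, dtype=float, device=device)
t = torch.zeros(n, device=device)

@wp.kernel
def inc(a: wp.array(dtype=float)):
    tid = wp.tid()
    a[tid] = a[tid] + 1.0

g = torch.cuda.CUDAGraph()
torch_stream = wp.stream_to_torch(wp.get_stream(device))  # assumed helper

with wp.ScopedDevice(device), torch.cuda.graph(g, stream=torch_stream):
    # external=True: Torch owns the capture, Warp only records into it
    wp.capture_begin(force_module_load=False, external=True)
    try:
        t += 1.0
        wp.launch(inc, dim=n, inputs=[a])
    finally:
        wp.capture_end()

for _ in range(10):
    g.replay()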
warp/tests/test_utils.py
CHANGED
@@ -267,55 +267,60 @@ class TestUtils(unittest.TestCase):
     def test_warn(self):
         # Multiple warnings get printed out each time.
         with contextlib.redirect_stdout(io.StringIO()) as f:
-            frame_info = inspect.getframeinfo(inspect.currentframe())
             wp.utils.warn("hello, world!")
             wp.utils.warn("hello, world!")
 
         expected = (
-            "
-            "
-        ).format(
-            frame_info.filename,
-            frame_info.lineno + 1,
-            "UserWarning: hello, world!\n wp.utils.warn(\"hello, world!\")",
-            frame_info.filename,
-            frame_info.lineno + 2,
-            "UserWarning: hello, world!\n wp.utils.warn(\"hello, world!\")",
+            "Warp UserWarning: hello, world!\n"
+            "Warp UserWarning: hello, world!\n"
         )
+
         self.assertEqual(f.getvalue(), expected)
 
+        # Test verbose warnings
+        saved_verbosity = wp.config.verbose_warnings
+        try:
+            wp.config.verbose_warnings = True
+            with contextlib.redirect_stdout(io.StringIO()) as f:
+                frame_info = inspect.getframeinfo(inspect.currentframe())
+                wp.utils.warn("hello, world!")
+                wp.utils.warn("hello, world!")
+
+            expected = (
+                f"Warp UserWarning: hello, world! ({frame_info.filename}:{frame_info.lineno + 1})\n"
+                " wp.utils.warn(\"hello, world!\")\n"
+                f"Warp UserWarning: hello, world! ({frame_info.filename}:{frame_info.lineno + 2})\n"
+                " wp.utils.warn(\"hello, world!\")\n"
+            )
+
+            self.assertEqual(f.getvalue(), expected)
+
+        finally:
+            # make sure to restore warning verbosity
+            wp.config.verbose_warnings = saved_verbosity
+
+
         # Multiple similar deprecation warnings get printed out only once.
         with contextlib.redirect_stdout(io.StringIO()) as f:
-            frame_info = inspect.getframeinfo(inspect.currentframe())
             wp.utils.warn("hello, world!", category=DeprecationWarning)
             wp.utils.warn("hello, world!", category=DeprecationWarning)
 
         expected = (
-            "
-        ).format(
-            frame_info.filename,
-            frame_info.lineno + 1,
-            "DeprecationWarning: hello, world!\n wp.utils.warn(\"hello, world!\", category=DeprecationWarning)",
+            "Warp DeprecationWarning: hello, world!\n"
         )
+
         self.assertEqual(f.getvalue(), expected)
 
         # Multiple different deprecation warnings get printed out each time.
         with contextlib.redirect_stdout(io.StringIO()) as f:
-            frame_info = inspect.getframeinfo(inspect.currentframe())
             wp.utils.warn("foo", category=DeprecationWarning)
             wp.utils.warn("bar", category=DeprecationWarning)
 
         expected = (
-            "
-            "
-        ).format(
-            frame_info.filename,
-            frame_info.lineno + 1,
-            "DeprecationWarning: foo\n wp.utils.warn(\"foo\", category=DeprecationWarning)",
-            frame_info.filename,
-            frame_info.lineno + 2,
-            "DeprecationWarning: bar\n wp.utils.warn(\"bar\", category=DeprecationWarning)",
+            "Warp DeprecationWarning: foo\n"
+            "Warp DeprecationWarning: bar\n"
        )
+
         self.assertEqual(f.getvalue(), expected)
 
     def test_transform_expand(self):
@@ -425,7 +430,7 @@ class TestUtils(unittest.TestCase):
         with wp.ScopedTimer("hello", detailed=True):
             pass
 
-        self.assertRegex(f.getvalue(), r"^
+        self.assertRegex(f.getvalue(), r"^ 4 function calls in \d+\.\d+ seconds")
         self.assertRegex(f.getvalue(), r"hello took \d+\.\d+ ms$")
 
 
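Reviewer note: the rewritten test_warn expectations document the 1.0 warning format: a single "Warp <Category>: message" line by default, with the (file:line) origin and offending source line appended only when wp.config.verbose_warnings is set. Usage in brief:

import warp as wp
import warp.utils

wp.init()

wp.utils.warn("plain warning")  # prints: Warp UserWarning: plain warning

saved = wp.config.verbose_warnings
try:
    wp.config.verbose_warnings = True
    # now also prints the warning's (file:line) origin and the source line
    wp.utils.warn("verbose warning")
finally:
    wp.config.verbose_warnings = saved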