warp-lang 1.4.2-py3-none-manylinux2014_aarch64.whl → 1.5.1-py3-none-manylinux2014_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release: this version of warp-lang has been flagged as potentially problematic by the registry.
- warp/__init__.py +4 -0
- warp/autograd.py +43 -8
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +21 -2
- warp/build_dll.py +23 -6
- warp/builtins.py +1819 -7
- warp/codegen.py +197 -61
- warp/config.py +2 -2
- warp/context.py +379 -107
- warp/examples/assets/pixel.jpg +0 -0
- warp/examples/benchmarks/benchmark_cloth_paddle.py +86 -0
- warp/examples/benchmarks/benchmark_gemm.py +121 -0
- warp/examples/benchmarks/benchmark_interop_paddle.py +158 -0
- warp/examples/benchmarks/benchmark_tile.py +179 -0
- warp/examples/fem/example_adaptive_grid.py +37 -10
- warp/examples/fem/example_apic_fluid.py +3 -2
- warp/examples/fem/example_convection_diffusion_dg.py +4 -5
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion_3d.py +47 -4
- warp/examples/fem/example_distortion_energy.py +220 -0
- warp/examples/fem/example_magnetostatics.py +127 -85
- warp/examples/fem/example_nonconforming_contact.py +5 -5
- warp/examples/fem/example_stokes.py +3 -1
- warp/examples/fem/example_streamlines.py +12 -19
- warp/examples/fem/utils.py +38 -15
- warp/examples/sim/example_cloth.py +4 -25
- warp/examples/sim/example_quadruped.py +2 -1
- warp/examples/tile/example_tile_convolution.py +58 -0
- warp/examples/tile/example_tile_fft.py +47 -0
- warp/examples/tile/example_tile_filtering.py +105 -0
- warp/examples/tile/example_tile_matmul.py +79 -0
- warp/examples/tile/example_tile_mlp.py +375 -0
- warp/fem/__init__.py +8 -0
- warp/fem/cache.py +16 -12
- warp/fem/dirichlet.py +1 -1
- warp/fem/domain.py +44 -1
- warp/fem/field/__init__.py +1 -2
- warp/fem/field/field.py +31 -19
- warp/fem/field/nodal_field.py +101 -49
- warp/fem/field/virtual.py +794 -0
- warp/fem/geometry/__init__.py +2 -2
- warp/fem/geometry/deformed_geometry.py +3 -105
- warp/fem/geometry/element.py +13 -0
- warp/fem/geometry/geometry.py +165 -7
- warp/fem/geometry/grid_2d.py +3 -6
- warp/fem/geometry/grid_3d.py +31 -28
- warp/fem/geometry/hexmesh.py +3 -46
- warp/fem/geometry/nanogrid.py +3 -2
- warp/fem/geometry/{quadmesh_2d.py → quadmesh.py} +280 -159
- warp/fem/geometry/tetmesh.py +2 -43
- warp/fem/geometry/{trimesh_2d.py → trimesh.py} +354 -186
- warp/fem/integrate.py +683 -261
- warp/fem/linalg.py +404 -0
- warp/fem/operator.py +101 -18
- warp/fem/polynomial.py +5 -5
- warp/fem/quadrature/quadrature.py +45 -21
- warp/fem/space/__init__.py +45 -11
- warp/fem/space/basis_function_space.py +451 -0
- warp/fem/space/basis_space.py +58 -11
- warp/fem/space/function_space.py +146 -5
- warp/fem/space/grid_2d_function_space.py +80 -66
- warp/fem/space/grid_3d_function_space.py +113 -68
- warp/fem/space/hexmesh_function_space.py +96 -108
- warp/fem/space/nanogrid_function_space.py +62 -110
- warp/fem/space/quadmesh_function_space.py +208 -0
- warp/fem/space/shape/__init__.py +45 -7
- warp/fem/space/shape/cube_shape_function.py +328 -54
- warp/fem/space/shape/shape_function.py +10 -1
- warp/fem/space/shape/square_shape_function.py +328 -60
- warp/fem/space/shape/tet_shape_function.py +269 -19
- warp/fem/space/shape/triangle_shape_function.py +238 -19
- warp/fem/space/tetmesh_function_space.py +69 -37
- warp/fem/space/topology.py +38 -0
- warp/fem/space/trimesh_function_space.py +179 -0
- warp/fem/utils.py +6 -331
- warp/jax_experimental.py +3 -1
- warp/native/array.h +15 -0
- warp/native/builtin.h +66 -26
- warp/native/bvh.h +4 -0
- warp/native/coloring.cpp +604 -0
- warp/native/cuda_util.cpp +68 -51
- warp/native/cuda_util.h +2 -1
- warp/native/fabric.h +8 -0
- warp/native/hashgrid.h +4 -0
- warp/native/marching.cu +8 -0
- warp/native/mat.h +14 -3
- warp/native/mathdx.cpp +59 -0
- warp/native/mesh.h +4 -0
- warp/native/range.h +13 -1
- warp/native/reduce.cpp +9 -1
- warp/native/reduce.cu +7 -0
- warp/native/runlength_encode.cpp +9 -1
- warp/native/runlength_encode.cu +7 -1
- warp/native/scan.cpp +8 -0
- warp/native/scan.cu +8 -0
- warp/native/scan.h +8 -1
- warp/native/sparse.cpp +8 -0
- warp/native/sparse.cu +8 -0
- warp/native/temp_buffer.h +7 -0
- warp/native/tile.h +1854 -0
- warp/native/tile_gemm.h +341 -0
- warp/native/tile_reduce.h +210 -0
- warp/native/volume_builder.cu +8 -0
- warp/native/volume_builder.h +8 -0
- warp/native/warp.cpp +10 -2
- warp/native/warp.cu +369 -15
- warp/native/warp.h +12 -2
- warp/optim/adam.py +39 -4
- warp/paddle.py +29 -12
- warp/render/render_opengl.py +140 -67
- warp/sim/graph_coloring.py +292 -0
- warp/sim/import_urdf.py +8 -8
- warp/sim/integrator_euler.py +4 -2
- warp/sim/integrator_featherstone.py +115 -44
- warp/sim/integrator_vbd.py +6 -0
- warp/sim/model.py +109 -32
- warp/sparse.py +1 -1
- warp/stubs.py +569 -4
- warp/tape.py +12 -7
- warp/tests/assets/pixel.npy +0 -0
- warp/tests/aux_test_instancing_gc.py +18 -0
- warp/tests/test_array.py +39 -0
- warp/tests/test_codegen.py +81 -1
- warp/tests/test_codegen_instancing.py +30 -0
- warp/tests/test_collision.py +110 -0
- warp/tests/test_coloring.py +251 -0
- warp/tests/test_context.py +34 -0
- warp/tests/test_examples.py +21 -5
- warp/tests/test_fem.py +453 -113
- warp/tests/test_func.py +34 -4
- warp/tests/test_generics.py +52 -0
- warp/tests/test_iter.py +68 -0
- warp/tests/test_lerp.py +13 -87
- warp/tests/test_mat_scalar_ops.py +1 -1
- warp/tests/test_matmul.py +6 -9
- warp/tests/test_matmul_lite.py +6 -11
- warp/tests/test_mesh_query_point.py +1 -1
- warp/tests/test_module_hashing.py +23 -0
- warp/tests/test_overwrite.py +45 -0
- warp/tests/test_paddle.py +27 -87
- warp/tests/test_print.py +56 -1
- warp/tests/test_smoothstep.py +17 -83
- warp/tests/test_spatial.py +1 -1
- warp/tests/test_static.py +3 -3
- warp/tests/test_tile.py +744 -0
- warp/tests/test_tile_mathdx.py +144 -0
- warp/tests/test_tile_mlp.py +383 -0
- warp/tests/test_tile_reduce.py +374 -0
- warp/tests/test_tile_shared_memory.py +190 -0
- warp/tests/test_vbd.py +12 -20
- warp/tests/test_volume.py +43 -0
- warp/tests/unittest_suites.py +19 -2
- warp/tests/unittest_utils.py +4 -2
- warp/types.py +340 -74
- warp/utils.py +23 -3
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/METADATA +32 -7
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/RECORD +161 -134
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/WHEEL +1 -1
- warp/fem/field/test.py +0 -180
- warp/fem/field/trial.py +0 -183
- warp/fem/space/collocated_function_space.py +0 -102
- warp/fem/space/quadmesh_2d_function_space.py +0 -261
- warp/fem/space/trimesh_2d_function_space.py +0 -153
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.4.2.dist-info → warp_lang-1.5.1.dist-info}/top_level.txt +0 -0
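The bulk of this release is the new tile programming API (warp/native/tile.h, the tile additions in warp/builtins.py, and the warp/tests/test_tile*.py and warp/examples/tile/ files listed above). For orientation only, below is a minimal sketch of the pattern those new tests exercise; the kernel and variable names (row_sum_kernel, x, out, rows) are illustrative and not taken from the package, and the new tests only run the tile path on CUDA devices (get_cuda_test_devices()).

import numpy as np
import warp as wp

TILE_DIM = 64  # threads cooperating on each tile

@wp.kernel
def row_sum_kernel(x: wp.array2d(dtype=float), out: wp.array(dtype=float)):
    i = wp.tid()                                 # one tile of threads per row under launch_tiled
    t = wp.tile_load(x, i, 0, m=1, n=TILE_DIM)   # cooperative load of a 1 x TILE_DIM tile
    s = wp.tile_sum(t)                           # block-wide reduction to a 1x1 tile
    wp.tile_store(out, i, s)

rows = 8
x = wp.array(np.ones((rows, TILE_DIM), dtype=np.float32))
out = wp.zeros(rows, dtype=float)

# launch_tiled assigns TILE_DIM threads to each of the `rows` tiles
wp.launch_tiled(row_sum_kernel, dim=[rows], inputs=[x, out], block_dim=TILE_DIM)
print(out.numpy())  # expect each entry to be 64.0

The test files in the diff below exercise the same wp.tile_load / wp.tile_sum / wp.launch_tiled calls, plus shared-memory tiles and backward (wp.Tape) support.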
warp/tests/test_tile_reduce.py
ADDED
@@ -0,0 +1,374 @@
+# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import unittest
+
+import numpy as np
+
+import warp as wp
+from warp.tests.unittest_utils import *
+
+TILE_M = wp.constant(8)
+TILE_N = wp.constant(4)
+TILE_K = wp.constant(8)
+
+# num threads per-tile
+TILE_DIM = 64
+
+
+@wp.kernel
+def tile_sum_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+    # output tile index
+    i = wp.tid()
+
+    n = input.shape[1]
+    count = int(n / TILE_DIM)
+
+    s = wp.tile_zeros(m=1, n=1, dtype=float)
+
+    for j in range(count):
+        a = wp.tile_load(input, i, j, m=1, n=TILE_DIM)
+        s += wp.tile_sum(a) * 0.5
+
+    wp.tile_store(output, i, s)
+
+
+def test_tile_reduce_sum(test, device):
+    batch_count = 56
+
+    N = TILE_DIM * 3
+
+    rng = np.random.default_rng(42)
+    input = rng.random((batch_count, N), dtype=np.float32)
+
+    input_wp = wp.array(input, requires_grad=True, device=device)
+    output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_sum_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+        )
+
+    sum_wp = output_wp.numpy()
+    for i in range(batch_count):
+        sum_np = np.sum(input[i]) * 0.5
+        test.assertAlmostEqual(sum_wp[i], sum_np, places=4)
+
+    output_wp.grad.fill_(1.0)
+
+    tape.backward()
+
+    assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.0e-4)
+
+
+@wp.kernel
+def tile_min_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+    # output tile index
+    i = wp.tid()
+
+    a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM)
+    m = wp.tile_min(a)
+
+    wp.tile_store(output, i, m)
+
+
+def test_tile_reduce_min(test, device):
+    batch_count = 56
+
+    N = TILE_DIM
+
+    rng = np.random.default_rng(42)
+    input = rng.random((batch_count, N), dtype=np.float32)
+
+    input_wp = wp.array(input, requires_grad=True, device=device)
+    output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_min_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+        )
+
+    min_wp = output_wp.numpy()
+    for i in range(batch_count):
+        min_np = np.min(input[i])
+        test.assertAlmostEqual(min_wp[i], min_np, places=4)
+
+
+@wp.kernel
+def tile_max_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+    # output tile index
+    i = wp.tid()
+
+    a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM)
+    m = wp.tile_max(a)
+
+    wp.tile_store(output, i, m)
+
+
+def test_tile_reduce_max(test, device):
+    batch_count = 56
+
+    N = TILE_DIM
+
+    rng = np.random.default_rng(42)
+    input = rng.random((batch_count, N), dtype=np.float32)
+
+    input_wp = wp.array(input, requires_grad=True, device=device)
+    output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_max_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+        )
+
+    max_wp = output_wp.numpy()
+    for i in range(batch_count):
+        max_np = np.max(input[i])
+        test.assertAlmostEqual(max_wp[i], max_np, places=4)
+
+
+@wp.kernel
+def tile_reduce_custom_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float)):
+    # output tile index
+    i = wp.tid()
+
+    a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM)
+    m = wp.tile_reduce(wp.mul, a)
+
+    wp.tile_store(output, i, m)
+
+
+def test_tile_reduce_custom(test, device):
+    batch_count = 56
+
+    N = TILE_DIM
+
+    rng = np.random.default_rng(42)
+    input = rng.random((batch_count, N), dtype=np.float32)
+
+    input_wp = wp.array(input, requires_grad=True, device=device)
+    output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_reduce_custom_kernel,
+            dim=[batch_count],
+            inputs=[input_wp, output_wp],
+            block_dim=TILE_DIM,
+            device=device,
+        )
+
+    prod_wp = output_wp.numpy()
+    for i in range(batch_count):
+        prod_np = np.prod(input[i])
+        test.assertAlmostEqual(prod_wp[i], prod_np, places=4)
+
+
+@wp.kernel
+def tile_grouped_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float)):
+    # output tile index
+    i = wp.tid()
+
+    a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N)
+    s = wp.tile_sum(a) * 0.5
+
+    wp.tile_store(output, i, s)
+
+
+def test_tile_reduce_grouped_sum(test, device):
+    batch_count = 56
+
+    M = TILE_M
+    N = TILE_N
+
+    rng = np.random.default_rng(42)
+    input = rng.random((batch_count, M, N), dtype=np.float32)
+
+    input_wp = wp.array(input, requires_grad=True, device=device)
+    output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
+
+    with wp.Tape() as tape:
+        wp.launch_tiled(
+            tile_sum_kernel, dim=[batch_count], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device
+        )
+
+    sum_wp = output_wp.numpy()
+    for i in range(batch_count):
+        sum_np = np.sum(input[i]) * 0.5
+        test.assertAlmostEqual(sum_wp[i], sum_np, places=4)
+
+    output_wp.grad.fill_(1.0)
+
+    tape.backward()
+
+    assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5, tol=1.0e-4)
+
+
+@wp.kernel
+def tile_reduce_simt_kernel(output: wp.array(dtype=int)):
+    # thread index
+    i = wp.tid()
+
+    t = wp.tile(i)  # convert to block wide tile
+    s = wp.tile_sum(t)  # sum over block
+
+    # update global sum
+    wp.tile_atomic_add(output, 0, 0, s)
+
+
+def test_tile_reduce_simt(test, device):
+    # use an unaligned grid dimension
+    N = TILE_DIM * 4 + 5
+
+    output = wp.zeros(shape=1, dtype=int, requires_grad=True, device=device)
+
+    with wp.Tape() as tape:
+        wp.launch(tile_reduce_simt_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device)
+
+    test.assertEqual(output.numpy()[0], np.sum(np.arange(N)))
+
+
+@wp.kernel
+def tile_untile_kernel(output: wp.array(dtype=int)):
+    # thread index
+    i = wp.tid()
+
+    # convert to block wide tile
+    t = wp.tile(i) * 2
+    s = wp.untile(t)
+
+    output[i] = s
+
+
+def test_tile_untile(test, device):
+    # use an unaligned grid dimension
+    N = TILE_DIM * 4 + 5
+
+    output = wp.zeros(shape=N, dtype=int, requires_grad=True, device=device)
+
+    with wp.Tape() as tape:
+        wp.launch(tile_untile_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device)
+
+    assert_np_equal(output.numpy(), np.arange(N) * 2)
+
+
+@wp.kernel
+def tile_untile_scalar_kernel(output: wp.array(dtype=int)):
+    # thread index
+    i = wp.tid()
+
+    # convert to block wide tile
+    t = wp.tile(i) * 2
+    s = wp.untile(t)
+
+    output[i] = s
+
+
+def test_tile_untile_scalar(test, device):
+    # use an unaligned grid dimension
+    N = TILE_DIM * 4 + 5
+
+    output = wp.zeros(shape=N, dtype=int, requires_grad=True, device=device)
+
+    with wp.Tape() as tape:
+        wp.launch(tile_untile_kernel, dim=N, inputs=[output], block_dim=TILE_DIM, device=device)
+
+    assert_np_equal(output.numpy(), np.arange(N) * 2)
+
+
+@wp.kernel
+def test_untile_vector_kernel(input: wp.array(dtype=wp.vec3), output: wp.array(dtype=wp.vec3)):
+    i = wp.tid()
+
+    v = input[i] * 0.5
+
+    t = wp.tile(v)
+    u = wp.untile(t)
+
+    output[i] = u * 2.0
+
+
+def test_tile_untile_vector(test, device):
+    input = wp.full(16, wp.vec3(1.0, 2.0, 3.0), requires_grad=True, device=device)
+    output = wp.zeros_like(input, device=device)
+
+    with wp.Tape() as tape:
+        wp.launch(test_untile_vector_kernel, dim=16, inputs=[input, output], block_dim=16, device=device)
+
+    output.grad = wp.ones_like(output, device=device)
+    tape.backward()
+
+    assert_np_equal(output.numpy(), input.numpy())
+    assert_np_equal(input.grad.numpy(), np.ones((16, 3)))
+
+
+@wp.kernel
+def tile_ones_kernel(out: wp.array(dtype=float)):
+    i = wp.tid()
+
+    t = wp.tile_ones(dtype=float, m=16, n=16)
+    s = wp.tile_sum(t)
+
+    wp.tile_store(out, 0, s)
+
+
+def test_tile_ones(test, device):
+    output = wp.zeros(1, dtype=float, device=device)
+
+    with wp.Tape() as tape:
+        wp.launch_tiled(tile_ones_kernel, dim=[1], inputs=[output], block_dim=TILE_DIM, device=device)
+
+    test.assertAlmostEqual(output.numpy()[0], 256.0)
+
+
+@wp.kernel
+def tile_arange_kernel(out: wp.array2d(dtype=int)):
+    i = wp.tid()
+
+    a = wp.tile_arange(17, dtype=int)
+    b = wp.tile_arange(5, 23, dtype=int)
+    c = wp.tile_arange(0, 34, 2, dtype=int)
+
+    wp.tile_store(out, 0, 0, a)
+    wp.tile_store(out, 1, 0, b)
+    wp.tile_store(out, 2, 0, c)
+
+
+def test_tile_arange(test, device):
+    N = 17
+
+    output = wp.zeros(shape=(3, N), dtype=int, device=device)
+
+    with wp.Tape() as tape:
+        wp.launch_tiled(tile_arange_kernel, dim=[1], inputs=[output], block_dim=TILE_DIM, device=device)
+
+    assert_np_equal(output.numpy()[0], np.arange(17))
+    assert_np_equal(output.numpy()[1], np.arange(5, 22))
+    assert_np_equal(output.numpy()[2], np.arange(0, 34, 2))
+
+
+devices = get_cuda_test_devices()
+
+
+class TestTileReduce(unittest.TestCase):
+    pass
+
+
+add_function_test(TestTileReduce, "test_tile_reduce_sum", test_tile_reduce_sum, devices=devices)
+add_function_test(TestTileReduce, "test_tile_reduce_min", test_tile_reduce_min, devices=devices)
+add_function_test(TestTileReduce, "test_tile_reduce_max", test_tile_reduce_max, devices=devices)
+add_function_test(TestTileReduce, "test_tile_reduce_custom", test_tile_reduce_custom, devices=devices)
+add_function_test(TestTileReduce, "test_tile_reduce_grouped_sum", test_tile_reduce_sum, devices=devices)
+add_function_test(TestTileReduce, "test_tile_reduce_simt", test_tile_reduce_simt, devices=devices)
+add_function_test(TestTileReduce, "test_tile_ones", test_tile_ones, devices=devices)
+add_function_test(TestTileReduce, "test_tile_arange", test_tile_arange, devices=devices)
+add_function_test(TestTileReduce, "test_tile_untile_scalar", test_tile_untile_scalar, devices=devices)
+add_function_test(TestTileReduce, "test_tile_untile_vector", test_tile_untile_vector, devices=devices)
+
+if __name__ == "__main__":
+    wp.clear_kernel_cache()
+    unittest.main(verbosity=2, failfast=True)
warp/tests/test_tile_shared_memory.py
ADDED
@@ -0,0 +1,190 @@
+# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
+# NVIDIA CORPORATION and its licensors retain all intellectual property
+# and proprietary rights in and to this software, related documentation
+# and any modifications thereto. Any use, reproduction, disclosure or
+# distribution of this software and related documentation without an express
+# license agreement from NVIDIA CORPORATION is strictly prohibited.
+
+import unittest
+
+import numpy as np
+
+import warp as wp
+from warp.tests.unittest_utils import *
+
+
+# checks that we can configure shared memory to the expected size
+def test_tile_shared_mem_size(test, device):
+    DIM_M = 32
+    DIM_N = 32
+
+    BLOCK_DIM = 256
+
+    @wp.kernel
+    def compute(out: wp.array2d(dtype=float)):
+        a = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared")
+        b = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared") * 2.0
+
+        c = a + b
+        wp.tile_store(out, 0, 0, c)
+
+    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+
+    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+
+    # check output
+    assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)
+
+    # check required shared memory
+    expected_forward_bytes = DIM_M * DIM_N * 4 * 2
+    expected_backward_bytes = expected_forward_bytes * 2
+
+    # check shared memory for kernel on the device
+    module_exec = compute.module.load(device, BLOCK_DIM)
+    hooks = module_exec.get_kernel_hooks(compute)
+
+    assert hooks.forward_smem_bytes == expected_forward_bytes
+    assert hooks.backward_smem_bytes == expected_backward_bytes
+
+
+# checks that we can configure shared memory > 48kb default
+def test_tile_shared_mem_large(test, device):
+    # set dimensions that require 64kb for the forward kernel
+    DIM_M = 64
+    DIM_N = 128
+
+    BLOCK_DIM = 256
+
+    # we disable backward kernel gen since 128k is not supported on most architectures
+    @wp.kernel(enable_backward=False)
+    def compute(out: wp.array2d(dtype=float)):
+        a = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared")
+        b = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared") * 2.0
+
+        c = a + b
+        wp.tile_store(out, 0, 0, c)
+
+    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+
+    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+
+    # check output
+    assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)
+
+    # check required shared memory
+    expected_forward_bytes = DIM_M * DIM_N * 4 * 2
+    expected_backward_bytes = expected_forward_bytes * 2
+
+    assert expected_forward_bytes == 2**16
+
+    # check shared memory for kernel on the device
+    module_exec = compute.module.load(device, BLOCK_DIM)
+    hooks = module_exec.get_kernel_hooks(compute)
+
+    assert hooks.forward_smem_bytes == expected_forward_bytes
+    assert hooks.backward_smem_bytes == expected_backward_bytes
+
+
+# checks that we can configure dynamic shared memory during graph capture
+def test_tile_shared_mem_graph(test, device):
+    DIM_M = 32
+    DIM_N = 32
+
+    BLOCK_DIM = 256
+
+    @wp.kernel
+    def compute(out: wp.array2d(dtype=float)):
+        a = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared")
+        b = wp.tile_ones(DIM_M, DIM_N, dtype=float, storage="shared") * 2.0
+
+        c = a + b
+        wp.tile_store(out, 0, 0, c)
+
+    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+
+    wp.load_module(device=device)
+
+    wp.capture_begin(device, force_module_load=False)
+    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+    graph = wp.capture_end(device)
+
+    wp.capture_launch(graph)
+
+    # check output
+    assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)
+
+    # check required shared memory
+    expected_forward_bytes = DIM_M * DIM_N * 4 * 2
+    expected_backward_bytes = expected_forward_bytes * 2
+
+    # check shared memory for kernel on the device
+    module_exec = compute.module.load(device, BLOCK_DIM)
+    hooks = module_exec.get_kernel_hooks(compute)
+
+    assert hooks.forward_smem_bytes == expected_forward_bytes
+    assert hooks.backward_smem_bytes == expected_backward_bytes
+
+
+# checks that stack allocations work for user functions
+def test_tile_shared_mem_func(test, device):
+    DIM_M = 32
+    DIM_N = 32
+
+    BLOCK_DIM = 256
+
+    @wp.func
+    def add_tile_small():
+        a = wp.tile_ones(16, 16, dtype=float, storage="shared")
+        b = wp.tile_ones(16, 16, dtype=float, storage="shared") * 2.0
+
+        return a + b
+
+    @wp.func
+    def add_tile_big():
+        a = wp.tile_ones(64, 64, dtype=float, storage="shared")
+        b = wp.tile_ones(64, 64, dtype=float, storage="shared") * 2.0
+
+        return a + b
+
+    @wp.kernel
+    def compute(out: wp.array2d(dtype=float)):
+        s = add_tile_small()
+        b = add_tile_big()
+
+        wp.tile_store(out, 0, 0, b)
+
+    out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
+
+    wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
+
+    # check shared memory for kernel on the device
+    module_exec = compute.module.load(device, BLOCK_DIM)
+    hooks = module_exec.get_kernel_hooks(compute)
+
+    # ensure that total required dynamic shared is the larger of the two tiles
+    expected_required_shared = 64 * 64 * 4 * 2
+
+    assert hooks.forward_smem_bytes == expected_required_shared
+    assert hooks.backward_smem_bytes == expected_required_shared * 2
+
+
+devices = get_cuda_test_devices()
+
+
+class TestTileSharedMemory(unittest.TestCase):
+    pass
+
+
+add_function_test(
+    TestTileSharedMemory, "test_tile_shared_mem_size", test_tile_shared_mem_size, devices=devices, check_output=False
+)
+add_function_test(
+    TestTileSharedMemory, "test_tile_shared_mem_large", test_tile_shared_mem_large, devices=devices, check_output=False
+)
+add_function_test(TestTileSharedMemory, "test_tile_shared_mem_graph", test_tile_shared_mem_graph, devices=devices)
+add_function_test(TestTileSharedMemory, "test_tile_shared_mem_func", test_tile_shared_mem_func, devices=devices)
+
+
+if __name__ == "__main__":
+    wp.clear_kernel_cache()
+    unittest.main(verbosity=2, failfast=True)
warp/tests/test_vbd.py
CHANGED
@@ -5,6 +5,8 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
+import contextlib
+import io
 import unittest
 
 import warp as wp
@@ -287,14 +289,6 @@ class VBDClothSim:
             89, 99, 100
         ]
 
-        self.coloring = [
-            [9, 12, 17, 24, 31, 38, 43, 46, 50, 62, 65, 68, 80, 84, 89, 92],
-            [6, 20, 25, 32, 37, 44, 51, 56, 59, 63, 70, 75, 82, 88, 90, 94, 96],
-            [2, 8, 10, 14, 26, 29, 33, 40, 48, 52, 55, 67, 73, 79, 86, 91, 98],
-            [4, 11, 16, 23, 28, 30, 35, 42, 49, 54, 57, 71, 74, 76, 78, 93, 97],
-            [3, 15, 18, 22, 34, 36, 39, 41, 53, 58, 60, 66, 72, 85, 99, 0, 87],
-            [7, 21, 27, 45, 47, 61, 64, 69, 77, 81, 83, 95, 1, 5, 13, 19],
-        ]
         # fmt: on
 
         self.dt = 1 / 60
@@ -323,6 +317,7 @@ class VBDClothSim:
             tri_ka=stiffness,
             tri_kd=kd,
         )
+        builder.color()
 
         self.model = builder.finalize(device=device)
         self.model.ground = True
@@ -331,11 +326,6 @@ class VBDClothSim:
         self.model.soft_contact_ke = 1.0e4
         self.model.soft_contact_kd = 1.0e2
 
-        coloring_wp = []
-        for color in self.coloring:
-            coloring_wp.append(wp.array(color, dtype=wp.int32, device=self.model.device))
-        self.model.coloring = coloring_wp
-
         self.dt = self.dt / self.num_substeps
         self.fixed_particles = [0, 9]
 
@@ -367,19 +357,21 @@ class VBDClothSim:
         model.particle_flags = wp.array(flags, device=model.device)
 
 
-def test_vbd_cloth(test, device):
-    example = VBDClothSim(device)
-    example.run(test)
-
-
 devices = get_test_devices()
 
 
 class TestVBD(unittest.TestCase):
-
+    def test_vbd_cloth(self):
+        for device in devices:
+            with contextlib.redirect_stdout(io.StringIO()) as f:
+                example = VBDClothSim(device)
+            self.assertRegex(
+                f.getvalue(),
+                r"Warp UserWarning: The graph is not optimizable anymore, terminated with a max/min ratio: 2.0 without reaching the target ratio: 1.1",
+            )
 
+            example.run(self)
 
-add_function_test(TestVBD, "test_vbd_cloth", test_vbd_cloth, devices=devices)
 
 if __name__ == "__main__":
     wp.clear_kernel_cache()
warp/tests/test_volume.py
CHANGED
@@ -5,6 +5,8 @@
 # distribution of this software and related documentation without an express
 # license agreement from NVIDIA CORPORATION is strictly prohibited.
 
+import os
+import tempfile
 import unittest
 from typing import Any
 
@@ -890,6 +892,46 @@ def test_volume_aniso_transform(test, device):
     assert_np_equal(transform, np.array(volume.get_grid_info().transform_matrix).reshape(3, 3))
 
 
+def test_volume_write(test, device):
+    codecs = ["none", "zip", "blosc"]
+    try:
+        import blosc  # noqa: F401 I001
+    except ImportError:
+        codecs.pop()
+
+    for volume_name in ("float", "vec3f", "index"):
+        for codec in codecs:
+            with test.subTest(volume_name=volume_name, codec=codec):
+                volume = volumes[volume_name][device.alias]
+                fd, file_path = tempfile.mkstemp(suffix=".nvdb")
+                os.close(fd)
+                try:
+                    volume.save_to_nvdb(file_path, codec=codec)
+                    with open(file_path, "rb") as f:
+                        volume_2 = wp.Volume.load_from_nvdb(f)
+                    next_volume = volume
+                    while next_volume:
+                        np.testing.assert_array_equal(next_volume.array().numpy(), volume_2.array().numpy())
+                        next_volume = next_volume.load_next_grid()
+                        volume_2 = volume_2.load_next_grid()
+
+                finally:
+                    os.remove(file_path)
+
+    with test.subTest(volume_write="unsupported"):
+        volume = volumes["index"][device.alias]
+        volume = volume.load_next_grid()
+
+        fd, file_path = tempfile.mkstemp(suffix=".nvdb")
+        os.close(fd)
+
+        try:
+            with test.assertRaises(RuntimeError):
+                volume.save_to_nvdb(file_path, codec=codec)
+        finally:
+            os.remove(file_path)
+
+
 class TestVolume(unittest.TestCase):
     def test_volume_new_del(self):
         # test the scenario in which a volume is created but not initialized before gc
@@ -930,6 +972,7 @@ add_function_test(
 add_function_test(TestVolume, "test_volume_multiple_grids", test_volume_multiple_grids, devices=devices)
 add_function_test(TestVolume, "test_volume_feature_array", test_volume_feature_array, devices=devices)
 add_function_test(TestVolume, "test_volume_sample_index", test_volume_sample_index, devices=devices)
+add_function_test(TestVolume, "test_volume_write", test_volume_write, devices=[wp.get_device("cpu")])
 
 points = {}
 points_jittered = {}