warp-lang 1.5.1__py3-none-manylinux2014_x86_64.whl → 1.6.1__py3-none-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +5 -0
- warp/autograd.py +414 -191
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +40 -12
- warp/build_dll.py +13 -6
- warp/builtins.py +1077 -481
- warp/codegen.py +250 -122
- warp/config.py +65 -21
- warp/context.py +500 -149
- warp/examples/assets/square_cloth.usd +0 -0
- warp/examples/benchmarks/benchmark_gemm.py +27 -18
- warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
- warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
- warp/examples/core/example_marching_cubes.py +1 -1
- warp/examples/core/example_mesh.py +1 -1
- warp/examples/core/example_torch.py +18 -34
- warp/examples/core/example_wave.py +1 -1
- warp/examples/fem/example_apic_fluid.py +1 -0
- warp/examples/fem/example_mixed_elasticity.py +1 -1
- warp/examples/optim/example_bounce.py +1 -1
- warp/examples/optim/example_cloth_throw.py +1 -1
- warp/examples/optim/example_diffray.py +4 -15
- warp/examples/optim/example_drone.py +1 -1
- warp/examples/optim/example_softbody_properties.py +392 -0
- warp/examples/optim/example_trajectory.py +1 -3
- warp/examples/optim/example_walker.py +5 -0
- warp/examples/sim/example_cartpole.py +0 -2
- warp/examples/sim/example_cloth_self_contact.py +314 -0
- warp/examples/sim/example_granular_collision_sdf.py +4 -5
- warp/examples/sim/example_jacobian_ik.py +0 -2
- warp/examples/sim/example_quadruped.py +5 -2
- warp/examples/tile/example_tile_cholesky.py +79 -0
- warp/examples/tile/example_tile_convolution.py +2 -2
- warp/examples/tile/example_tile_fft.py +2 -2
- warp/examples/tile/example_tile_filtering.py +3 -3
- warp/examples/tile/example_tile_matmul.py +4 -4
- warp/examples/tile/example_tile_mlp.py +12 -12
- warp/examples/tile/example_tile_nbody.py +191 -0
- warp/examples/tile/example_tile_walker.py +319 -0
- warp/math.py +147 -0
- warp/native/array.h +12 -0
- warp/native/builtin.h +0 -1
- warp/native/bvh.cpp +149 -70
- warp/native/bvh.cu +287 -68
- warp/native/bvh.h +195 -85
- warp/native/clang/clang.cpp +6 -2
- warp/native/crt.h +1 -0
- warp/native/cuda_util.cpp +35 -0
- warp/native/cuda_util.h +5 -0
- warp/native/exports.h +40 -40
- warp/native/intersect.h +17 -0
- warp/native/mat.h +57 -3
- warp/native/mathdx.cpp +19 -0
- warp/native/mesh.cpp +25 -8
- warp/native/mesh.cu +153 -101
- warp/native/mesh.h +482 -403
- warp/native/quat.h +40 -0
- warp/native/solid_angle.h +7 -0
- warp/native/sort.cpp +85 -0
- warp/native/sort.cu +34 -0
- warp/native/sort.h +3 -1
- warp/native/spatial.h +11 -0
- warp/native/tile.h +1189 -664
- warp/native/tile_reduce.h +8 -6
- warp/native/vec.h +41 -0
- warp/native/warp.cpp +8 -1
- warp/native/warp.cu +263 -40
- warp/native/warp.h +19 -5
- warp/optim/linear.py +22 -4
- warp/render/render_opengl.py +132 -59
- warp/render/render_usd.py +10 -2
- warp/sim/__init__.py +6 -1
- warp/sim/collide.py +289 -32
- warp/sim/import_urdf.py +20 -5
- warp/sim/integrator_euler.py +25 -7
- warp/sim/integrator_featherstone.py +147 -35
- warp/sim/integrator_vbd.py +842 -40
- warp/sim/model.py +173 -112
- warp/sim/render.py +2 -2
- warp/stubs.py +249 -116
- warp/tape.py +28 -30
- warp/tests/aux_test_module_unload.py +15 -0
- warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
- warp/tests/test_array.py +100 -0
- warp/tests/test_assert.py +242 -0
- warp/tests/test_codegen.py +14 -61
- warp/tests/test_collision.py +8 -8
- warp/tests/test_examples.py +16 -1
- warp/tests/test_grad_debug.py +87 -2
- warp/tests/test_hash_grid.py +1 -1
- warp/tests/test_ipc.py +116 -0
- warp/tests/test_launch.py +77 -26
- warp/tests/test_mat.py +213 -168
- warp/tests/test_math.py +47 -1
- warp/tests/test_matmul.py +11 -7
- warp/tests/test_matmul_lite.py +4 -4
- warp/tests/test_mesh.py +84 -60
- warp/tests/test_mesh_query_aabb.py +165 -0
- warp/tests/test_mesh_query_point.py +328 -286
- warp/tests/test_mesh_query_ray.py +134 -121
- warp/tests/test_mlp.py +2 -2
- warp/tests/test_operators.py +43 -0
- warp/tests/test_overwrite.py +6 -5
- warp/tests/test_quat.py +77 -0
- warp/tests/test_reload.py +29 -0
- warp/tests/test_sim_grad_bounce_linear.py +204 -0
- warp/tests/test_static.py +16 -0
- warp/tests/test_tape.py +25 -0
- warp/tests/test_tile.py +134 -191
- warp/tests/test_tile_load.py +399 -0
- warp/tests/test_tile_mathdx.py +61 -8
- warp/tests/test_tile_mlp.py +17 -17
- warp/tests/test_tile_reduce.py +24 -18
- warp/tests/test_tile_shared_memory.py +66 -17
- warp/tests/test_tile_view.py +165 -0
- warp/tests/test_torch.py +35 -0
- warp/tests/test_utils.py +36 -24
- warp/tests/test_vec.py +110 -0
- warp/tests/unittest_suites.py +29 -4
- warp/tests/unittest_utils.py +30 -11
- warp/thirdparty/unittest_parallel.py +5 -2
- warp/types.py +419 -111
- warp/utils.py +9 -5
- {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/METADATA +86 -45
- {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/RECORD +129 -118
- {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/WHEEL +1 -1
- warp/examples/benchmarks/benchmark_tile.py +0 -179
- warp/native/tile_gemm.h +0 -341
- {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,399 @@
|
|
|
1
|
+
# Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
|
|
2
|
+
# NVIDIA CORPORATION and its licensors retain all intellectual property
|
|
3
|
+
# and proprietary rights in and to this software, related documentation
|
|
4
|
+
# and any modifications thereto. Any use, reproduction, disclosure or
|
|
5
|
+
# distribution of this software and related documentation without an express
|
|
6
|
+
# license agreement from NVIDIA CORPORATION is strictly prohibited.
|
|
7
|
+
|
|
8
|
+
import unittest
|
|
9
|
+
|
|
10
|
+
import numpy as np
|
|
11
|
+
|
|
12
|
+
import warp as wp
|
|
13
|
+
from warp.tests.unittest_utils import *
|
|
14
|
+
|
|
15
|
+
TILE_DIM = 64
|
|
16
|
+
|
|
17
|
+
TILE_M = wp.constant(16)
|
|
18
|
+
TILE_N = wp.constant(8)
|
|
19
|
+
TILE_O = wp.constant(8)
|
|
20
|
+
TILE_P = wp.constant(6)
|
|
21
|
+
|
|
22
|
+
TILE_OFFSET = 5
|
|
23
|
+
|
|
24
|
+
|
|
25
|
+
@wp.kernel
def tile_load_1d_kernel(
    input: wp.array1d(dtype=float),
    out_full: wp.array1d(dtype=float),
    out_padded: wp.array1d(dtype=float),
    out_offset: wp.array1d(dtype=float),
):
    # Load a TILE_M-element tile from the start of `input`, spelled three ways:
    # positional shape, keyword scalar shape and keyword tuple shape.
    # Only the first result is stored; the other two are never read
    # (presumably they exist to exercise each accepted call form).
    whole = wp.tile_load(input, TILE_M)
    whole_kw = wp.tile_load(input, shape=TILE_M)
    whole_tuple = wp.tile_load(input, shape=(TILE_M,))

    # The same three spellings, this time starting TILE_OFFSET elements in.
    shifted = wp.tile_load(input, TILE_M, TILE_OFFSET)
    shifted_kw = wp.tile_load(input, shape=TILE_M, offset=TILE_OFFSET)
    shifted_tuple = wp.tile_load(input, shape=(TILE_M,), offset=(TILE_OFFSET,))

    # Plain store, store of the offset load, and offset store of the plain load.
    wp.tile_store(out_full, whole)
    wp.tile_store(out_padded, shifted)
    wp.tile_store(out_offset, whole, offset=(TILE_OFFSET,))
|
|
43
|
+
|
|
44
|
+
|
|
45
|
+
@wp.kernel
def tile_load_2d_kernel(
    input: wp.array2d(dtype=float),
    out_full: wp.array2d(dtype=float),
    out_padded: wp.array2d(dtype=float),
    out_offset: wp.array2d(dtype=float),
):
    # (TILE_M, TILE_N) tile taken from the array origin.
    whole = wp.tile_load(input, shape=(TILE_M, TILE_N))
    # Same tile shape, but starting TILE_OFFSET elements in along both axes.
    shifted = wp.tile_load(input, shape=(TILE_M, TILE_N), offset=(TILE_OFFSET, TILE_OFFSET))

    wp.tile_store(out_full, whole)
    wp.tile_store(out_padded, shifted)
    # Write the un-offset tile back at an offset destination.
    wp.tile_store(out_offset, whole, offset=(TILE_OFFSET, TILE_OFFSET))
|
|
58
|
+
|
|
59
|
+
|
|
60
|
+
@wp.kernel
def tile_load_3d_kernel(
    input: wp.array3d(dtype=float),
    out_full: wp.array3d(dtype=float),
    out_padded: wp.array3d(dtype=float),
    out_offset: wp.array3d(dtype=float),
):
    # (TILE_M, TILE_N, TILE_O) tile taken from the array origin.
    whole = wp.tile_load(input, shape=(TILE_M, TILE_N, TILE_O))
    # Same tile shape, but starting TILE_OFFSET elements in along every axis.
    shifted = wp.tile_load(input, shape=(TILE_M, TILE_N, TILE_O), offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET))

    wp.tile_store(out_full, whole)
    wp.tile_store(out_padded, shifted)
    # Write the un-offset tile back at an offset destination.
    wp.tile_store(out_offset, whole, offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET))
|
|
73
|
+
|
|
74
|
+
|
|
75
|
+
@wp.kernel
def tile_load_4d_kernel(
    input: wp.array4d(dtype=float),
    out_full: wp.array4d(dtype=float),
    out_padded: wp.array4d(dtype=float),
    out_offset: wp.array4d(dtype=float),
):
    # (TILE_M, TILE_N, TILE_O, TILE_P) tile taken from the array origin.
    whole = wp.tile_load(input, shape=(TILE_M, TILE_N, TILE_O, TILE_P))
    # Same tile shape, but starting TILE_OFFSET elements in along every axis.
    shifted = wp.tile_load(
        input, shape=(TILE_M, TILE_N, TILE_O, TILE_P), offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET, TILE_OFFSET)
    )

    wp.tile_store(out_full, whole)
    wp.tile_store(out_padded, shifted)
    # Write the un-offset tile back at an offset destination.
    wp.tile_store(out_offset, whole, offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET, TILE_OFFSET))
|
|
90
|
+
|
|
91
|
+
|
|
92
|
+
def test_tile_load(kernel, ndim):
    """Build a device test for one of the ``tile_load_*d_kernel`` kernels.

    Args:
        kernel: Kernel taking ``(input, out_full, out_padded, out_offset)``.
        ndim: Number of array dimensions; selects the leading ``ndim`` entries
            of ``[TILE_M, TILE_N, TILE_O, TILE_P]`` as the array shape.

    Returns:
        A ``(test, device)`` callable that launches ``kernel`` once and checks
        the three outputs and the input gradient against NumPy references.
    """

    def test_run(test, device):
        dims = [TILE_M, TILE_N, TILE_O, TILE_P][0:ndim]
        rng = np.random.default_rng(42)

        src = wp.array(rng.random(dims), dtype=float, requires_grad=True, device=device)
        dst_full = wp.zeros(dims, dtype=float, device=device)
        dst_padded = wp.zeros(dims, dtype=float, device=device)
        dst_offset = wp.zeros(dims, dtype=float, device=device)

        with wp.Tape() as tape:
            wp.launch_tiled(
                kernel,
                dim=[1],
                inputs=[src, dst_full, dst_padded, dst_offset],
                block_dim=TILE_DIM,
                device=device,
            )

        # Per-axis slices: `tail` is everything from TILE_OFFSET onwards,
        # `head` is the equally sized region anchored at index 0.
        tail = tuple(slice(TILE_OFFSET, extent) for extent in dims)
        head = tuple(slice(None, extent - TILE_OFFSET) for extent in dims)

        expected_full = src.numpy()

        # Offset load: the tail of the source lands at the head of the output,
        # the remainder is expected to stay zero.
        expected_padded = np.zeros_like(expected_full)
        expected_padded[head] = expected_full[tail]

        # Offset store: the head of the source lands at the tail of the output.
        expected_offset = np.zeros_like(expected_full)
        expected_offset[tail] = expected_full[head]

        assert_np_equal(dst_full.numpy(), expected_full)
        assert_np_equal(dst_padded.numpy(), expected_padded)
        assert_np_equal(dst_offset.numpy(), expected_offset)

        # Only the un-offset output is seeded for the backward pass.
        dst_full.grad = wp.ones_like(dst_full)
        tape.backward()

        grad = src.grad.numpy()
        assert_np_equal(grad, np.ones_like(grad))

    return test_run
|
|
134
|
+
|
|
135
|
+
|
|
136
|
+
@wp.kernel
def tile_load_unaligned_kernel(
    input: wp.array2d(dtype=float),
    output: wp.array2d(dtype=float),
):
    # Load a (TILE_M, TILE_N) tile that starts one element in along both axes,
    # requesting shared storage, and write it back at the same (1, 1) offset.
    shifted = wp.tile_load(input, shape=(TILE_M, TILE_N), offset=(1, 1), storage="shared")
    wp.tile_store(output, shifted, offset=(1, 1))
|
|
143
|
+
|
|
144
|
+
|
|
145
|
+
def test_tile_load_unaligned(test, device):
    """Check ``tile_load_unaligned_kernel`` on a ``(TILE_M + 1, TILE_N + 1)`` array.

    The kernel copies the block starting at ``(1, 1)``, so the first row and
    column of the output (and of the input gradient) must remain zero while
    everything else matches the input (or is one, for the gradient).
    """
    dims = [TILE_M + 1, TILE_N + 1]
    rng = np.random.default_rng(42)

    src = wp.array(rng.random(dims), dtype=float, requires_grad=True, device=device)
    dst = wp.zeros(dims, dtype=float, device=device)

    with wp.Tape() as tape:
        wp.launch_tiled(
            tile_load_unaligned_kernel,
            dim=[1],
            inputs=[src, dst],
            block_dim=TILE_DIM,
            device=device,
        )

    result = dst.numpy()

    # first row and column should be zero
    assert_np_equal(result[0, :], np.zeros(TILE_N + 1))
    assert_np_equal(result[:, 0], np.zeros(TILE_M + 1))

    # the remaining block is a straight copy of the input
    assert_np_equal(result[1:, 1:], src.numpy()[1:, 1:])

    dst.grad = wp.ones_like(dst)
    tape.backward()

    grad = src.grad.numpy()
    want_grad = np.ones_like(grad)
    want_grad[0, :] = 0.0
    want_grad[:, 0] = 0.0

    assert_np_equal(grad, want_grad)
|
|
177
|
+
|
|
178
|
+
|
|
179
|
+
# ----------------------------------------------------------------------------------------
|
|
180
|
+
|
|
181
|
+
TILE_SIZE = 4
|
|
182
|
+
|
|
183
|
+
|
|
184
|
+
@wp.kernel
def tile_extract_1d_kernel(input: wp.array1d(dtype=float), output: wp.array1d(dtype=float)):
    idx = wp.tid()

    tile = wp.tile_load(input, shape=TILE_SIZE)

    # read this thread's element back out of the tile by index
    output[idx] = tile[idx]
|
|
191
|
+
|
|
192
|
+
|
|
193
|
+
@wp.kernel
def tile_extract_2d_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
    row, col = wp.tid()

    tile = wp.tile_load(input, shape=(TILE_SIZE, TILE_SIZE))

    # read this thread's element back out of the tile by index
    output[row, col] = tile[row, col]
|
|
200
|
+
|
|
201
|
+
|
|
202
|
+
@wp.kernel
def tile_extract_3d_kernel(input: wp.array3d(dtype=float), output: wp.array3d(dtype=float)):
    i0, i1, i2 = wp.tid()

    tile = wp.tile_load(input, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE))

    # read this thread's element back out of the tile by index
    output[i0, i1, i2] = tile[i0, i1, i2]
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@wp.kernel
def tile_extract_4d_kernel(input: wp.array4d(dtype=float), output: wp.array4d(dtype=float)):
    i0, i1, i2, i3 = wp.tid()

    tile = wp.tile_load(input, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE, TILE_SIZE))

    # read this thread's element back out of the tile by index
    output[i0, i1, i2, i3] = tile[i0, i1, i2, i3]
|
|
218
|
+
|
|
219
|
+
|
|
220
|
+
def test_tile_extract(kernel, ndim):
    """Build a device test for one of the ``tile_extract_*d_kernel`` kernels.

    Args:
        kernel: Kernel taking ``(input, output)`` arrays of rank ``ndim``.
        ndim: Array rank; every axis has extent ``TILE_SIZE``.

    Returns:
        A ``(test, device)`` callable that launches one thread per element and
        checks that the output equals the input and the input gradient is one.
    """
    shape = (TILE_SIZE,) * ndim

    def test_run(test, device):
        values = np.random.default_rng(42).random(shape)

        src = wp.array(values, dtype=float, requires_grad=True, device=device)
        dst = wp.zeros_like(src)

        with wp.Tape() as tape:
            wp.launch(kernel, dim=shape, inputs=[src, dst], block_dim=1024, device=device)

        assert_np_equal(dst.numpy(), src.numpy())

        # seed the output adjoint with ones and propagate back to the input
        dst.grad = wp.ones_like(dst)
        tape.backward()

        assert_np_equal(src.grad.numpy(), np.ones_like(src.numpy()))

    return test_run
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
# ----------------------------------------------------------------------------------------
|
|
249
|
+
|
|
250
|
+
TILE_SIZE = 4
|
|
251
|
+
|
|
252
|
+
|
|
253
|
+
@wp.kernel
def tile_assign_1d_kernel(input: wp.array1d(dtype=float), output: wp.array1d(dtype=float)):
    idx = wp.tid()

    tile = wp.tile_zeros(shape=(TILE_SIZE,), dtype=float)

    # write the doubled input element into the tile ...
    tile[idx] = input[idx] * 2.0

    # ... and read it straight back out
    output[idx] = tile[idx]
|
|
263
|
+
|
|
264
|
+
|
|
265
|
+
@wp.kernel
def tile_assign_2d_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
    row, col = wp.tid()

    tile = wp.tile_zeros(shape=(TILE_SIZE, TILE_SIZE), dtype=float)

    # write the doubled input element into the tile ...
    tile[row, col] = input[row, col] * 2.0

    # ... and read it straight back out
    output[row, col] = tile[row, col]
|
|
275
|
+
|
|
276
|
+
|
|
277
|
+
@wp.kernel
def tile_assign_3d_kernel(input: wp.array3d(dtype=float), output: wp.array3d(dtype=float)):
    i0, i1, i2 = wp.tid()

    tile = wp.tile_zeros(shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE), dtype=float)

    # write the doubled input element into the tile ...
    tile[i0, i1, i2] = input[i0, i1, i2] * 2.0

    # ... and read it straight back out
    output[i0, i1, i2] = tile[i0, i1, i2]
|
|
287
|
+
|
|
288
|
+
|
|
289
|
+
@wp.kernel
def tile_assign_4d_kernel(input: wp.array4d(dtype=float), output: wp.array4d(dtype=float)):
    i0, i1, i2, i3 = wp.tid()

    tile = wp.tile_zeros(shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE, TILE_SIZE), dtype=float)

    # write the doubled input element into the tile ...
    tile[i0, i1, i2, i3] = input[i0, i1, i2, i3] * 2.0

    # ... and read it straight back out
    output[i0, i1, i2, i3] = tile[i0, i1, i2, i3]
|
|
299
|
+
|
|
300
|
+
|
|
301
|
+
def test_tile_assign(kernel, ndim):
    """Build a device test for one of the ``tile_assign_*d_kernel`` kernels.

    Args:
        kernel: Kernel taking ``(input, output)`` arrays of rank ``ndim``.
        ndim: Array rank; every axis has extent ``TILE_SIZE``.

    Returns:
        A ``(test, device)`` callable that launches one thread per element and
        checks that the output is the input scaled by two. Only the forward
        result is checked.
    """
    shape = (TILE_SIZE,) * ndim

    def test_run(test, device):
        values = np.random.default_rng(42).random(shape)

        src = wp.array(values, dtype=float, requires_grad=True, device=device)
        dst = wp.zeros_like(src)

        # The launch is still recorded on a tape, but the tape is never replayed.
        with wp.Tape():
            wp.launch(kernel, dim=shape, inputs=[src, dst], block_dim=1024, device=device)

        assert_np_equal(dst.numpy(), src.numpy() * 2.0)

    return test_run
|
|
322
|
+
|
|
323
|
+
|
|
324
|
+
# ----------------------------------------------------------------------------------------
|
|
325
|
+
|
|
326
|
+
|
|
327
|
+
@wp.kernel
def tile_load_fortran_kernel(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)):
    # (row, column) index of the tile handled by this block
    tile_row, tile_col = wp.tid()

    # copy one (TILE_M, TILE_N) tile from A to the same location in B
    block = wp.tile_load(A, shape=(TILE_M, TILE_N), offset=(tile_row * TILE_M, tile_col * TILE_N))
    wp.tile_store(B, t=block, offset=(tile_row * TILE_M, tile_col * TILE_N))
|
|
334
|
+
|
|
335
|
+
|
|
336
|
+
def test_tile_load_fortran(test, device):
    """Copy a column-major matrix tile by tile and check values and gradients.

    ``A`` and ``B`` are ``(7 * TILE_M, 5 * TILE_N)`` float32 matrices converted
    with ``np.asfortranarray`` before being handed to Warp (presumably so the
    Warp arrays carry column-major strides -- confirm against ``wp.array``).
    ``tile_load_fortran_kernel`` copies ``A`` into ``B`` one tile per block.
    The forward pass must leave ``B == A``; after seeding ``B.grad`` with ones
    the backward pass must leave ``A.grad == B.grad``.
    """
    rng = np.random.default_rng(42)

    M = TILE_M * 7
    N = TILE_N * 5

    A = rng.random((M, N), dtype=np.float32)
    B = rng.random((M, N), dtype=np.float32)

    # convert to column major layout
    A = np.asfortranarray(A)
    B = np.asfortranarray(B)

    A_wp = wp.array(A, requires_grad=True, device=device)
    B_wp = wp.array(B, requires_grad=True, device=device)

    with wp.Tape() as tape:
        wp.launch_tiled(
            tile_load_fortran_kernel,
            # One block per tile. Floor division keeps the grid size an exact
            # integer instead of going through float division and truncation.
            dim=[M // TILE_M, N // TILE_N],
            inputs=[A_wp, B_wp],
            block_dim=TILE_DIM,
            device=device,
        )

    # verify forward pass
    assert_array_equal(B_wp, A_wp)

    # verify backward pass
    B_wp.grad = wp.ones_like(B_wp, device=device)
    tape.backward()

    assert_array_equal(B_wp.grad, A_wp.grad)
|
|
369
|
+
|
|
370
|
+
|
|
371
|
+
devices = get_cuda_test_devices()
|
|
372
|
+
|
|
373
|
+
|
|
374
|
+
class TestTileLoad(unittest.TestCase):
    """Empty container; test methods are attached at import time via ``add_function_test``."""
|
|
376
|
+
|
|
377
|
+
|
|
378
|
+
# (test name, test callable) pairs, registered on TestTileLoad in this order.
_TILE_LOAD_TESTS = (
    ("test_tile_load_1d", test_tile_load(tile_load_1d_kernel, 1)),
    ("test_tile_load_2d", test_tile_load(tile_load_2d_kernel, 2)),
    ("test_tile_load_3d", test_tile_load(tile_load_3d_kernel, 3)),
    ("test_tile_load_4d", test_tile_load(tile_load_4d_kernel, 4)),
    ("test_tile_load_unaligned", test_tile_load_unaligned),
    ("test_tile_extract_1d", test_tile_extract(tile_extract_1d_kernel, 1)),
    ("test_tile_extract_2d", test_tile_extract(tile_extract_2d_kernel, 2)),
    ("test_tile_extract_3d", test_tile_extract(tile_extract_3d_kernel, 3)),
    ("test_tile_extract_4d", test_tile_extract(tile_extract_4d_kernel, 4)),
    ("test_tile_assign_1d", test_tile_assign(tile_assign_1d_kernel, 1)),
    ("test_tile_assign_2d", test_tile_assign(tile_assign_2d_kernel, 2)),
    ("test_tile_assign_3d", test_tile_assign(tile_assign_3d_kernel, 3)),
    ("test_tile_assign_4d", test_tile_assign(tile_assign_4d_kernel, 4)),
    ("test_tile_load_fortran", test_tile_load_fortran),
)

for _test_name, _test_func in _TILE_LOAD_TESTS:
    add_function_test(TestTileLoad, _test_name, _test_func, devices=devices)


if __name__ == "__main__":
    wp.clear_kernel_cache()
    unittest.main(verbosity=2, failfast=True)
|
warp/tests/test_tile_mathdx.py
CHANGED
|
@@ -30,11 +30,11 @@ def tile_math_matmul_kernel(
|
|
|
30
30
|
ga: wp.array2d(dtype=wp.float16), gb: wp.array2d(dtype=wp.float32), gc: wp.array2d(dtype=wp.float64)
|
|
31
31
|
):
|
|
32
32
|
i, j = wp.tid()
|
|
33
|
-
a = wp.tile_load(ga,
|
|
34
|
-
b = wp.tile_load(gb,
|
|
35
|
-
c = wp.tile_zeros(
|
|
33
|
+
a = wp.tile_load(ga, shape=(TILE_M, TILE_K), offset=(i * TILE_M, j * TILE_K))
|
|
34
|
+
b = wp.tile_load(gb, shape=(TILE_K, TILE_N), offset=(i * TILE_K, j * TILE_N))
|
|
35
|
+
c = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float64)
|
|
36
36
|
wp.tile_matmul(a, b, c)
|
|
37
|
-
wp.tile_store(gc, i, j
|
|
37
|
+
wp.tile_store(gc, c, offset=(i * TILE_M, j * TILE_N))
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
def test_tile_math_matmul(test, device):
|
|
@@ -71,17 +71,17 @@ def test_tile_math_matmul(test, device):
|
|
|
71
71
|
@wp.kernel()
|
|
72
72
|
def tile_math_fft_kernel_vec2f(gx: wp.array2d(dtype=wp.vec2f), gy: wp.array2d(dtype=wp.vec2f)):
|
|
73
73
|
i, j = wp.tid()
|
|
74
|
-
xy = wp.tile_load(gx,
|
|
74
|
+
xy = wp.tile_load(gx, shape=(FFT_SIZE_FP32, FFT_SIZE_FP32))
|
|
75
75
|
wp.tile_fft(xy)
|
|
76
|
-
wp.tile_store(gy,
|
|
76
|
+
wp.tile_store(gy, xy)
|
|
77
77
|
|
|
78
78
|
|
|
79
79
|
@wp.kernel()
|
|
80
80
|
def tile_math_fft_kernel_vec2d(gx: wp.array2d(dtype=wp.vec2d), gy: wp.array2d(dtype=wp.vec2d)):
|
|
81
81
|
i, j = wp.tid()
|
|
82
|
-
xy = wp.tile_load(gx,
|
|
82
|
+
xy = wp.tile_load(gx, shape=(FFT_SIZE_FP64, FFT_SIZE_FP64))
|
|
83
83
|
wp.tile_fft(xy)
|
|
84
|
-
wp.tile_store(gy,
|
|
84
|
+
wp.tile_store(gy, xy)
|
|
85
85
|
|
|
86
86
|
|
|
87
87
|
def test_tile_math_fft(test, device, wp_dtype):
|
|
@@ -114,6 +114,56 @@ def test_tile_math_fft(test, device, wp_dtype):
|
|
|
114
114
|
# TODO: implement and test backward pass
|
|
115
115
|
|
|
116
116
|
|
|
117
|
+
@wp.kernel()
def tile_math_cholesky(
    gA: wp.array2d(dtype=wp.float64),
    gD: wp.array1d(dtype=wp.float64),
    gL: wp.array2d(dtype=wp.float64),
    gx: wp.array1d(dtype=wp.float64),
    gy: wp.array1d(dtype=wp.float64),
):
    i, j = wp.tid()

    # Bring the matrix, the diagonal shift and the right-hand side in as tiles
    mat = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
    diag = wp.tile_load(gD, shape=TILE_M, storage="shared")
    rhs = wp.tile_load(gx, shape=TILE_M, storage="shared")

    # Factorize: find L such that L L^T = A + diag(D)
    shifted = wp.tile_diag_add(mat, diag)
    factor = wp.tile_cholesky(shifted)

    # Solve L L^T y = x for y using the factor
    solution = wp.tile_cholesky_solve(factor, rhs)

    # Write out the factor and the solution
    wp.tile_store(gL, factor)
    wp.tile_store(gy, solution)
|
|
138
|
+
|
|
139
|
+
|
|
140
|
+
def test_tile_math_cholesky(test, device):
    """Compare ``tile_math_cholesky`` against NumPy on a small SPD system.

    The system matrix is ``ones((TILE_M, TILE_M)) + 8 * I`` and the right-hand
    side is ``arange(TILE_M)``. The kernel's factor ``L`` is checked against
    ``np.linalg.cholesky`` and its solution ``y`` against ``np.linalg.solve``.
    """
    A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
    D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
    L_h = np.zeros_like(A_h)
    X_h = np.arange(TILE_M, dtype=np.float64)
    Y_h = np.zeros_like(X_h)

    # NumPy reference for the shifted system A + diag(D)
    A_np = A_h + np.diag(D_h)
    L_np = np.linalg.cholesky(A_np)
    Y_np = np.linalg.solve(A_np, X_h)

    # Matrices are 2-D; the diagonal, right-hand side and solution are 1-D,
    # matching the array1d/array2d annotations of the kernel parameters.
    A_wp = wp.array2d(A_h, requires_grad=True, dtype=wp.float64, device=device)
    D_wp = wp.array1d(D_h, requires_grad=True, dtype=wp.float64, device=device)
    L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
    X_wp = wp.array1d(X_h, requires_grad=True, dtype=wp.float64, device=device)
    Y_wp = wp.array1d(Y_h, requires_grad=True, dtype=wp.float64, device=device)

    wp.launch_tiled(
        tile_math_cholesky, dim=[1, 1], inputs=[A_wp, D_wp, L_wp, X_wp, Y_wp], block_dim=TILE_DIM, device=device
    )
    # Synchronize the device under test rather than whichever device is current.
    wp.synchronize_device(device)

    # Explicit checks (not a bare ``assert``) so they survive ``python -O`` and
    # report which quantity mismatched. Tolerances mirror ``np.allclose`` defaults.
    np.testing.assert_allclose(Y_wp.numpy(), Y_np, rtol=1e-5, atol=1e-8, equal_nan=False)
    np.testing.assert_allclose(L_wp.numpy(), L_np, rtol=1e-5, atol=1e-8, equal_nan=False)

    # TODO: implement and test backward pass
|
|
165
|
+
|
|
166
|
+
|
|
117
167
|
devices = get_cuda_test_devices()
|
|
118
168
|
|
|
119
169
|
|
|
@@ -124,6 +174,9 @@ class TestTileMathDx(unittest.TestCase):
|
|
|
124
174
|
|
|
125
175
|
# check_output=False so we can enable libmathdx's logging without failing the tests
|
|
126
176
|
add_function_test(TestTileMathDx, "test_tile_math_matmul", test_tile_math_matmul, devices=devices, check_output=False)
|
|
177
|
+
add_function_test(
|
|
178
|
+
TestTileMathDx, "test_tile_math_cholesky", test_tile_math_cholesky, devices=devices, check_output=False
|
|
179
|
+
)
|
|
127
180
|
add_function_test(
|
|
128
181
|
TestTileMathDx,
|
|
129
182
|
"test_tile_math_fft_vec2f",
|
warp/tests/test_tile_mlp.py
CHANGED
|
@@ -114,23 +114,23 @@ def test_multi_layer_nn(test, device):
|
|
|
114
114
|
f = wp.tile(local)
|
|
115
115
|
|
|
116
116
|
# input layer
|
|
117
|
-
w0 = wp.tile_load(weights_0,
|
|
118
|
-
b0 = wp.tile_load(bias_0,
|
|
119
|
-
z = wp.tile_map(relu, wp.tile_matmul(w0, f) + wp.tile_broadcast(b0,
|
|
117
|
+
w0 = wp.tile_load(weights_0, shape=(DIM_HID, DIM_IN))
|
|
118
|
+
b0 = wp.tile_load(bias_0, shape=(DIM_HID, 1))
|
|
119
|
+
z = wp.tile_map(relu, wp.tile_matmul(w0, f) + wp.tile_broadcast(b0, shape=(DIM_HID, NUM_THREADS)))
|
|
120
120
|
|
|
121
121
|
# hidden layer
|
|
122
|
-
w1 = wp.tile_load(weights_1,
|
|
123
|
-
b1 = wp.tile_load(bias_1,
|
|
124
|
-
z = wp.tile_map(relu, wp.tile_matmul(w1, z) + wp.tile_broadcast(b1,
|
|
122
|
+
w1 = wp.tile_load(weights_1, shape=(DIM_HID, DIM_HID))
|
|
123
|
+
b1 = wp.tile_load(bias_1, shape=(DIM_HID, 1))
|
|
124
|
+
z = wp.tile_map(relu, wp.tile_matmul(w1, z) + wp.tile_broadcast(b1, shape=(DIM_HID, NUM_THREADS)))
|
|
125
125
|
|
|
126
|
-
w2 = wp.tile_load(weights_2,
|
|
127
|
-
b2 = wp.tile_load(bias_2,
|
|
128
|
-
z = wp.tile_map(relu, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2,
|
|
126
|
+
w2 = wp.tile_load(weights_2, shape=(DIM_HID, DIM_HID))
|
|
127
|
+
b2 = wp.tile_load(bias_2, shape=(DIM_HID, 1))
|
|
128
|
+
z = wp.tile_map(relu, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, shape=(DIM_HID, NUM_THREADS)))
|
|
129
129
|
|
|
130
130
|
# output layer
|
|
131
|
-
w3 = wp.tile_load(weights_3,
|
|
132
|
-
b3 = wp.tile_load(bias_3,
|
|
133
|
-
o = wp.tile_map(relu, wp.tile_matmul(w3, z) + wp.tile_broadcast(b3,
|
|
131
|
+
w3 = wp.tile_load(weights_3, shape=(DIM_OUT, DIM_HID))
|
|
132
|
+
b3 = wp.tile_load(bias_3, shape=(DIM_OUT, 1))
|
|
133
|
+
o = wp.tile_map(relu, wp.tile_matmul(w3, z) + wp.tile_broadcast(b3, shape=(DIM_OUT, NUM_THREADS)))
|
|
134
134
|
|
|
135
135
|
# untile back to SIMT
|
|
136
136
|
output = wp.untile(o)
|
|
@@ -292,14 +292,14 @@ def test_single_layer_nn(test, device):
|
|
|
292
292
|
):
|
|
293
293
|
i = wp.tid()
|
|
294
294
|
|
|
295
|
-
f = wp.tile_load(input,
|
|
295
|
+
f = wp.tile_load(input, shape=(DIM_IN, NUM_THREADS), offset=(0, i * NUM_THREADS))
|
|
296
296
|
|
|
297
|
-
w = wp.tile_load(weights,
|
|
298
|
-
b = wp.tile_load(bias,
|
|
297
|
+
w = wp.tile_load(weights, shape=(DIM_OUT, DIM_IN))
|
|
298
|
+
b = wp.tile_load(bias, shape=(DIM_OUT, 1))
|
|
299
299
|
|
|
300
|
-
o = wp.tile_map(relu, wp.tile_matmul(w, f) + wp.tile_broadcast(b,
|
|
300
|
+
o = wp.tile_map(relu, wp.tile_matmul(w, f) + wp.tile_broadcast(b, shape=(DIM_OUT, NUM_THREADS)))
|
|
301
301
|
|
|
302
|
-
wp.tile_store(out, 0, i
|
|
302
|
+
wp.tile_store(out, o, offset=(0, i * NUM_THREADS))
|
|
303
303
|
|
|
304
304
|
with wp.ScopedDevice(device):
|
|
305
305
|
rng = np.random.default_rng(45)
|
warp/tests/test_tile_reduce.py
CHANGED
|
@@ -28,13 +28,13 @@ def tile_sum_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float
|
|
|
28
28
|
n = input.shape[1]
|
|
29
29
|
count = int(n / TILE_DIM)
|
|
30
30
|
|
|
31
|
-
s = wp.tile_zeros(
|
|
31
|
+
s = wp.tile_zeros(shape=1, dtype=float)
|
|
32
32
|
|
|
33
33
|
for j in range(count):
|
|
34
|
-
a = wp.tile_load(input
|
|
34
|
+
a = wp.tile_load(input[i], shape=TILE_DIM, offset=j * TILE_DIM)
|
|
35
35
|
s += wp.tile_sum(a) * 0.5
|
|
36
36
|
|
|
37
|
-
wp.tile_store(output,
|
|
37
|
+
wp.tile_store(output, s, offset=i)
|
|
38
38
|
|
|
39
39
|
|
|
40
40
|
def test_tile_reduce_sum(test, device):
|
|
@@ -70,10 +70,10 @@ def tile_min_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float
|
|
|
70
70
|
# output tile index
|
|
71
71
|
i = wp.tid()
|
|
72
72
|
|
|
73
|
-
a = wp.tile_load(input
|
|
73
|
+
a = wp.tile_load(input[i], shape=TILE_DIM)
|
|
74
74
|
m = wp.tile_min(a)
|
|
75
75
|
|
|
76
|
-
wp.tile_store(output,
|
|
76
|
+
wp.tile_store(output, m, offset=i)
|
|
77
77
|
|
|
78
78
|
|
|
79
79
|
def test_tile_reduce_min(test, device):
|
|
@@ -103,10 +103,10 @@ def tile_max_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float
|
|
|
103
103
|
# output tile index
|
|
104
104
|
i = wp.tid()
|
|
105
105
|
|
|
106
|
-
a = wp.tile_load(input
|
|
106
|
+
a = wp.tile_load(input[i], shape=TILE_DIM)
|
|
107
107
|
m = wp.tile_max(a)
|
|
108
108
|
|
|
109
|
-
wp.tile_store(output,
|
|
109
|
+
wp.tile_store(output, m, offset=i)
|
|
110
110
|
|
|
111
111
|
|
|
112
112
|
def test_tile_reduce_max(test, device):
|
|
@@ -136,10 +136,10 @@ def tile_reduce_custom_kernel(input: wp.array2d(dtype=float), output: wp.array(d
|
|
|
136
136
|
# output tile index
|
|
137
137
|
i = wp.tid()
|
|
138
138
|
|
|
139
|
-
a = wp.tile_load(input
|
|
139
|
+
a = wp.tile_load(input[i], shape=TILE_DIM)
|
|
140
140
|
m = wp.tile_reduce(wp.mul, a)
|
|
141
141
|
|
|
142
|
-
wp.tile_store(output,
|
|
142
|
+
wp.tile_store(output, m, offset=i)
|
|
143
143
|
|
|
144
144
|
|
|
145
145
|
def test_tile_reduce_custom(test, device):
|
|
@@ -173,10 +173,10 @@ def tile_grouped_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dty
|
|
|
173
173
|
# output tile index
|
|
174
174
|
i = wp.tid()
|
|
175
175
|
|
|
176
|
-
a = wp.tile_load(input[i],
|
|
176
|
+
a = wp.tile_load(input[i], shape=(TILE_M, TILE_N))
|
|
177
177
|
s = wp.tile_sum(a) * 0.5
|
|
178
178
|
|
|
179
|
-
wp.tile_store(output,
|
|
179
|
+
wp.tile_store(output, s, offset=i)
|
|
180
180
|
|
|
181
181
|
|
|
182
182
|
def test_tile_reduce_grouped_sum(test, device):
|
|
@@ -217,7 +217,7 @@ def tile_reduce_simt_kernel(output: wp.array(dtype=int)):
|
|
|
217
217
|
s = wp.tile_sum(t) # sum over block
|
|
218
218
|
|
|
219
219
|
# update global sum
|
|
220
|
-
wp.tile_atomic_add(output,
|
|
220
|
+
wp.tile_atomic_add(output, s)
|
|
221
221
|
|
|
222
222
|
|
|
223
223
|
def test_tile_reduce_simt(test, device):
|
|
@@ -310,10 +310,10 @@ def test_tile_untile_vector(test, device):
|
|
|
310
310
|
def tile_ones_kernel(out: wp.array(dtype=float)):
|
|
311
311
|
i = wp.tid()
|
|
312
312
|
|
|
313
|
-
t = wp.tile_ones(dtype=float,
|
|
313
|
+
t = wp.tile_ones(dtype=float, shape=(16, 16))
|
|
314
314
|
s = wp.tile_sum(t)
|
|
315
315
|
|
|
316
|
-
wp.tile_store(out,
|
|
316
|
+
wp.tile_store(out, s)
|
|
317
317
|
|
|
318
318
|
|
|
319
319
|
def test_tile_ones(test, device):
|
|
@@ -332,16 +332,20 @@ def tile_arange_kernel(out: wp.array2d(dtype=int)):
|
|
|
332
332
|
a = wp.tile_arange(17, dtype=int)
|
|
333
333
|
b = wp.tile_arange(5, 23, dtype=int)
|
|
334
334
|
c = wp.tile_arange(0, 34, 2, dtype=int)
|
|
335
|
+
d = wp.tile_arange(-1, 16, dtype=int)
|
|
336
|
+
e = wp.tile_arange(17, 0, -1, dtype=int)
|
|
335
337
|
|
|
336
|
-
wp.tile_store(out
|
|
337
|
-
wp.tile_store(out
|
|
338
|
-
wp.tile_store(out
|
|
338
|
+
wp.tile_store(out[0], a)
|
|
339
|
+
wp.tile_store(out[1], b)
|
|
340
|
+
wp.tile_store(out[2], c)
|
|
341
|
+
wp.tile_store(out[3], d)
|
|
342
|
+
wp.tile_store(out[4], e)
|
|
339
343
|
|
|
340
344
|
|
|
341
345
|
def test_tile_arange(test, device):
|
|
342
346
|
N = 17
|
|
343
347
|
|
|
344
|
-
output = wp.zeros(shape=(
|
|
348
|
+
output = wp.zeros(shape=(5, N), dtype=int, device=device)
|
|
345
349
|
|
|
346
350
|
with wp.Tape() as tape:
|
|
347
351
|
wp.launch_tiled(tile_arange_kernel, dim=[1], inputs=[output], block_dim=TILE_DIM, device=device)
|
|
@@ -349,6 +353,8 @@ def test_tile_arange(test, device):
|
|
|
349
353
|
assert_np_equal(output.numpy()[0], np.arange(17))
|
|
350
354
|
assert_np_equal(output.numpy()[1], np.arange(5, 22))
|
|
351
355
|
assert_np_equal(output.numpy()[2], np.arange(0, 34, 2))
|
|
356
|
+
assert_np_equal(output.numpy()[3], np.arange(-1, 16))
|
|
357
|
+
assert_np_equal(output.numpy()[4], np.arange(17, 0, -1))
|
|
352
358
|
|
|
353
359
|
|
|
354
360
|
devices = get_cuda_test_devices()
|