warp-lang 1.5.0__py3-none-manylinux2014_x86_64.whl → 1.6.0__py3-none-manylinux2014_x86_64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +5 -0
- warp/autograd.py +414 -191
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +40 -12
- warp/build_dll.py +13 -6
- warp/builtins.py +1124 -497
- warp/codegen.py +261 -136
- warp/config.py +1 -1
- warp/context.py +357 -119
- warp/examples/assets/square_cloth.usd +0 -0
- warp/examples/benchmarks/benchmark_gemm.py +27 -18
- warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
- warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
- warp/examples/core/example_torch.py +18 -34
- warp/examples/fem/example_apic_fluid.py +1 -0
- warp/examples/fem/example_mixed_elasticity.py +1 -1
- warp/examples/optim/example_bounce.py +1 -1
- warp/examples/optim/example_cloth_throw.py +1 -1
- warp/examples/optim/example_diffray.py +4 -15
- warp/examples/optim/example_drone.py +1 -1
- warp/examples/optim/example_softbody_properties.py +392 -0
- warp/examples/optim/example_trajectory.py +1 -3
- warp/examples/optim/example_walker.py +5 -0
- warp/examples/sim/example_cartpole.py +0 -2
- warp/examples/sim/example_cloth.py +3 -1
- warp/examples/sim/example_cloth_self_contact.py +260 -0
- warp/examples/sim/example_granular_collision_sdf.py +4 -5
- warp/examples/sim/example_jacobian_ik.py +0 -2
- warp/examples/sim/example_quadruped.py +5 -2
- warp/examples/tile/example_tile_cholesky.py +79 -0
- warp/examples/tile/example_tile_convolution.py +2 -2
- warp/examples/tile/example_tile_fft.py +2 -2
- warp/examples/tile/example_tile_filtering.py +3 -3
- warp/examples/tile/example_tile_matmul.py +4 -4
- warp/examples/tile/example_tile_mlp.py +12 -12
- warp/examples/tile/example_tile_nbody.py +180 -0
- warp/examples/tile/example_tile_walker.py +319 -0
- warp/fem/geometry/geometry.py +0 -2
- warp/math.py +147 -0
- warp/native/array.h +12 -0
- warp/native/builtin.h +0 -1
- warp/native/bvh.cpp +149 -70
- warp/native/bvh.cu +287 -68
- warp/native/bvh.h +195 -85
- warp/native/clang/clang.cpp +5 -1
- warp/native/coloring.cpp +5 -1
- warp/native/cuda_util.cpp +91 -53
- warp/native/cuda_util.h +5 -0
- warp/native/exports.h +40 -40
- warp/native/intersect.h +17 -0
- warp/native/mat.h +41 -0
- warp/native/mathdx.cpp +19 -0
- warp/native/mesh.cpp +25 -8
- warp/native/mesh.cu +153 -101
- warp/native/mesh.h +482 -403
- warp/native/quat.h +40 -0
- warp/native/solid_angle.h +7 -0
- warp/native/sort.cpp +85 -0
- warp/native/sort.cu +34 -0
- warp/native/sort.h +3 -1
- warp/native/spatial.h +11 -0
- warp/native/tile.h +1187 -669
- warp/native/tile_reduce.h +8 -6
- warp/native/vec.h +41 -0
- warp/native/warp.cpp +8 -1
- warp/native/warp.cu +263 -40
- warp/native/warp.h +19 -5
- warp/optim/linear.py +22 -4
- warp/render/render_opengl.py +130 -64
- warp/sim/__init__.py +6 -1
- warp/sim/collide.py +270 -26
- warp/sim/import_urdf.py +8 -8
- warp/sim/integrator_euler.py +25 -7
- warp/sim/integrator_featherstone.py +154 -35
- warp/sim/integrator_vbd.py +842 -40
- warp/sim/model.py +134 -72
- warp/sparse.py +1 -1
- warp/stubs.py +265 -132
- warp/tape.py +28 -30
- warp/tests/aux_test_module_unload.py +15 -0
- warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
- warp/tests/test_array.py +74 -0
- warp/tests/test_assert.py +242 -0
- warp/tests/test_codegen.py +14 -61
- warp/tests/test_collision.py +2 -2
- warp/tests/test_coloring.py +12 -2
- warp/tests/test_examples.py +12 -1
- warp/tests/test_func.py +21 -4
- warp/tests/test_grad_debug.py +87 -2
- warp/tests/test_hash_grid.py +1 -1
- warp/tests/test_ipc.py +116 -0
- warp/tests/test_lerp.py +13 -87
- warp/tests/test_mat.py +138 -167
- warp/tests/test_math.py +47 -1
- warp/tests/test_matmul.py +17 -16
- warp/tests/test_matmul_lite.py +10 -15
- warp/tests/test_mesh.py +84 -60
- warp/tests/test_mesh_query_aabb.py +165 -0
- warp/tests/test_mesh_query_point.py +328 -286
- warp/tests/test_mesh_query_ray.py +134 -121
- warp/tests/test_mlp.py +2 -2
- warp/tests/test_operators.py +43 -0
- warp/tests/test_overwrite.py +47 -2
- warp/tests/test_quat.py +77 -0
- warp/tests/test_reload.py +29 -0
- warp/tests/test_sim_grad_bounce_linear.py +204 -0
- warp/tests/test_smoothstep.py +17 -83
- warp/tests/test_static.py +19 -3
- warp/tests/test_tape.py +25 -0
- warp/tests/test_tile.py +178 -191
- warp/tests/test_tile_load.py +356 -0
- warp/tests/test_tile_mathdx.py +61 -8
- warp/tests/test_tile_mlp.py +17 -17
- warp/tests/test_tile_reduce.py +24 -18
- warp/tests/test_tile_shared_memory.py +66 -17
- warp/tests/test_tile_view.py +165 -0
- warp/tests/test_torch.py +35 -0
- warp/tests/test_utils.py +36 -24
- warp/tests/test_vec.py +110 -0
- warp/tests/unittest_suites.py +29 -4
- warp/tests/unittest_utils.py +30 -13
- warp/thirdparty/unittest_parallel.py +2 -2
- warp/types.py +411 -101
- warp/utils.py +10 -7
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/METADATA +92 -69
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/RECORD +130 -119
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/WHEEL +1 -1
- warp/examples/benchmarks/benchmark_tile.py +0 -179
- warp/native/tile_gemm.h +0 -341
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/top_level.txt +0 -0
warp/tests/test_tile.py
CHANGED
|
@@ -27,8 +27,8 @@ def tile_copy_1d_kernel(A: wp.array(dtype=float), B: wp.array(dtype=float)):
|
|
|
27
27
|
# tile index
|
|
28
28
|
i = wp.tid()
|
|
29
29
|
|
|
30
|
-
a = wp.tile_load(A,
|
|
31
|
-
wp.tile_store(B,
|
|
30
|
+
a = wp.tile_load(A, shape=TILE_N, offset=i * TILE_N)
|
|
31
|
+
wp.tile_store(B, a, offset=i * TILE_N)
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
def test_tile_copy_1d(test, device):
|
|
@@ -66,8 +66,8 @@ def tile_copy_2d_kernel(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)):
|
|
|
66
66
|
# tile index
|
|
67
67
|
i, j = wp.tid()
|
|
68
68
|
|
|
69
|
-
a = wp.tile_load(A,
|
|
70
|
-
wp.tile_store(B, i, j
|
|
69
|
+
a = wp.tile_load(A, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
70
|
+
wp.tile_store(B, a, offset=(i * TILE_M, j * TILE_N))
|
|
71
71
|
|
|
72
72
|
|
|
73
73
|
def test_tile_copy_2d(test, device):
|
|
@@ -111,11 +111,11 @@ def tile_unary_map(input: wp.array2d(dtype=float), output: wp.array2d(dtype=floa
|
|
|
111
111
|
# tile index
|
|
112
112
|
i, j = wp.tid()
|
|
113
113
|
|
|
114
|
-
a = wp.tile_load(input,
|
|
114
|
+
a = wp.tile_load(input, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
115
115
|
|
|
116
116
|
sa = wp.tile_map(wp.sin, a)
|
|
117
117
|
|
|
118
|
-
wp.tile_store(output, i, j
|
|
118
|
+
wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
|
|
119
119
|
|
|
120
120
|
|
|
121
121
|
def test_tile_unary_map(test, device):
|
|
@@ -163,12 +163,12 @@ def tile_binary_map(
|
|
|
163
163
|
# tile index
|
|
164
164
|
i, j = wp.tid()
|
|
165
165
|
|
|
166
|
-
a = wp.tile_load(input_a,
|
|
167
|
-
b = wp.tile_load(input_b,
|
|
166
|
+
a = wp.tile_load(input_a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
167
|
+
b = wp.tile_load(input_b, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
168
168
|
|
|
169
169
|
sa = wp.tile_map(binary_func, a, b)
|
|
170
170
|
|
|
171
|
-
wp.tile_store(output, i, j
|
|
171
|
+
wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
|
|
172
172
|
|
|
173
173
|
|
|
174
174
|
def test_tile_binary_map(test, device):
|
|
@@ -215,14 +215,14 @@ def test_tile_grouped_gemm(test, device):
|
|
|
215
215
|
# output tile index
|
|
216
216
|
i = wp.tid()
|
|
217
217
|
|
|
218
|
-
a = wp.tile_load(A[i],
|
|
219
|
-
b = wp.tile_load(B[i],
|
|
218
|
+
a = wp.tile_load(A[i], shape=(TILE_M, TILE_K))
|
|
219
|
+
b = wp.tile_load(B[i], shape=(TILE_K, TILE_N))
|
|
220
220
|
|
|
221
|
-
sum = wp.tile_zeros(
|
|
221
|
+
sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float32)
|
|
222
222
|
|
|
223
223
|
wp.tile_matmul(a, b, sum)
|
|
224
224
|
|
|
225
|
-
wp.tile_store(C[i],
|
|
225
|
+
wp.tile_store(C[i], sum)
|
|
226
226
|
|
|
227
227
|
batch_count = 56
|
|
228
228
|
|
|
@@ -245,7 +245,7 @@ def test_tile_grouped_gemm(test, device):
|
|
|
245
245
|
)
|
|
246
246
|
|
|
247
247
|
# TODO: 32 mismatched elements
|
|
248
|
-
assert_np_equal(C_wp.numpy(), C)
|
|
248
|
+
assert_np_equal(C_wp.numpy(), C, 1e-6)
|
|
249
249
|
|
|
250
250
|
|
|
251
251
|
@unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support")
|
|
@@ -255,7 +255,7 @@ def test_tile_gemm(test, device):
|
|
|
255
255
|
# output tile index
|
|
256
256
|
i, j = wp.tid()
|
|
257
257
|
|
|
258
|
-
sum = wp.tile_zeros(
|
|
258
|
+
sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float32)
|
|
259
259
|
|
|
260
260
|
M = A.shape[0]
|
|
261
261
|
N = B.shape[1]
|
|
@@ -264,13 +264,13 @@ def test_tile_gemm(test, device):
|
|
|
264
264
|
count = int(K / TILE_K)
|
|
265
265
|
|
|
266
266
|
for k in range(0, count):
|
|
267
|
-
a = wp.tile_load(A,
|
|
268
|
-
b = wp.tile_load(B,
|
|
267
|
+
a = wp.tile_load(A, shape=(TILE_M, TILE_K), offset=(i * TILE_M, k * TILE_K))
|
|
268
|
+
b = wp.tile_load(B, shape=(TILE_K, TILE_N), offset=(k * TILE_K, j * TILE_N))
|
|
269
269
|
|
|
270
270
|
# sum += a*b
|
|
271
271
|
wp.tile_matmul(a, b, sum)
|
|
272
272
|
|
|
273
|
-
wp.tile_store(C, i, j
|
|
273
|
+
wp.tile_store(C, sum, offset=(i * TILE_M, j * TILE_N))
|
|
274
274
|
|
|
275
275
|
M = TILE_M * 7
|
|
276
276
|
K = TILE_K * 6
|
|
@@ -309,7 +309,7 @@ def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=floa
|
|
|
309
309
|
# output tile index
|
|
310
310
|
i = wp.tid()
|
|
311
311
|
|
|
312
|
-
a = wp.tile_load(input[i],
|
|
312
|
+
a = wp.tile_load(input[i], shape=(TILE_M, TILE_N))
|
|
313
313
|
|
|
314
314
|
# neg
|
|
315
315
|
b = -a
|
|
@@ -323,7 +323,7 @@ def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=floa
|
|
|
323
323
|
# add tiles
|
|
324
324
|
e = a + d
|
|
325
325
|
|
|
326
|
-
wp.tile_store(output[i],
|
|
326
|
+
wp.tile_store(output[i], e)
|
|
327
327
|
|
|
328
328
|
|
|
329
329
|
def test_tile_operators(test, device):
|
|
@@ -358,10 +358,10 @@ def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float
|
|
|
358
358
|
# output tile index
|
|
359
359
|
i = wp.tid()
|
|
360
360
|
|
|
361
|
-
a = wp.tile_load(input[i],
|
|
361
|
+
a = wp.tile_load(input[i], shape=(TILE_M, TILE_N))
|
|
362
362
|
s = wp.tile_sum(a) * 0.5
|
|
363
363
|
|
|
364
|
-
wp.tile_store(output,
|
|
364
|
+
wp.tile_store(output, s, offset=i)
|
|
365
365
|
|
|
366
366
|
|
|
367
367
|
def test_tile_sum(test, device):
|
|
@@ -398,48 +398,138 @@ def test_tile_sum(test, device):
|
|
|
398
398
|
assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5)
|
|
399
399
|
|
|
400
400
|
|
|
401
|
+
def test_tile_sum_launch(test, device):
|
|
402
|
+
batch_count = 56
|
|
403
|
+
|
|
404
|
+
M = TILE_M
|
|
405
|
+
N = TILE_N
|
|
406
|
+
|
|
407
|
+
rng = np.random.default_rng(42)
|
|
408
|
+
input = rng.random((batch_count, M, N), dtype=np.float32)
|
|
409
|
+
|
|
410
|
+
input_wp = wp.array(input, requires_grad=True, device=device)
|
|
411
|
+
output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
|
|
412
|
+
|
|
413
|
+
cmd = wp.launch_tiled(
|
|
414
|
+
tile_sum_kernel,
|
|
415
|
+
dim=[batch_count],
|
|
416
|
+
inputs=[input_wp, output_wp],
|
|
417
|
+
block_dim=TILE_DIM,
|
|
418
|
+
device=device,
|
|
419
|
+
record_cmd=True,
|
|
420
|
+
)
|
|
421
|
+
cmd.launch()
|
|
422
|
+
|
|
423
|
+
sum_wp = output_wp.numpy()
|
|
424
|
+
|
|
425
|
+
for i in range(batch_count):
|
|
426
|
+
sum_np = np.sum(input[i]) * 0.5
|
|
427
|
+
test.assertAlmostEqual(sum_wp[i], sum_np, places=5)
|
|
428
|
+
|
|
429
|
+
output_wp.grad.fill_(1.0)
|
|
430
|
+
|
|
431
|
+
wp.launch_tiled(
|
|
432
|
+
tile_sum_kernel,
|
|
433
|
+
dim=[batch_count],
|
|
434
|
+
inputs=[input_wp, output_wp],
|
|
435
|
+
adj_inputs=[input_wp.grad, output_wp.grad],
|
|
436
|
+
block_dim=TILE_DIM,
|
|
437
|
+
device=device,
|
|
438
|
+
adjoint=True,
|
|
439
|
+
)
|
|
440
|
+
|
|
441
|
+
assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5)
|
|
442
|
+
|
|
443
|
+
|
|
401
444
|
@wp.kernel
|
|
402
|
-
def
|
|
403
|
-
|
|
404
|
-
i = wp.tid()
|
|
445
|
+
def test_tile_extract_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
|
|
446
|
+
i, j, x, y = wp.tid()
|
|
405
447
|
|
|
406
|
-
|
|
448
|
+
tile = wp.tile_load(a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
407
449
|
|
|
408
|
-
#
|
|
409
|
-
|
|
410
|
-
for i in range(TILE_M):
|
|
411
|
-
for j in range(TILE_N):
|
|
412
|
-
output[i, j] = t[i, j]
|
|
450
|
+
# compute sum of array sub tile
|
|
451
|
+
wp.atomic_add(b, i, j, wp.tile_extract(tile, x, y))
|
|
413
452
|
|
|
414
453
|
|
|
415
454
|
def test_tile_extract(test, device):
|
|
416
|
-
|
|
417
|
-
N = TILE_N
|
|
455
|
+
block_dim = 16
|
|
418
456
|
|
|
419
|
-
|
|
420
|
-
input = rng.random((M, N), dtype=np.float32)
|
|
457
|
+
input = np.arange(TILE_M * TILE_N * 4).reshape((TILE_M * 2, TILE_N * 2))
|
|
421
458
|
|
|
422
|
-
|
|
423
|
-
|
|
459
|
+
a = wp.array(input, dtype=float, requires_grad=True, device=device)
|
|
460
|
+
b = wp.zeros((2, 2), dtype=float, requires_grad=True, device=device)
|
|
424
461
|
|
|
425
462
|
with wp.Tape() as tape:
|
|
426
|
-
wp.
|
|
463
|
+
wp.launch(
|
|
464
|
+
test_tile_extract_kernel, dim=[2, 2, TILE_M, TILE_N], inputs=[a, b], block_dim=block_dim, device=device
|
|
465
|
+
)
|
|
427
466
|
|
|
428
|
-
|
|
467
|
+
# compute sum of each sub-block
|
|
468
|
+
sums = input.reshape(2, input.shape[0] // 2, 2, input.shape[1] // 2).sum(axis=(1, 3))
|
|
429
469
|
|
|
430
|
-
|
|
470
|
+
assert_np_equal(b.numpy(), sums)
|
|
471
|
+
|
|
472
|
+
b.grad.fill_(1.0)
|
|
431
473
|
|
|
432
474
|
tape.backward()
|
|
433
475
|
|
|
434
|
-
|
|
476
|
+
expected_grad = np.ones_like(input)
|
|
477
|
+
assert_np_equal(a.grad.numpy(), expected_grad)
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
@wp.kernel
|
|
481
|
+
def test_tile_extract_repeated_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
|
|
482
|
+
i, j, x, y = wp.tid()
|
|
483
|
+
|
|
484
|
+
tile = wp.tile_load(a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
485
|
+
|
|
486
|
+
# each thread extracts the first element of the sub-tile
|
|
487
|
+
# and accumulates the value onto the output
|
|
488
|
+
wp.atomic_add(b, i, j, wp.tile_extract(tile, 0, 0))
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def test_tile_extract_repeated(test, device):
|
|
492
|
+
block_dim = 16
|
|
493
|
+
|
|
494
|
+
input = np.arange(TILE_M * TILE_N * 4).reshape((TILE_M * 2, TILE_N * 2))
|
|
495
|
+
|
|
496
|
+
a = wp.array(input, dtype=float, requires_grad=True, device=device)
|
|
497
|
+
b = wp.zeros((2, 2), dtype=float, requires_grad=True, device=device)
|
|
498
|
+
|
|
499
|
+
with wp.Tape() as tape:
|
|
500
|
+
wp.launch(
|
|
501
|
+
test_tile_extract_repeated_kernel,
|
|
502
|
+
dim=[2, 2, TILE_M, TILE_N],
|
|
503
|
+
inputs=[a, b],
|
|
504
|
+
block_dim=block_dim,
|
|
505
|
+
device=device,
|
|
506
|
+
)
|
|
507
|
+
|
|
508
|
+
# each thread adds the first element to the output
|
|
509
|
+
scale = TILE_M * TILE_N
|
|
510
|
+
sums = np.array([[input[0, 0], input[0, TILE_N]], [input[TILE_M, 0], input[TILE_M, TILE_N]]]) * scale
|
|
511
|
+
|
|
512
|
+
assert_np_equal(b.numpy(), sums)
|
|
513
|
+
|
|
514
|
+
b.grad.fill_(1.0)
|
|
515
|
+
|
|
516
|
+
tape.backward()
|
|
517
|
+
|
|
518
|
+
expected_grad = np.zeros_like(input)
|
|
519
|
+
expected_grad[0, 0] = scale
|
|
520
|
+
expected_grad[0, TILE_N] = scale
|
|
521
|
+
expected_grad[TILE_M, 0] = scale
|
|
522
|
+
expected_grad[TILE_M, TILE_N] = scale
|
|
523
|
+
|
|
524
|
+
assert_np_equal(a.grad.numpy(), expected_grad)
|
|
435
525
|
|
|
436
526
|
|
|
437
527
|
@wp.kernel
|
|
438
528
|
def test_tile_transpose_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
|
|
439
|
-
x = wp.tile_load(input,
|
|
529
|
+
x = wp.tile_load(input, shape=(TILE_M, TILE_N))
|
|
440
530
|
y = wp.tile_transpose(x)
|
|
441
531
|
|
|
442
|
-
wp.tile_store(output,
|
|
532
|
+
wp.tile_store(output, y)
|
|
443
533
|
|
|
444
534
|
|
|
445
535
|
def test_tile_transpose(test, device):
|
|
@@ -456,13 +546,13 @@ def test_tile_transpose(test, device):
|
|
|
456
546
|
def test_tile_transpose_matmul(test, device):
|
|
457
547
|
@wp.kernel
|
|
458
548
|
def test_tile_transpose_matmul_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
|
|
459
|
-
x = wp.tile_load(input,
|
|
549
|
+
x = wp.tile_load(input, shape=(TILE_M, TILE_N))
|
|
460
550
|
y = wp.tile_transpose(x)
|
|
461
551
|
|
|
462
|
-
z = wp.tile_zeros(dtype=float,
|
|
552
|
+
z = wp.tile_zeros(dtype=float, shape=(TILE_N, TILE_N))
|
|
463
553
|
wp.tile_matmul(y, x, z)
|
|
464
554
|
|
|
465
|
-
wp.tile_store(output,
|
|
555
|
+
wp.tile_store(output, z)
|
|
466
556
|
|
|
467
557
|
rng = np.random.default_rng(42)
|
|
468
558
|
input = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32), device=device)
|
|
@@ -477,13 +567,13 @@ def test_tile_transpose_matmul(test, device):
|
|
|
477
567
|
def test_tile_broadcast_add_kernel(
|
|
478
568
|
input_a: wp.array2d(dtype=float), input_b: wp.array(dtype=float), output: wp.array2d(dtype=float)
|
|
479
569
|
):
|
|
480
|
-
a = wp.tile_load(input_a,
|
|
481
|
-
b = wp.tile_load(input_b,
|
|
570
|
+
a = wp.tile_load(input_a, shape=(10, 10))
|
|
571
|
+
b = wp.tile_load(input_b, shape=10)
|
|
482
572
|
|
|
483
|
-
c = wp.tile_broadcast(b, 10, 10)
|
|
573
|
+
c = wp.tile_broadcast(b, shape=(10, 10))
|
|
484
574
|
d = a + c
|
|
485
575
|
|
|
486
|
-
wp.tile_store(output,
|
|
576
|
+
wp.tile_store(output, d)
|
|
487
577
|
|
|
488
578
|
|
|
489
579
|
def test_tile_broadcast_add(test, device):
|
|
@@ -501,13 +591,13 @@ def test_tile_broadcast_add(test, device):
|
|
|
501
591
|
|
|
502
592
|
@wp.kernel
|
|
503
593
|
def test_tile_broadcast_grad_kernel(a: wp.array(dtype=float), b: wp.array2d(dtype=float)):
|
|
504
|
-
x = wp.tile_load(a,
|
|
505
|
-
y = wp.tile_broadcast(x,
|
|
594
|
+
x = wp.tile_load(a, shape=5)
|
|
595
|
+
y = wp.tile_broadcast(x, shape=(5, 5))
|
|
506
596
|
|
|
507
|
-
w = wp.tile_ones(dtype=float,
|
|
597
|
+
w = wp.tile_ones(dtype=float, shape=(5, 5))
|
|
508
598
|
z = w + y
|
|
509
599
|
|
|
510
|
-
wp.tile_store(b,
|
|
600
|
+
wp.tile_store(b, z)
|
|
511
601
|
|
|
512
602
|
|
|
513
603
|
def test_tile_broadcast_grad(test, device):
|
|
@@ -524,152 +614,48 @@ def test_tile_broadcast_grad(test, device):
|
|
|
524
614
|
assert_np_equal(a.grad.numpy(), np.ones(5) * 5.0)
|
|
525
615
|
|
|
526
616
|
|
|
527
|
-
TILE_VIEW_M = 16
|
|
528
|
-
TILE_VIEW_N = 128
|
|
529
|
-
|
|
530
|
-
|
|
531
617
|
@wp.kernel
|
|
532
|
-
def
|
|
533
|
-
|
|
534
|
-
|
|
535
|
-
|
|
536
|
-
|
|
537
|
-
for i in range(TILE_VIEW_M):
|
|
538
|
-
# create a view on original array and store
|
|
539
|
-
row = a[i]
|
|
540
|
-
wp.tile_store(dst, i, 0, row)
|
|
541
|
-
|
|
542
|
-
|
|
543
|
-
def test_tile_view(test, device):
|
|
544
|
-
rng = np.random.default_rng(42)
|
|
545
|
-
|
|
546
|
-
a = wp.array(rng.random((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
|
|
547
|
-
b = wp.array(np.zeros((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
|
|
618
|
+
def tile_len_kernel(
|
|
619
|
+
a: wp.array(dtype=float, ndim=2),
|
|
620
|
+
out: wp.array(dtype=int),
|
|
621
|
+
):
|
|
622
|
+
x = wp.tile_load(a, shape=(TILE_M, TILE_N))
|
|
548
623
|
|
|
549
|
-
|
|
550
|
-
|
|
624
|
+
length = wp.static(len(x))
|
|
625
|
+
wp.expect_eq(wp.static(len(x)), TILE_M)
|
|
626
|
+
out[0] = wp.static(len(x))
|
|
551
627
|
|
|
552
|
-
assert_np_equal(b.numpy(), a.numpy())
|
|
553
628
|
|
|
554
|
-
|
|
555
|
-
|
|
629
|
+
def test_tile_len(test, device):
|
|
630
|
+
a = wp.zeros((TILE_M, TILE_N), dtype=float, device=device)
|
|
631
|
+
out = wp.empty(1, dtype=int, device=device)
|
|
632
|
+
wp.launch_tiled(
|
|
633
|
+
tile_len_kernel,
|
|
634
|
+
dim=(1,),
|
|
635
|
+
inputs=(a,),
|
|
636
|
+
outputs=(out,),
|
|
637
|
+
block_dim=32,
|
|
638
|
+
device=device,
|
|
639
|
+
)
|
|
556
640
|
|
|
557
|
-
|
|
641
|
+
test.assertEqual(out.numpy()[0], TILE_M)
|
|
558
642
|
|
|
559
643
|
|
|
560
644
|
@wp.kernel
|
|
561
|
-
def
|
|
562
|
-
#
|
|
563
|
-
a = wp.
|
|
564
|
-
|
|
565
|
-
|
|
566
|
-
# copy the source array row by row
|
|
567
|
-
for i in range(TILE_VIEW_M):
|
|
568
|
-
# create views onto source and dest rows
|
|
569
|
-
row_src = a[i]
|
|
570
|
-
row_dst = b[i]
|
|
571
|
-
|
|
572
|
-
# copy onto dest row
|
|
573
|
-
wp.tile_assign(row_dst, 0, 0, row_src)
|
|
574
|
-
|
|
575
|
-
wp.tile_store(dst, 0, 0, b)
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
def test_tile_assign(test, device):
|
|
579
|
-
rng = np.random.default_rng(42)
|
|
580
|
-
|
|
581
|
-
a = wp.array(rng.random((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
|
|
582
|
-
b = wp.array(np.zeros((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
|
|
583
|
-
|
|
584
|
-
with wp.Tape() as tape:
|
|
585
|
-
wp.launch_tiled(test_tile_assign_kernel, dim=[1], inputs=[a, b], block_dim=32, device=device)
|
|
586
|
-
|
|
587
|
-
assert_np_equal(b.numpy(), a.numpy())
|
|
588
|
-
|
|
589
|
-
b.grad = wp.ones_like(b, device=device)
|
|
590
|
-
tape.backward()
|
|
591
|
-
|
|
592
|
-
assert_np_equal(a.grad.numpy(), np.ones_like(a.numpy()))
|
|
593
|
-
|
|
594
|
-
|
|
595
|
-
# #-----------------------------------------
|
|
596
|
-
# # center of mass computation
|
|
645
|
+
def test_tile_print_kernel():
|
|
646
|
+
# shared tile
|
|
647
|
+
a = wp.tile_ones(shape=(4, 3), dtype=float, storage="shared")
|
|
648
|
+
# register tile
|
|
649
|
+
b = wp.tile_ones(shape=(4, 3), dtype=float)
|
|
597
650
|
|
|
598
|
-
|
|
599
|
-
|
|
651
|
+
print(a)
|
|
652
|
+
print(b)
|
|
600
653
|
|
|
601
|
-
# com = wp.tile_zeros(dtype=wp.vec3, M=1)
|
|
602
654
|
|
|
603
|
-
|
|
604
|
-
|
|
655
|
+
def test_tile_print(test, device):
|
|
656
|
+
wp.launch_tiled(test_tile_print_kernel, dim=1, inputs=[], block_dim=64, device=device)
|
|
657
|
+
wp.synchronize()
|
|
605
658
|
|
|
606
|
-
# count = wp.min(N, end-i)
|
|
607
|
-
|
|
608
|
-
# idx = wp.tile_load(indices, i, N, max_col=count)
|
|
609
|
-
# p = wp.tile_load(points, idx, max_col=count)
|
|
610
|
-
|
|
611
|
-
# com += wp.tile_sum(p)
|
|
612
|
-
|
|
613
|
-
|
|
614
|
-
# wp.tile_store(out[i], com)
|
|
615
|
-
|
|
616
|
-
|
|
617
|
-
# #-------------------------------------------
|
|
618
|
-
# # compute deformation gradient
|
|
619
|
-
|
|
620
|
-
# i =
|
|
621
|
-
# j =
|
|
622
|
-
# k =
|
|
623
|
-
# l =
|
|
624
|
-
|
|
625
|
-
# f = wp.tile(F) # generate a block size tile of feature vectors
|
|
626
|
-
|
|
627
|
-
# # layer 1
|
|
628
|
-
# w1 = wp.tile_load(weights)
|
|
629
|
-
# b1 = wp.tile_load(bias)
|
|
630
|
-
|
|
631
|
-
# z = wp.tile_matmul(w1, f) + b1
|
|
632
|
-
# z = wp.tile_map(relu, z)
|
|
633
|
-
|
|
634
|
-
# # layer 2
|
|
635
|
-
# w2 = wp.tile_load(weights)
|
|
636
|
-
# b2 = wp.tile_load(bias)
|
|
637
|
-
|
|
638
|
-
# z = wp.tile_matmul(w2, z) + b2
|
|
639
|
-
# z = wp.tile_map(relu, z)
|
|
640
|
-
|
|
641
|
-
# o = wp.untile(f)
|
|
642
|
-
|
|
643
|
-
|
|
644
|
-
# #----------------------------------
|
|
645
|
-
# # MLP with helper function for linear layers
|
|
646
|
-
# # where shape is only partially known
|
|
647
|
-
# # at compile time, and the other dims
|
|
648
|
-
# # are inferred from the input vector
|
|
649
|
-
|
|
650
|
-
# f = wp.tile(F)
|
|
651
|
-
|
|
652
|
-
# z = wp.tile_linear(weights1, bias1, f, hidden=16)
|
|
653
|
-
# z = wp.tile_map(relu, z)
|
|
654
|
-
|
|
655
|
-
# z = wp.tile_linear(weights2, bias2, f, hidden=8)
|
|
656
|
-
# z = wp.tile_map(relu, z)
|
|
657
|
-
|
|
658
|
-
# z = wp.tile_linear(weights3, bias3, f, hidden=4)
|
|
659
|
-
# z = wp.tile_map(relu, z)
|
|
660
|
-
|
|
661
|
-
# o = wp.untile(z)
|
|
662
|
-
|
|
663
|
-
|
|
664
|
-
# #----------------------------------
|
|
665
|
-
# # softmax
|
|
666
|
-
|
|
667
|
-
# def softmax(z: Any):
|
|
668
|
-
|
|
669
|
-
# e = wp.tile_map(wp.exp, z)
|
|
670
|
-
# s = wp.tile_sum(e, dim=0)
|
|
671
|
-
|
|
672
|
-
# return z/s[0]
|
|
673
659
|
|
|
674
660
|
devices = get_cuda_test_devices()
|
|
675
661
|
|
|
@@ -688,12 +674,13 @@ add_function_test(TestTile, "test_tile_transpose", test_tile_transpose, devices=
|
|
|
688
674
|
add_function_test(TestTile, "test_tile_transpose_matmul", test_tile_transpose_matmul, devices=devices)
|
|
689
675
|
add_function_test(TestTile, "test_tile_operators", test_tile_operators, devices=devices)
|
|
690
676
|
add_function_test(TestTile, "test_tile_sum", test_tile_sum, devices=devices)
|
|
677
|
+
add_function_test(TestTile, "test_tile_sum_launch", test_tile_sum_launch, devices=devices)
|
|
691
678
|
add_function_test(TestTile, "test_tile_extract", test_tile_extract, devices=devices)
|
|
679
|
+
add_function_test(TestTile, "test_tile_extract_repeated", test_tile_extract_repeated, devices=devices)
|
|
692
680
|
add_function_test(TestTile, "test_tile_broadcast_add", test_tile_broadcast_add, devices=devices)
|
|
693
681
|
add_function_test(TestTile, "test_tile_broadcast_grad", test_tile_broadcast_grad, devices=devices)
|
|
694
|
-
add_function_test(TestTile, "
|
|
695
|
-
add_function_test(TestTile, "
|
|
696
|
-
|
|
682
|
+
add_function_test(TestTile, "test_tile_len", test_tile_len, devices=devices)
|
|
683
|
+
add_function_test(TestTile, "test_tile_print", test_tile_print, devices=devices, check_output=False)
|
|
697
684
|
|
|
698
685
|
if __name__ == "__main__":
|
|
699
686
|
wp.clear_kernel_cache()
|