warp-lang 1.5.1__py3-none-win_amd64.whl → 1.6.1__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +5 -0
- warp/autograd.py +414 -191
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +40 -12
- warp/build_dll.py +13 -6
- warp/builtins.py +1077 -481
- warp/codegen.py +250 -122
- warp/config.py +65 -21
- warp/context.py +500 -149
- warp/examples/assets/square_cloth.usd +0 -0
- warp/examples/benchmarks/benchmark_gemm.py +27 -18
- warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
- warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
- warp/examples/core/example_marching_cubes.py +1 -1
- warp/examples/core/example_mesh.py +1 -1
- warp/examples/core/example_torch.py +18 -34
- warp/examples/core/example_wave.py +1 -1
- warp/examples/fem/example_apic_fluid.py +1 -0
- warp/examples/fem/example_mixed_elasticity.py +1 -1
- warp/examples/optim/example_bounce.py +1 -1
- warp/examples/optim/example_cloth_throw.py +1 -1
- warp/examples/optim/example_diffray.py +4 -15
- warp/examples/optim/example_drone.py +1 -1
- warp/examples/optim/example_softbody_properties.py +392 -0
- warp/examples/optim/example_trajectory.py +1 -3
- warp/examples/optim/example_walker.py +5 -0
- warp/examples/sim/example_cartpole.py +0 -2
- warp/examples/sim/example_cloth_self_contact.py +314 -0
- warp/examples/sim/example_granular_collision_sdf.py +4 -5
- warp/examples/sim/example_jacobian_ik.py +0 -2
- warp/examples/sim/example_quadruped.py +5 -2
- warp/examples/tile/example_tile_cholesky.py +79 -0
- warp/examples/tile/example_tile_convolution.py +2 -2
- warp/examples/tile/example_tile_fft.py +2 -2
- warp/examples/tile/example_tile_filtering.py +3 -3
- warp/examples/tile/example_tile_matmul.py +4 -4
- warp/examples/tile/example_tile_mlp.py +12 -12
- warp/examples/tile/example_tile_nbody.py +191 -0
- warp/examples/tile/example_tile_walker.py +319 -0
- warp/math.py +147 -0
- warp/native/array.h +12 -0
- warp/native/builtin.h +0 -1
- warp/native/bvh.cpp +149 -70
- warp/native/bvh.cu +287 -68
- warp/native/bvh.h +195 -85
- warp/native/clang/clang.cpp +6 -2
- warp/native/crt.h +1 -0
- warp/native/cuda_util.cpp +35 -0
- warp/native/cuda_util.h +5 -0
- warp/native/exports.h +40 -40
- warp/native/intersect.h +17 -0
- warp/native/mat.h +57 -3
- warp/native/mathdx.cpp +19 -0
- warp/native/mesh.cpp +25 -8
- warp/native/mesh.cu +153 -101
- warp/native/mesh.h +482 -403
- warp/native/quat.h +40 -0
- warp/native/solid_angle.h +7 -0
- warp/native/sort.cpp +85 -0
- warp/native/sort.cu +34 -0
- warp/native/sort.h +3 -1
- warp/native/spatial.h +11 -0
- warp/native/tile.h +1189 -664
- warp/native/tile_reduce.h +8 -6
- warp/native/vec.h +41 -0
- warp/native/warp.cpp +8 -1
- warp/native/warp.cu +263 -40
- warp/native/warp.h +19 -5
- warp/optim/linear.py +22 -4
- warp/render/render_opengl.py +132 -59
- warp/render/render_usd.py +10 -2
- warp/sim/__init__.py +6 -1
- warp/sim/collide.py +289 -32
- warp/sim/import_urdf.py +20 -5
- warp/sim/integrator_euler.py +25 -7
- warp/sim/integrator_featherstone.py +147 -35
- warp/sim/integrator_vbd.py +842 -40
- warp/sim/model.py +173 -112
- warp/sim/render.py +2 -2
- warp/stubs.py +249 -116
- warp/tape.py +28 -30
- warp/tests/aux_test_module_unload.py +15 -0
- warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
- warp/tests/test_array.py +100 -0
- warp/tests/test_assert.py +242 -0
- warp/tests/test_codegen.py +14 -61
- warp/tests/test_collision.py +8 -8
- warp/tests/test_examples.py +16 -1
- warp/tests/test_grad_debug.py +87 -2
- warp/tests/test_hash_grid.py +1 -1
- warp/tests/test_ipc.py +116 -0
- warp/tests/test_launch.py +77 -26
- warp/tests/test_mat.py +213 -168
- warp/tests/test_math.py +47 -1
- warp/tests/test_matmul.py +11 -7
- warp/tests/test_matmul_lite.py +4 -4
- warp/tests/test_mesh.py +84 -60
- warp/tests/test_mesh_query_aabb.py +165 -0
- warp/tests/test_mesh_query_point.py +328 -286
- warp/tests/test_mesh_query_ray.py +134 -121
- warp/tests/test_mlp.py +2 -2
- warp/tests/test_operators.py +43 -0
- warp/tests/test_overwrite.py +6 -5
- warp/tests/test_quat.py +77 -0
- warp/tests/test_reload.py +29 -0
- warp/tests/test_sim_grad_bounce_linear.py +204 -0
- warp/tests/test_static.py +16 -0
- warp/tests/test_tape.py +25 -0
- warp/tests/test_tile.py +134 -191
- warp/tests/test_tile_load.py +399 -0
- warp/tests/test_tile_mathdx.py +61 -8
- warp/tests/test_tile_mlp.py +17 -17
- warp/tests/test_tile_reduce.py +24 -18
- warp/tests/test_tile_shared_memory.py +66 -17
- warp/tests/test_tile_view.py +165 -0
- warp/tests/test_torch.py +35 -0
- warp/tests/test_utils.py +36 -24
- warp/tests/test_vec.py +110 -0
- warp/tests/unittest_suites.py +29 -4
- warp/tests/unittest_utils.py +30 -11
- warp/thirdparty/unittest_parallel.py +5 -2
- warp/types.py +419 -111
- warp/utils.py +9 -5
- {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/METADATA +86 -45
- {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/RECORD +129 -118
- {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/WHEEL +1 -1
- warp/examples/benchmarks/benchmark_tile.py +0 -179
- warp/native/tile_gemm.h +0 -341
- {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/LICENSE.md +0 -0
- {warp_lang-1.5.1.dist-info → warp_lang-1.6.1.dist-info}/top_level.txt +0 -0
warp/tests/test_tile.py
CHANGED
|
@@ -27,8 +27,8 @@ def tile_copy_1d_kernel(A: wp.array(dtype=float), B: wp.array(dtype=float)):
|
|
|
27
27
|
# tile index
|
|
28
28
|
i = wp.tid()
|
|
29
29
|
|
|
30
|
-
a = wp.tile_load(A,
|
|
31
|
-
wp.tile_store(B,
|
|
30
|
+
a = wp.tile_load(A, shape=TILE_N, offset=i * TILE_N)
|
|
31
|
+
wp.tile_store(B, a, offset=i * TILE_N)
|
|
32
32
|
|
|
33
33
|
|
|
34
34
|
def test_tile_copy_1d(test, device):
|
|
@@ -66,8 +66,8 @@ def tile_copy_2d_kernel(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)):
|
|
|
66
66
|
# tile index
|
|
67
67
|
i, j = wp.tid()
|
|
68
68
|
|
|
69
|
-
a = wp.tile_load(A,
|
|
70
|
-
wp.tile_store(B, i, j
|
|
69
|
+
a = wp.tile_load(A, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
70
|
+
wp.tile_store(B, a, offset=(i * TILE_M, j * TILE_N))
|
|
71
71
|
|
|
72
72
|
|
|
73
73
|
def test_tile_copy_2d(test, device):
|
|
@@ -111,11 +111,11 @@ def tile_unary_map(input: wp.array2d(dtype=float), output: wp.array2d(dtype=floa
|
|
|
111
111
|
# tile index
|
|
112
112
|
i, j = wp.tid()
|
|
113
113
|
|
|
114
|
-
a = wp.tile_load(input,
|
|
114
|
+
a = wp.tile_load(input, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
115
115
|
|
|
116
116
|
sa = wp.tile_map(wp.sin, a)
|
|
117
117
|
|
|
118
|
-
wp.tile_store(output, i, j
|
|
118
|
+
wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
|
|
119
119
|
|
|
120
120
|
|
|
121
121
|
def test_tile_unary_map(test, device):
|
|
@@ -163,12 +163,12 @@ def tile_binary_map(
|
|
|
163
163
|
# tile index
|
|
164
164
|
i, j = wp.tid()
|
|
165
165
|
|
|
166
|
-
a = wp.tile_load(input_a,
|
|
167
|
-
b = wp.tile_load(input_b,
|
|
166
|
+
a = wp.tile_load(input_a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
167
|
+
b = wp.tile_load(input_b, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
168
168
|
|
|
169
169
|
sa = wp.tile_map(binary_func, a, b)
|
|
170
170
|
|
|
171
|
-
wp.tile_store(output, i, j
|
|
171
|
+
wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
|
|
172
172
|
|
|
173
173
|
|
|
174
174
|
def test_tile_binary_map(test, device):
|
|
@@ -215,14 +215,14 @@ def test_tile_grouped_gemm(test, device):
|
|
|
215
215
|
# output tile index
|
|
216
216
|
i = wp.tid()
|
|
217
217
|
|
|
218
|
-
a = wp.tile_load(A[i],
|
|
219
|
-
b = wp.tile_load(B[i],
|
|
218
|
+
a = wp.tile_load(A[i], shape=(TILE_M, TILE_K))
|
|
219
|
+
b = wp.tile_load(B[i], shape=(TILE_K, TILE_N))
|
|
220
220
|
|
|
221
|
-
sum = wp.tile_zeros(
|
|
221
|
+
sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float32)
|
|
222
222
|
|
|
223
223
|
wp.tile_matmul(a, b, sum)
|
|
224
224
|
|
|
225
|
-
wp.tile_store(C[i],
|
|
225
|
+
wp.tile_store(C[i], sum)
|
|
226
226
|
|
|
227
227
|
batch_count = 56
|
|
228
228
|
|
|
@@ -245,7 +245,7 @@ def test_tile_grouped_gemm(test, device):
|
|
|
245
245
|
)
|
|
246
246
|
|
|
247
247
|
# TODO: 32 mismatched elements
|
|
248
|
-
assert_np_equal(C_wp.numpy(), C)
|
|
248
|
+
assert_np_equal(C_wp.numpy(), C, 1e-6)
|
|
249
249
|
|
|
250
250
|
|
|
251
251
|
@unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support")
|
|
@@ -255,7 +255,7 @@ def test_tile_gemm(test, device):
|
|
|
255
255
|
# output tile index
|
|
256
256
|
i, j = wp.tid()
|
|
257
257
|
|
|
258
|
-
sum = wp.tile_zeros(
|
|
258
|
+
sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float32)
|
|
259
259
|
|
|
260
260
|
M = A.shape[0]
|
|
261
261
|
N = B.shape[1]
|
|
@@ -264,13 +264,13 @@ def test_tile_gemm(test, device):
|
|
|
264
264
|
count = int(K / TILE_K)
|
|
265
265
|
|
|
266
266
|
for k in range(0, count):
|
|
267
|
-
a = wp.tile_load(A,
|
|
268
|
-
b = wp.tile_load(B,
|
|
267
|
+
a = wp.tile_load(A, shape=(TILE_M, TILE_K), offset=(i * TILE_M, k * TILE_K))
|
|
268
|
+
b = wp.tile_load(B, shape=(TILE_K, TILE_N), offset=(k * TILE_K, j * TILE_N))
|
|
269
269
|
|
|
270
270
|
# sum += a*b
|
|
271
271
|
wp.tile_matmul(a, b, sum)
|
|
272
272
|
|
|
273
|
-
wp.tile_store(C, i, j
|
|
273
|
+
wp.tile_store(C, sum, offset=(i * TILE_M, j * TILE_N))
|
|
274
274
|
|
|
275
275
|
M = TILE_M * 7
|
|
276
276
|
K = TILE_K * 6
|
|
@@ -309,7 +309,7 @@ def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=floa
|
|
|
309
309
|
# output tile index
|
|
310
310
|
i = wp.tid()
|
|
311
311
|
|
|
312
|
-
a = wp.tile_load(input[i],
|
|
312
|
+
a = wp.tile_load(input[i], shape=(TILE_M, TILE_N))
|
|
313
313
|
|
|
314
314
|
# neg
|
|
315
315
|
b = -a
|
|
@@ -323,7 +323,7 @@ def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=floa
|
|
|
323
323
|
# add tiles
|
|
324
324
|
e = a + d
|
|
325
325
|
|
|
326
|
-
wp.tile_store(output[i],
|
|
326
|
+
wp.tile_store(output[i], e)
|
|
327
327
|
|
|
328
328
|
|
|
329
329
|
def test_tile_operators(test, device):
|
|
@@ -358,10 +358,10 @@ def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float
|
|
|
358
358
|
# output tile index
|
|
359
359
|
i = wp.tid()
|
|
360
360
|
|
|
361
|
-
a = wp.tile_load(input[i],
|
|
361
|
+
a = wp.tile_load(input[i], shape=(TILE_M, TILE_N))
|
|
362
362
|
s = wp.tile_sum(a) * 0.5
|
|
363
363
|
|
|
364
|
-
wp.tile_store(output,
|
|
364
|
+
wp.tile_store(output, s, offset=i)
|
|
365
365
|
|
|
366
366
|
|
|
367
367
|
def test_tile_sum(test, device):
|
|
@@ -442,47 +442,94 @@ def test_tile_sum_launch(test, device):
|
|
|
442
442
|
|
|
443
443
|
|
|
444
444
|
@wp.kernel
|
|
445
|
-
def
|
|
446
|
-
|
|
447
|
-
i = wp.tid()
|
|
445
|
+
def test_tile_extract_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
|
|
446
|
+
i, j, x, y = wp.tid()
|
|
448
447
|
|
|
449
|
-
|
|
448
|
+
tile = wp.tile_load(a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
450
449
|
|
|
451
|
-
#
|
|
452
|
-
|
|
453
|
-
for i in range(TILE_M):
|
|
454
|
-
for j in range(TILE_N):
|
|
455
|
-
output[i, j] = t[i, j]
|
|
450
|
+
# compute sum of array sub tile
|
|
451
|
+
wp.atomic_add(b, i, j, wp.tile_extract(tile, x, y))
|
|
456
452
|
|
|
457
453
|
|
|
458
454
|
def test_tile_extract(test, device):
|
|
459
|
-
|
|
460
|
-
N = TILE_N
|
|
455
|
+
block_dim = 16
|
|
461
456
|
|
|
462
|
-
|
|
463
|
-
input = rng.random((M, N), dtype=np.float32)
|
|
457
|
+
input = np.arange(TILE_M * TILE_N * 4).reshape((TILE_M * 2, TILE_N * 2))
|
|
464
458
|
|
|
465
|
-
|
|
466
|
-
|
|
459
|
+
a = wp.array(input, dtype=float, requires_grad=True, device=device)
|
|
460
|
+
b = wp.zeros((2, 2), dtype=float, requires_grad=True, device=device)
|
|
467
461
|
|
|
468
462
|
with wp.Tape() as tape:
|
|
469
|
-
wp.
|
|
463
|
+
wp.launch(
|
|
464
|
+
test_tile_extract_kernel, dim=[2, 2, TILE_M, TILE_N], inputs=[a, b], block_dim=block_dim, device=device
|
|
465
|
+
)
|
|
470
466
|
|
|
471
|
-
|
|
467
|
+
# compute sum of each sub-block
|
|
468
|
+
sums = input.reshape(2, input.shape[0] // 2, 2, input.shape[1] // 2).sum(axis=(1, 3))
|
|
472
469
|
|
|
473
|
-
|
|
470
|
+
assert_np_equal(b.numpy(), sums)
|
|
471
|
+
|
|
472
|
+
b.grad.fill_(1.0)
|
|
474
473
|
|
|
475
474
|
tape.backward()
|
|
476
475
|
|
|
477
|
-
|
|
476
|
+
expected_grad = np.ones_like(input)
|
|
477
|
+
assert_np_equal(a.grad.numpy(), expected_grad)
|
|
478
|
+
|
|
479
|
+
|
|
480
|
+
@wp.kernel
|
|
481
|
+
def test_tile_extract_repeated_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
|
|
482
|
+
i, j, x, y = wp.tid()
|
|
483
|
+
|
|
484
|
+
tile = wp.tile_load(a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
|
|
485
|
+
|
|
486
|
+
# each thread extracts the first element of the sub-tile
|
|
487
|
+
# and accumulates the value onto the output
|
|
488
|
+
wp.atomic_add(b, i, j, wp.tile_extract(tile, 0, 0))
|
|
489
|
+
|
|
490
|
+
|
|
491
|
+
def test_tile_extract_repeated(test, device):
|
|
492
|
+
block_dim = 16
|
|
493
|
+
|
|
494
|
+
input = np.arange(TILE_M * TILE_N * 4).reshape((TILE_M * 2, TILE_N * 2))
|
|
495
|
+
|
|
496
|
+
a = wp.array(input, dtype=float, requires_grad=True, device=device)
|
|
497
|
+
b = wp.zeros((2, 2), dtype=float, requires_grad=True, device=device)
|
|
498
|
+
|
|
499
|
+
with wp.Tape() as tape:
|
|
500
|
+
wp.launch(
|
|
501
|
+
test_tile_extract_repeated_kernel,
|
|
502
|
+
dim=[2, 2, TILE_M, TILE_N],
|
|
503
|
+
inputs=[a, b],
|
|
504
|
+
block_dim=block_dim,
|
|
505
|
+
device=device,
|
|
506
|
+
)
|
|
507
|
+
|
|
508
|
+
# each thread adds the first element to the output
|
|
509
|
+
scale = TILE_M * TILE_N
|
|
510
|
+
sums = np.array([[input[0, 0], input[0, TILE_N]], [input[TILE_M, 0], input[TILE_M, TILE_N]]]) * scale
|
|
511
|
+
|
|
512
|
+
assert_np_equal(b.numpy(), sums)
|
|
513
|
+
|
|
514
|
+
b.grad.fill_(1.0)
|
|
515
|
+
|
|
516
|
+
tape.backward()
|
|
517
|
+
|
|
518
|
+
expected_grad = np.zeros_like(input)
|
|
519
|
+
expected_grad[0, 0] = scale
|
|
520
|
+
expected_grad[0, TILE_N] = scale
|
|
521
|
+
expected_grad[TILE_M, 0] = scale
|
|
522
|
+
expected_grad[TILE_M, TILE_N] = scale
|
|
523
|
+
|
|
524
|
+
assert_np_equal(a.grad.numpy(), expected_grad)
|
|
478
525
|
|
|
479
526
|
|
|
480
527
|
@wp.kernel
|
|
481
528
|
def test_tile_transpose_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
|
|
482
|
-
x = wp.tile_load(input,
|
|
529
|
+
x = wp.tile_load(input, shape=(TILE_M, TILE_N))
|
|
483
530
|
y = wp.tile_transpose(x)
|
|
484
531
|
|
|
485
|
-
wp.tile_store(output,
|
|
532
|
+
wp.tile_store(output, y)
|
|
486
533
|
|
|
487
534
|
|
|
488
535
|
def test_tile_transpose(test, device):
|
|
@@ -499,13 +546,13 @@ def test_tile_transpose(test, device):
|
|
|
499
546
|
def test_tile_transpose_matmul(test, device):
|
|
500
547
|
@wp.kernel
|
|
501
548
|
def test_tile_transpose_matmul_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
|
|
502
|
-
x = wp.tile_load(input,
|
|
549
|
+
x = wp.tile_load(input, shape=(TILE_M, TILE_N))
|
|
503
550
|
y = wp.tile_transpose(x)
|
|
504
551
|
|
|
505
|
-
z = wp.tile_zeros(dtype=float,
|
|
552
|
+
z = wp.tile_zeros(dtype=float, shape=(TILE_N, TILE_N))
|
|
506
553
|
wp.tile_matmul(y, x, z)
|
|
507
554
|
|
|
508
|
-
wp.tile_store(output,
|
|
555
|
+
wp.tile_store(output, z)
|
|
509
556
|
|
|
510
557
|
rng = np.random.default_rng(42)
|
|
511
558
|
input = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32), device=device)
|
|
@@ -520,13 +567,13 @@ def test_tile_transpose_matmul(test, device):
|
|
|
520
567
|
def test_tile_broadcast_add_kernel(
|
|
521
568
|
input_a: wp.array2d(dtype=float), input_b: wp.array(dtype=float), output: wp.array2d(dtype=float)
|
|
522
569
|
):
|
|
523
|
-
a = wp.tile_load(input_a,
|
|
524
|
-
b = wp.tile_load(input_b,
|
|
570
|
+
a = wp.tile_load(input_a, shape=(10, 10))
|
|
571
|
+
b = wp.tile_load(input_b, shape=10)
|
|
525
572
|
|
|
526
|
-
c = wp.tile_broadcast(b, 10, 10)
|
|
573
|
+
c = wp.tile_broadcast(b, shape=(10, 10))
|
|
527
574
|
d = a + c
|
|
528
575
|
|
|
529
|
-
wp.tile_store(output,
|
|
576
|
+
wp.tile_store(output, d)
|
|
530
577
|
|
|
531
578
|
|
|
532
579
|
def test_tile_broadcast_add(test, device):
|
|
@@ -544,13 +591,13 @@ def test_tile_broadcast_add(test, device):
|
|
|
544
591
|
|
|
545
592
|
@wp.kernel
|
|
546
593
|
def test_tile_broadcast_grad_kernel(a: wp.array(dtype=float), b: wp.array2d(dtype=float)):
|
|
547
|
-
x = wp.tile_load(a,
|
|
548
|
-
y = wp.tile_broadcast(x,
|
|
594
|
+
x = wp.tile_load(a, shape=5)
|
|
595
|
+
y = wp.tile_broadcast(x, shape=(5, 5))
|
|
549
596
|
|
|
550
|
-
w = wp.tile_ones(dtype=float,
|
|
597
|
+
w = wp.tile_ones(dtype=float, shape=(5, 5))
|
|
551
598
|
z = w + y
|
|
552
599
|
|
|
553
|
-
wp.tile_store(b,
|
|
600
|
+
wp.tile_store(b, z)
|
|
554
601
|
|
|
555
602
|
|
|
556
603
|
def test_tile_broadcast_grad(test, device):
|
|
@@ -567,153 +614,49 @@ def test_tile_broadcast_grad(test, device):
|
|
|
567
614
|
assert_np_equal(a.grad.numpy(), np.ones(5) * 5.0)
|
|
568
615
|
|
|
569
616
|
|
|
570
|
-
TILE_VIEW_M = 16
|
|
571
|
-
TILE_VIEW_N = 128
|
|
572
|
-
|
|
573
|
-
|
|
574
617
|
@wp.kernel
|
|
575
|
-
def
|
|
576
|
-
|
|
577
|
-
|
|
578
|
-
|
|
579
|
-
|
|
580
|
-
for i in range(TILE_VIEW_M):
|
|
581
|
-
# create a view on original array and store
|
|
582
|
-
row = a[i]
|
|
583
|
-
wp.tile_store(dst, i, 0, row)
|
|
584
|
-
|
|
585
|
-
|
|
586
|
-
def test_tile_view(test, device):
|
|
587
|
-
rng = np.random.default_rng(42)
|
|
588
|
-
|
|
589
|
-
a = wp.array(rng.random((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
|
|
590
|
-
b = wp.array(np.zeros((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
|
|
618
|
+
def tile_len_kernel(
|
|
619
|
+
a: wp.array(dtype=float, ndim=2),
|
|
620
|
+
out: wp.array(dtype=int),
|
|
621
|
+
):
|
|
622
|
+
x = wp.tile_load(a, shape=(TILE_M, TILE_N))
|
|
591
623
|
|
|
592
|
-
|
|
593
|
-
|
|
624
|
+
length = wp.static(len(x))
|
|
625
|
+
wp.expect_eq(wp.static(len(x)), TILE_M)
|
|
626
|
+
out[0] = wp.static(len(x))
|
|
594
627
|
|
|
595
|
-
assert_np_equal(b.numpy(), a.numpy())
|
|
596
628
|
|
|
597
|
-
|
|
598
|
-
|
|
629
|
+
def test_tile_len(test, device):
|
|
630
|
+
a = wp.zeros((TILE_M, TILE_N), dtype=float, device=device)
|
|
631
|
+
out = wp.empty(1, dtype=int, device=device)
|
|
632
|
+
wp.launch_tiled(
|
|
633
|
+
tile_len_kernel,
|
|
634
|
+
dim=(1,),
|
|
635
|
+
inputs=(a,),
|
|
636
|
+
outputs=(out,),
|
|
637
|
+
block_dim=32,
|
|
638
|
+
device=device,
|
|
639
|
+
)
|
|
599
640
|
|
|
600
|
-
|
|
641
|
+
test.assertEqual(out.numpy()[0], TILE_M)
|
|
601
642
|
|
|
602
643
|
|
|
603
644
|
@wp.kernel
|
|
604
|
-
def
|
|
605
|
-
#
|
|
606
|
-
a = wp.
|
|
607
|
-
|
|
608
|
-
|
|
609
|
-
# copy the source array row by row
|
|
610
|
-
for i in range(TILE_VIEW_M):
|
|
611
|
-
# create views onto source and dest rows
|
|
612
|
-
row_src = a[i]
|
|
613
|
-
row_dst = b[i]
|
|
614
|
-
|
|
615
|
-
# copy onto dest row
|
|
616
|
-
wp.tile_assign(row_dst, 0, 0, row_src)
|
|
617
|
-
|
|
618
|
-
wp.tile_store(dst, 0, 0, b)
|
|
619
|
-
|
|
620
|
-
|
|
621
|
-
def test_tile_assign(test, device):
|
|
622
|
-
rng = np.random.default_rng(42)
|
|
623
|
-
|
|
624
|
-
a = wp.array(rng.random((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
|
|
625
|
-
b = wp.array(np.zeros((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
|
|
626
|
-
|
|
627
|
-
with wp.Tape() as tape:
|
|
628
|
-
wp.launch_tiled(test_tile_assign_kernel, dim=[1], inputs=[a, b], block_dim=32, device=device)
|
|
629
|
-
|
|
630
|
-
assert_np_equal(b.numpy(), a.numpy())
|
|
631
|
-
|
|
632
|
-
b.grad = wp.ones_like(b, device=device)
|
|
633
|
-
tape.backward()
|
|
634
|
-
|
|
635
|
-
assert_np_equal(a.grad.numpy(), np.ones_like(a.numpy()))
|
|
636
|
-
|
|
637
|
-
|
|
638
|
-
# #-----------------------------------------
|
|
639
|
-
# # center of mass computation
|
|
640
|
-
|
|
641
|
-
# start = offset[i]
|
|
642
|
-
# end = offset[i+1]
|
|
643
|
-
|
|
644
|
-
# com = wp.tile_zeros(dtype=wp.vec3, M=1)
|
|
645
|
-
|
|
646
|
-
# # load chunks of indices
|
|
647
|
-
# for i in range(start, end, N):
|
|
648
|
-
|
|
649
|
-
# count = wp.min(N, end-i)
|
|
650
|
-
|
|
651
|
-
# idx = wp.tile_load(indices, i, N, max_col=count)
|
|
652
|
-
# p = wp.tile_load(points, idx, max_col=count)
|
|
645
|
+
def test_tile_print_kernel():
|
|
646
|
+
# shared tile
|
|
647
|
+
a = wp.tile_ones(shape=(4, 3), dtype=float, storage="shared")
|
|
648
|
+
# register tile
|
|
649
|
+
b = wp.tile_ones(shape=(4, 3), dtype=float)
|
|
653
650
|
|
|
654
|
-
|
|
651
|
+
print(a)
|
|
652
|
+
print(b)
|
|
655
653
|
|
|
656
654
|
|
|
657
|
-
|
|
655
|
+
def test_tile_print(test, device):
|
|
656
|
+
wp.launch_tiled(test_tile_print_kernel, dim=1, inputs=[], block_dim=64, device=device)
|
|
657
|
+
wp.synchronize()
|
|
658
658
|
|
|
659
659
|
|
|
660
|
-
# #-------------------------------------------
|
|
661
|
-
# # compute deformation gradient
|
|
662
|
-
|
|
663
|
-
# i =
|
|
664
|
-
# j =
|
|
665
|
-
# k =
|
|
666
|
-
# l =
|
|
667
|
-
|
|
668
|
-
# f = wp.tile(F) # generate a block size tile of feature vectors
|
|
669
|
-
|
|
670
|
-
# # layer 1
|
|
671
|
-
# w1 = wp.tile_load(weights)
|
|
672
|
-
# b1 = wp.tile_load(bias)
|
|
673
|
-
|
|
674
|
-
# z = wp.tile_matmul(w1, f) + b1
|
|
675
|
-
# z = wp.tile_map(relu, z)
|
|
676
|
-
|
|
677
|
-
# # layer 2
|
|
678
|
-
# w2 = wp.tile_load(weights)
|
|
679
|
-
# b2 = wp.tile_load(bias)
|
|
680
|
-
|
|
681
|
-
# z = wp.tile_matmul(w2, z) + b2
|
|
682
|
-
# z = wp.tile_map(relu, z)
|
|
683
|
-
|
|
684
|
-
# o = wp.untile(f)
|
|
685
|
-
|
|
686
|
-
|
|
687
|
-
# #----------------------------------
|
|
688
|
-
# # MLP with helper function for linear layers
|
|
689
|
-
# # where shape is only partially known
|
|
690
|
-
# # at compile time, and the other dims
|
|
691
|
-
# # are inferred from the input vector
|
|
692
|
-
|
|
693
|
-
# f = wp.tile(F)
|
|
694
|
-
|
|
695
|
-
# z = wp.tile_linear(weights1, bias1, f, hidden=16)
|
|
696
|
-
# z = wp.tile_map(relu, z)
|
|
697
|
-
|
|
698
|
-
# z = wp.tile_linear(weights2, bias2, f, hidden=8)
|
|
699
|
-
# z = wp.tile_map(relu, z)
|
|
700
|
-
|
|
701
|
-
# z = wp.tile_linear(weights3, bias3, f, hidden=4)
|
|
702
|
-
# z = wp.tile_map(relu, z)
|
|
703
|
-
|
|
704
|
-
# o = wp.untile(z)
|
|
705
|
-
|
|
706
|
-
|
|
707
|
-
# #----------------------------------
|
|
708
|
-
# # softmax
|
|
709
|
-
|
|
710
|
-
# def softmax(z: Any):
|
|
711
|
-
|
|
712
|
-
# e = wp.tile_map(wp.exp, z)
|
|
713
|
-
# s = wp.tile_sum(e, dim=0)
|
|
714
|
-
|
|
715
|
-
# return z/s[0]
|
|
716
|
-
|
|
717
660
|
devices = get_cuda_test_devices()
|
|
718
661
|
|
|
719
662
|
|
|
@@ -733,11 +676,11 @@ add_function_test(TestTile, "test_tile_operators", test_tile_operators, devices=
|
|
|
733
676
|
add_function_test(TestTile, "test_tile_sum", test_tile_sum, devices=devices)
|
|
734
677
|
add_function_test(TestTile, "test_tile_sum_launch", test_tile_sum_launch, devices=devices)
|
|
735
678
|
add_function_test(TestTile, "test_tile_extract", test_tile_extract, devices=devices)
|
|
679
|
+
add_function_test(TestTile, "test_tile_extract_repeated", test_tile_extract_repeated, devices=devices)
|
|
736
680
|
add_function_test(TestTile, "test_tile_broadcast_add", test_tile_broadcast_add, devices=devices)
|
|
737
681
|
add_function_test(TestTile, "test_tile_broadcast_grad", test_tile_broadcast_grad, devices=devices)
|
|
738
|
-
add_function_test(TestTile, "
|
|
739
|
-
add_function_test(TestTile, "
|
|
740
|
-
|
|
682
|
+
add_function_test(TestTile, "test_tile_len", test_tile_len, devices=devices)
|
|
683
|
+
add_function_test(TestTile, "test_tile_print", test_tile_print, devices=devices, check_output=False)
|
|
741
684
|
|
|
742
685
|
if __name__ == "__main__":
|
|
743
686
|
wp.clear_kernel_cache()
|