warp-lang 1.5.0__py3-none-macosx_10_13_universal2.whl → 1.6.0__py3-none-macosx_10_13_universal2.whl

This diff shows the changes between publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. More details are linked from the original diff page.

Files changed (132)
  1. warp/__init__.py +5 -0
  2. warp/autograd.py +414 -191
  3. warp/bin/libwarp-clang.dylib +0 -0
  4. warp/bin/libwarp.dylib +0 -0
  5. warp/build.py +40 -12
  6. warp/build_dll.py +13 -6
  7. warp/builtins.py +1124 -497
  8. warp/codegen.py +261 -136
  9. warp/config.py +1 -1
  10. warp/context.py +357 -119
  11. warp/examples/assets/square_cloth.usd +0 -0
  12. warp/examples/benchmarks/benchmark_gemm.py +27 -18
  13. warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
  14. warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
  15. warp/examples/core/example_torch.py +18 -34
  16. warp/examples/fem/example_apic_fluid.py +1 -0
  17. warp/examples/fem/example_mixed_elasticity.py +1 -1
  18. warp/examples/optim/example_bounce.py +1 -1
  19. warp/examples/optim/example_cloth_throw.py +1 -1
  20. warp/examples/optim/example_diffray.py +4 -15
  21. warp/examples/optim/example_drone.py +1 -1
  22. warp/examples/optim/example_softbody_properties.py +392 -0
  23. warp/examples/optim/example_trajectory.py +1 -3
  24. warp/examples/optim/example_walker.py +5 -0
  25. warp/examples/sim/example_cartpole.py +0 -2
  26. warp/examples/sim/example_cloth.py +3 -1
  27. warp/examples/sim/example_cloth_self_contact.py +260 -0
  28. warp/examples/sim/example_granular_collision_sdf.py +4 -5
  29. warp/examples/sim/example_jacobian_ik.py +0 -2
  30. warp/examples/sim/example_quadruped.py +5 -2
  31. warp/examples/tile/example_tile_cholesky.py +79 -0
  32. warp/examples/tile/example_tile_convolution.py +2 -2
  33. warp/examples/tile/example_tile_fft.py +2 -2
  34. warp/examples/tile/example_tile_filtering.py +3 -3
  35. warp/examples/tile/example_tile_matmul.py +4 -4
  36. warp/examples/tile/example_tile_mlp.py +12 -12
  37. warp/examples/tile/example_tile_nbody.py +180 -0
  38. warp/examples/tile/example_tile_walker.py +319 -0
  39. warp/fem/geometry/geometry.py +0 -2
  40. warp/math.py +147 -0
  41. warp/native/array.h +12 -0
  42. warp/native/builtin.h +0 -1
  43. warp/native/bvh.cpp +149 -70
  44. warp/native/bvh.cu +287 -68
  45. warp/native/bvh.h +195 -85
  46. warp/native/clang/clang.cpp +5 -1
  47. warp/native/coloring.cpp +5 -1
  48. warp/native/cuda_util.cpp +91 -53
  49. warp/native/cuda_util.h +5 -0
  50. warp/native/exports.h +40 -40
  51. warp/native/intersect.h +17 -0
  52. warp/native/mat.h +41 -0
  53. warp/native/mathdx.cpp +19 -0
  54. warp/native/mesh.cpp +25 -8
  55. warp/native/mesh.cu +153 -101
  56. warp/native/mesh.h +482 -403
  57. warp/native/quat.h +40 -0
  58. warp/native/solid_angle.h +7 -0
  59. warp/native/sort.cpp +85 -0
  60. warp/native/sort.cu +34 -0
  61. warp/native/sort.h +3 -1
  62. warp/native/spatial.h +11 -0
  63. warp/native/tile.h +1187 -669
  64. warp/native/tile_reduce.h +8 -6
  65. warp/native/vec.h +41 -0
  66. warp/native/warp.cpp +8 -1
  67. warp/native/warp.cu +263 -40
  68. warp/native/warp.h +19 -5
  69. warp/optim/linear.py +22 -4
  70. warp/render/render_opengl.py +130 -64
  71. warp/sim/__init__.py +6 -1
  72. warp/sim/collide.py +270 -26
  73. warp/sim/import_urdf.py +8 -8
  74. warp/sim/integrator_euler.py +25 -7
  75. warp/sim/integrator_featherstone.py +154 -35
  76. warp/sim/integrator_vbd.py +842 -40
  77. warp/sim/model.py +134 -72
  78. warp/sparse.py +1 -1
  79. warp/stubs.py +265 -132
  80. warp/tape.py +28 -30
  81. warp/tests/aux_test_module_unload.py +15 -0
  82. warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
  83. warp/tests/test_array.py +74 -0
  84. warp/tests/test_assert.py +242 -0
  85. warp/tests/test_codegen.py +14 -61
  86. warp/tests/test_collision.py +2 -2
  87. warp/tests/test_coloring.py +12 -2
  88. warp/tests/test_examples.py +12 -1
  89. warp/tests/test_func.py +21 -4
  90. warp/tests/test_grad_debug.py +87 -2
  91. warp/tests/test_hash_grid.py +1 -1
  92. warp/tests/test_ipc.py +116 -0
  93. warp/tests/test_lerp.py +13 -87
  94. warp/tests/test_mat.py +138 -167
  95. warp/tests/test_math.py +47 -1
  96. warp/tests/test_matmul.py +17 -16
  97. warp/tests/test_matmul_lite.py +10 -15
  98. warp/tests/test_mesh.py +84 -60
  99. warp/tests/test_mesh_query_aabb.py +165 -0
  100. warp/tests/test_mesh_query_point.py +328 -286
  101. warp/tests/test_mesh_query_ray.py +134 -121
  102. warp/tests/test_mlp.py +2 -2
  103. warp/tests/test_operators.py +43 -0
  104. warp/tests/test_overwrite.py +47 -2
  105. warp/tests/test_quat.py +77 -0
  106. warp/tests/test_reload.py +29 -0
  107. warp/tests/test_sim_grad_bounce_linear.py +204 -0
  108. warp/tests/test_smoothstep.py +17 -83
  109. warp/tests/test_static.py +19 -3
  110. warp/tests/test_tape.py +25 -0
  111. warp/tests/test_tile.py +178 -191
  112. warp/tests/test_tile_load.py +356 -0
  113. warp/tests/test_tile_mathdx.py +61 -8
  114. warp/tests/test_tile_mlp.py +17 -17
  115. warp/tests/test_tile_reduce.py +24 -18
  116. warp/tests/test_tile_shared_memory.py +66 -17
  117. warp/tests/test_tile_view.py +165 -0
  118. warp/tests/test_torch.py +35 -0
  119. warp/tests/test_utils.py +36 -24
  120. warp/tests/test_vec.py +110 -0
  121. warp/tests/unittest_suites.py +29 -4
  122. warp/tests/unittest_utils.py +30 -13
  123. warp/thirdparty/unittest_parallel.py +2 -2
  124. warp/types.py +411 -101
  125. warp/utils.py +10 -7
  126. {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/METADATA +92 -69
  127. {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/RECORD +130 -119
  128. {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/WHEEL +1 -1
  129. warp/examples/benchmarks/benchmark_tile.py +0 -179
  130. warp/native/tile_gemm.h +0 -341
  131. {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/LICENSE.md +0 -0
  132. {warp_lang-1.5.0.dist-info → warp_lang-1.6.0.dist-info}/top_level.txt +0 -0
warp/tests/test_tile.py CHANGED
@@ -27,8 +27,8 @@ def tile_copy_1d_kernel(A: wp.array(dtype=float), B: wp.array(dtype=float)):
27
27
  # tile index
28
28
  i = wp.tid()
29
29
 
30
- a = wp.tile_load(A, i, n=TILE_N)
31
- wp.tile_store(B, i, a)
30
+ a = wp.tile_load(A, shape=TILE_N, offset=i * TILE_N)
31
+ wp.tile_store(B, a, offset=i * TILE_N)
32
32
 
33
33
 
34
34
  def test_tile_copy_1d(test, device):
@@ -66,8 +66,8 @@ def tile_copy_2d_kernel(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)):
66
66
  # tile index
67
67
  i, j = wp.tid()
68
68
 
69
- a = wp.tile_load(A, i, j, m=TILE_M, n=TILE_N)
70
- wp.tile_store(B, i, j, a)
69
+ a = wp.tile_load(A, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
70
+ wp.tile_store(B, a, offset=(i * TILE_M, j * TILE_N))
71
71
 
72
72
 
73
73
  def test_tile_copy_2d(test, device):
@@ -111,11 +111,11 @@ def tile_unary_map(input: wp.array2d(dtype=float), output: wp.array2d(dtype=floa
111
111
  # tile index
112
112
  i, j = wp.tid()
113
113
 
114
- a = wp.tile_load(input, i, j, m=TILE_M, n=TILE_N)
114
+ a = wp.tile_load(input, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
115
115
 
116
116
  sa = wp.tile_map(wp.sin, a)
117
117
 
118
- wp.tile_store(output, i, j, sa)
118
+ wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
119
119
 
120
120
 
121
121
  def test_tile_unary_map(test, device):
@@ -163,12 +163,12 @@ def tile_binary_map(
163
163
  # tile index
164
164
  i, j = wp.tid()
165
165
 
166
- a = wp.tile_load(input_a, i, j, m=TILE_M, n=TILE_N)
167
- b = wp.tile_load(input_b, i, j, m=TILE_M, n=TILE_N)
166
+ a = wp.tile_load(input_a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
167
+ b = wp.tile_load(input_b, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
168
168
 
169
169
  sa = wp.tile_map(binary_func, a, b)
170
170
 
171
- wp.tile_store(output, i, j, sa)
171
+ wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
172
172
 
173
173
 
174
174
  def test_tile_binary_map(test, device):
@@ -215,14 +215,14 @@ def test_tile_grouped_gemm(test, device):
215
215
  # output tile index
216
216
  i = wp.tid()
217
217
 
218
- a = wp.tile_load(A[i], 0, 0, m=TILE_M, n=TILE_K)
219
- b = wp.tile_load(B[i], 0, 0, m=TILE_K, n=TILE_N)
218
+ a = wp.tile_load(A[i], shape=(TILE_M, TILE_K))
219
+ b = wp.tile_load(B[i], shape=(TILE_K, TILE_N))
220
220
 
221
- sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
221
+ sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float32)
222
222
 
223
223
  wp.tile_matmul(a, b, sum)
224
224
 
225
- wp.tile_store(C[i], 0, 0, sum)
225
+ wp.tile_store(C[i], sum)
226
226
 
227
227
  batch_count = 56
228
228
 
@@ -245,7 +245,7 @@ def test_tile_grouped_gemm(test, device):
245
245
  )
246
246
 
247
247
  # TODO: 32 mismatched elements
248
- assert_np_equal(C_wp.numpy(), C)
248
+ assert_np_equal(C_wp.numpy(), C, 1e-6)
249
249
 
250
250
 
251
251
  @unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support")
@@ -255,7 +255,7 @@ def test_tile_gemm(test, device):
255
255
  # output tile index
256
256
  i, j = wp.tid()
257
257
 
258
- sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
258
+ sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float32)
259
259
 
260
260
  M = A.shape[0]
261
261
  N = B.shape[1]
@@ -264,13 +264,13 @@ def test_tile_gemm(test, device):
264
264
  count = int(K / TILE_K)
265
265
 
266
266
  for k in range(0, count):
267
- a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
268
- b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
267
+ a = wp.tile_load(A, shape=(TILE_M, TILE_K), offset=(i * TILE_M, k * TILE_K))
268
+ b = wp.tile_load(B, shape=(TILE_K, TILE_N), offset=(k * TILE_K, j * TILE_N))
269
269
 
270
270
  # sum += a*b
271
271
  wp.tile_matmul(a, b, sum)
272
272
 
273
- wp.tile_store(C, i, j, sum)
273
+ wp.tile_store(C, sum, offset=(i * TILE_M, j * TILE_N))
274
274
 
275
275
  M = TILE_M * 7
276
276
  K = TILE_K * 6
@@ -309,7 +309,7 @@ def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=floa
309
309
  # output tile index
310
310
  i = wp.tid()
311
311
 
312
- a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N)
312
+ a = wp.tile_load(input[i], shape=(TILE_M, TILE_N))
313
313
 
314
314
  # neg
315
315
  b = -a
@@ -323,7 +323,7 @@ def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=floa
323
323
  # add tiles
324
324
  e = a + d
325
325
 
326
- wp.tile_store(output[i], 0, 0, e)
326
+ wp.tile_store(output[i], e)
327
327
 
328
328
 
329
329
  def test_tile_operators(test, device):
@@ -358,10 +358,10 @@ def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float
358
358
  # output tile index
359
359
  i = wp.tid()
360
360
 
361
- a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N)
361
+ a = wp.tile_load(input[i], shape=(TILE_M, TILE_N))
362
362
  s = wp.tile_sum(a) * 0.5
363
363
 
364
- wp.tile_store(output, i, s)
364
+ wp.tile_store(output, s, offset=i)
365
365
 
366
366
 
367
367
  def test_tile_sum(test, device):
@@ -398,48 +398,138 @@ def test_tile_sum(test, device):
398
398
  assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5)
399
399
 
400
400
 
401
+ def test_tile_sum_launch(test, device):
402
+ batch_count = 56
403
+
404
+ M = TILE_M
405
+ N = TILE_N
406
+
407
+ rng = np.random.default_rng(42)
408
+ input = rng.random((batch_count, M, N), dtype=np.float32)
409
+
410
+ input_wp = wp.array(input, requires_grad=True, device=device)
411
+ output_wp = wp.zeros(batch_count, requires_grad=True, device=device)
412
+
413
+ cmd = wp.launch_tiled(
414
+ tile_sum_kernel,
415
+ dim=[batch_count],
416
+ inputs=[input_wp, output_wp],
417
+ block_dim=TILE_DIM,
418
+ device=device,
419
+ record_cmd=True,
420
+ )
421
+ cmd.launch()
422
+
423
+ sum_wp = output_wp.numpy()
424
+
425
+ for i in range(batch_count):
426
+ sum_np = np.sum(input[i]) * 0.5
427
+ test.assertAlmostEqual(sum_wp[i], sum_np, places=5)
428
+
429
+ output_wp.grad.fill_(1.0)
430
+
431
+ wp.launch_tiled(
432
+ tile_sum_kernel,
433
+ dim=[batch_count],
434
+ inputs=[input_wp, output_wp],
435
+ adj_inputs=[input_wp.grad, output_wp.grad],
436
+ block_dim=TILE_DIM,
437
+ device=device,
438
+ adjoint=True,
439
+ )
440
+
441
+ assert_np_equal(input_wp.grad.numpy(), np.ones_like(input) * 0.5)
442
+
443
+
401
444
  @wp.kernel
402
- def tile_extract_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
403
- # output tile index
404
- i = wp.tid()
445
+ def test_tile_extract_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
446
+ i, j, x, y = wp.tid()
405
447
 
406
- t = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N)
448
+ tile = wp.tile_load(a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
407
449
 
408
- # perform a scalar copy, extracting each
409
- # tile element individually
410
- for i in range(TILE_M):
411
- for j in range(TILE_N):
412
- output[i, j] = t[i, j]
450
+ # compute sum of array sub tile
451
+ wp.atomic_add(b, i, j, wp.tile_extract(tile, x, y))
413
452
 
414
453
 
415
454
  def test_tile_extract(test, device):
416
- M = TILE_M
417
- N = TILE_N
455
+ block_dim = 16
418
456
 
419
- rng = np.random.default_rng(42)
420
- input = rng.random((M, N), dtype=np.float32)
457
+ input = np.arange(TILE_M * TILE_N * 4).reshape((TILE_M * 2, TILE_N * 2))
421
458
 
422
- input_wp = wp.array(input, requires_grad=True, device=device)
423
- output_wp = wp.zeros_like(input_wp, requires_grad=True, device=device)
459
+ a = wp.array(input, dtype=float, requires_grad=True, device=device)
460
+ b = wp.zeros((2, 2), dtype=float, requires_grad=True, device=device)
424
461
 
425
462
  with wp.Tape() as tape:
426
- wp.launch_tiled(tile_extract_kernel, dim=[1], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device)
463
+ wp.launch(
464
+ test_tile_extract_kernel, dim=[2, 2, TILE_M, TILE_N], inputs=[a, b], block_dim=block_dim, device=device
465
+ )
427
466
 
428
- assert_array_equal(output_wp, input_wp)
467
+ # compute sum of each sub-block
468
+ sums = input.reshape(2, input.shape[0] // 2, 2, input.shape[1] // 2).sum(axis=(1, 3))
429
469
 
430
- output_wp.grad.fill_(1.0)
470
+ assert_np_equal(b.numpy(), sums)
471
+
472
+ b.grad.fill_(1.0)
431
473
 
432
474
  tape.backward()
433
475
 
434
- assert_np_equal(input_wp.grad.numpy(), np.ones_like(input))
476
+ expected_grad = np.ones_like(input)
477
+ assert_np_equal(a.grad.numpy(), expected_grad)
478
+
479
+
480
+ @wp.kernel
481
+ def test_tile_extract_repeated_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
482
+ i, j, x, y = wp.tid()
483
+
484
+ tile = wp.tile_load(a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
485
+
486
+ # each thread extracts the first element of the sub-tile
487
+ # and accumulates the value onto the output
488
+ wp.atomic_add(b, i, j, wp.tile_extract(tile, 0, 0))
489
+
490
+
491
+ def test_tile_extract_repeated(test, device):
492
+ block_dim = 16
493
+
494
+ input = np.arange(TILE_M * TILE_N * 4).reshape((TILE_M * 2, TILE_N * 2))
495
+
496
+ a = wp.array(input, dtype=float, requires_grad=True, device=device)
497
+ b = wp.zeros((2, 2), dtype=float, requires_grad=True, device=device)
498
+
499
+ with wp.Tape() as tape:
500
+ wp.launch(
501
+ test_tile_extract_repeated_kernel,
502
+ dim=[2, 2, TILE_M, TILE_N],
503
+ inputs=[a, b],
504
+ block_dim=block_dim,
505
+ device=device,
506
+ )
507
+
508
+ # each thread adds the first element to the output
509
+ scale = TILE_M * TILE_N
510
+ sums = np.array([[input[0, 0], input[0, TILE_N]], [input[TILE_M, 0], input[TILE_M, TILE_N]]]) * scale
511
+
512
+ assert_np_equal(b.numpy(), sums)
513
+
514
+ b.grad.fill_(1.0)
515
+
516
+ tape.backward()
517
+
518
+ expected_grad = np.zeros_like(input)
519
+ expected_grad[0, 0] = scale
520
+ expected_grad[0, TILE_N] = scale
521
+ expected_grad[TILE_M, 0] = scale
522
+ expected_grad[TILE_M, TILE_N] = scale
523
+
524
+ assert_np_equal(a.grad.numpy(), expected_grad)
435
525
 
436
526
 
437
527
  @wp.kernel
438
528
  def test_tile_transpose_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
439
- x = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N)
529
+ x = wp.tile_load(input, shape=(TILE_M, TILE_N))
440
530
  y = wp.tile_transpose(x)
441
531
 
442
- wp.tile_store(output, 0, 0, y)
532
+ wp.tile_store(output, y)
443
533
 
444
534
 
445
535
  def test_tile_transpose(test, device):
@@ -456,13 +546,13 @@ def test_tile_transpose(test, device):
456
546
  def test_tile_transpose_matmul(test, device):
457
547
  @wp.kernel
458
548
  def test_tile_transpose_matmul_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
459
- x = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N)
549
+ x = wp.tile_load(input, shape=(TILE_M, TILE_N))
460
550
  y = wp.tile_transpose(x)
461
551
 
462
- z = wp.tile_zeros(dtype=float, m=TILE_N, n=TILE_N)
552
+ z = wp.tile_zeros(dtype=float, shape=(TILE_N, TILE_N))
463
553
  wp.tile_matmul(y, x, z)
464
554
 
465
- wp.tile_store(output, 0, 0, z)
555
+ wp.tile_store(output, z)
466
556
 
467
557
  rng = np.random.default_rng(42)
468
558
  input = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32), device=device)
@@ -477,13 +567,13 @@ def test_tile_transpose_matmul(test, device):
477
567
  def test_tile_broadcast_add_kernel(
478
568
  input_a: wp.array2d(dtype=float), input_b: wp.array(dtype=float), output: wp.array2d(dtype=float)
479
569
  ):
480
- a = wp.tile_load(input_a, 0, 0, m=10, n=10)
481
- b = wp.tile_load(input_b, 0, n=10)
570
+ a = wp.tile_load(input_a, shape=(10, 10))
571
+ b = wp.tile_load(input_b, shape=10)
482
572
 
483
- c = wp.tile_broadcast(b, 10, 10)
573
+ c = wp.tile_broadcast(b, shape=(10, 10))
484
574
  d = a + c
485
575
 
486
- wp.tile_store(output, 0, 0, d)
576
+ wp.tile_store(output, d)
487
577
 
488
578
 
489
579
  def test_tile_broadcast_add(test, device):
@@ -501,13 +591,13 @@ def test_tile_broadcast_add(test, device):
501
591
 
502
592
  @wp.kernel
503
593
  def test_tile_broadcast_grad_kernel(a: wp.array(dtype=float), b: wp.array2d(dtype=float)):
504
- x = wp.tile_load(a, i=0, n=5)
505
- y = wp.tile_broadcast(x, m=5, n=5)
594
+ x = wp.tile_load(a, shape=5)
595
+ y = wp.tile_broadcast(x, shape=(5, 5))
506
596
 
507
- w = wp.tile_ones(dtype=float, m=5, n=5)
597
+ w = wp.tile_ones(dtype=float, shape=(5, 5))
508
598
  z = w + y
509
599
 
510
- wp.tile_store(b, 0, 0, z)
600
+ wp.tile_store(b, z)
511
601
 
512
602
 
513
603
  def test_tile_broadcast_grad(test, device):
@@ -524,152 +614,48 @@ def test_tile_broadcast_grad(test, device):
524
614
  assert_np_equal(a.grad.numpy(), np.ones(5) * 5.0)
525
615
 
526
616
 
527
- TILE_VIEW_M = 16
528
- TILE_VIEW_N = 128
529
-
530
-
531
617
  @wp.kernel
532
- def test_tile_view_kernel(src: wp.array2d(dtype=float), dst: wp.array2d(dtype=float)):
533
- # load whole source into local memory
534
- a = wp.tile_load(src, 0, 0, TILE_VIEW_M, TILE_VIEW_N)
535
-
536
- # copy the source array row by row
537
- for i in range(TILE_VIEW_M):
538
- # create a view on original array and store
539
- row = a[i]
540
- wp.tile_store(dst, i, 0, row)
541
-
542
-
543
- def test_tile_view(test, device):
544
- rng = np.random.default_rng(42)
545
-
546
- a = wp.array(rng.random((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
547
- b = wp.array(np.zeros((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
618
+ def tile_len_kernel(
619
+ a: wp.array(dtype=float, ndim=2),
620
+ out: wp.array(dtype=int),
621
+ ):
622
+ x = wp.tile_load(a, shape=(TILE_M, TILE_N))
548
623
 
549
- with wp.Tape() as tape:
550
- wp.launch_tiled(test_tile_view_kernel, dim=[1], inputs=[a, b], block_dim=32, device=device)
624
+ length = wp.static(len(x))
625
+ wp.expect_eq(wp.static(len(x)), TILE_M)
626
+ out[0] = wp.static(len(x))
551
627
 
552
- assert_np_equal(b.numpy(), a.numpy())
553
628
 
554
- b.grad = wp.ones_like(b, device=device)
555
- tape.backward()
629
+ def test_tile_len(test, device):
630
+ a = wp.zeros((TILE_M, TILE_N), dtype=float, device=device)
631
+ out = wp.empty(1, dtype=int, device=device)
632
+ wp.launch_tiled(
633
+ tile_len_kernel,
634
+ dim=(1,),
635
+ inputs=(a,),
636
+ outputs=(out,),
637
+ block_dim=32,
638
+ device=device,
639
+ )
556
640
 
557
- assert_np_equal(a.grad.numpy(), np.ones_like(a.numpy()))
641
+ test.assertEqual(out.numpy()[0], TILE_M)
558
642
 
559
643
 
560
644
  @wp.kernel
561
- def test_tile_assign_kernel(src: wp.array2d(dtype=float), dst: wp.array2d(dtype=float)):
562
- # load whole source into local memory
563
- a = wp.tile_load(src, 0, 0, m=TILE_VIEW_M, n=TILE_VIEW_N)
564
- b = wp.tile_zeros(dtype=float, m=TILE_VIEW_M, n=TILE_VIEW_N)
565
-
566
- # copy the source array row by row
567
- for i in range(TILE_VIEW_M):
568
- # create views onto source and dest rows
569
- row_src = a[i]
570
- row_dst = b[i]
571
-
572
- # copy onto dest row
573
- wp.tile_assign(row_dst, 0, 0, row_src)
574
-
575
- wp.tile_store(dst, 0, 0, b)
576
-
577
-
578
- def test_tile_assign(test, device):
579
- rng = np.random.default_rng(42)
580
-
581
- a = wp.array(rng.random((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
582
- b = wp.array(np.zeros((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
583
-
584
- with wp.Tape() as tape:
585
- wp.launch_tiled(test_tile_assign_kernel, dim=[1], inputs=[a, b], block_dim=32, device=device)
586
-
587
- assert_np_equal(b.numpy(), a.numpy())
588
-
589
- b.grad = wp.ones_like(b, device=device)
590
- tape.backward()
591
-
592
- assert_np_equal(a.grad.numpy(), np.ones_like(a.numpy()))
593
-
594
-
595
- # #-----------------------------------------
596
- # # center of mass computation
645
+ def test_tile_print_kernel():
646
+ # shared tile
647
+ a = wp.tile_ones(shape=(4, 3), dtype=float, storage="shared")
648
+ # register tile
649
+ b = wp.tile_ones(shape=(4, 3), dtype=float)
597
650
 
598
- # start = offset[i]
599
- # end = offset[i+1]
651
+ print(a)
652
+ print(b)
600
653
 
601
- # com = wp.tile_zeros(dtype=wp.vec3, M=1)
602
654
 
603
- # # load chunks of indices
604
- # for i in range(start, end, N):
655
+ def test_tile_print(test, device):
656
+ wp.launch_tiled(test_tile_print_kernel, dim=1, inputs=[], block_dim=64, device=device)
657
+ wp.synchronize()
605
658
 
606
- # count = wp.min(N, end-i)
607
-
608
- # idx = wp.tile_load(indices, i, N, max_col=count)
609
- # p = wp.tile_load(points, idx, max_col=count)
610
-
611
- # com += wp.tile_sum(p)
612
-
613
-
614
- # wp.tile_store(out[i], com)
615
-
616
-
617
- # #-------------------------------------------
618
- # # compute deformation gradient
619
-
620
- # i =
621
- # j =
622
- # k =
623
- # l =
624
-
625
- # f = wp.tile(F) # generate a block size tile of feature vectors
626
-
627
- # # layer 1
628
- # w1 = wp.tile_load(weights)
629
- # b1 = wp.tile_load(bias)
630
-
631
- # z = wp.tile_matmul(w1, f) + b1
632
- # z = wp.tile_map(relu, z)
633
-
634
- # # layer 2
635
- # w2 = wp.tile_load(weights)
636
- # b2 = wp.tile_load(bias)
637
-
638
- # z = wp.tile_matmul(w2, z) + b2
639
- # z = wp.tile_map(relu, z)
640
-
641
- # o = wp.untile(f)
642
-
643
-
644
- # #----------------------------------
645
- # # MLP with helper function for linear layers
646
- # # where shape is only partially known
647
- # # at compile time, and the other dims
648
- # # are inferred from the input vector
649
-
650
- # f = wp.tile(F)
651
-
652
- # z = wp.tile_linear(weights1, bias1, f, hidden=16)
653
- # z = wp.tile_map(relu, z)
654
-
655
- # z = wp.tile_linear(weights2, bias2, f, hidden=8)
656
- # z = wp.tile_map(relu, z)
657
-
658
- # z = wp.tile_linear(weights3, bias3, f, hidden=4)
659
- # z = wp.tile_map(relu, z)
660
-
661
- # o = wp.untile(z)
662
-
663
-
664
- # #----------------------------------
665
- # # softmax
666
-
667
- # def softmax(z: Any):
668
-
669
- # e = wp.tile_map(wp.exp, z)
670
- # s = wp.tile_sum(e, dim=0)
671
-
672
- # return z/s[0]
673
659
 
674
660
  devices = get_cuda_test_devices()
675
661
 
@@ -688,12 +674,13 @@ add_function_test(TestTile, "test_tile_transpose", test_tile_transpose, devices=
688
674
  add_function_test(TestTile, "test_tile_transpose_matmul", test_tile_transpose_matmul, devices=devices)
689
675
  add_function_test(TestTile, "test_tile_operators", test_tile_operators, devices=devices)
690
676
  add_function_test(TestTile, "test_tile_sum", test_tile_sum, devices=devices)
677
+ add_function_test(TestTile, "test_tile_sum_launch", test_tile_sum_launch, devices=devices)
691
678
  add_function_test(TestTile, "test_tile_extract", test_tile_extract, devices=devices)
679
+ add_function_test(TestTile, "test_tile_extract_repeated", test_tile_extract_repeated, devices=devices)
692
680
  add_function_test(TestTile, "test_tile_broadcast_add", test_tile_broadcast_add, devices=devices)
693
681
  add_function_test(TestTile, "test_tile_broadcast_grad", test_tile_broadcast_grad, devices=devices)
694
- add_function_test(TestTile, "test_tile_view", test_tile_view, devices=devices)
695
- add_function_test(TestTile, "test_tile_assign", test_tile_assign, devices=devices)
696
-
682
+ add_function_test(TestTile, "test_tile_len", test_tile_len, devices=devices)
683
+ add_function_test(TestTile, "test_tile_print", test_tile_print, devices=devices, check_output=False)
697
684
 
698
685
  if __name__ == "__main__":
699
686
  wp.clear_kernel_cache()