warp-lang 1.5.1__py3-none-manylinux2014_x86_64.whl → 1.6.0__py3-none-manylinux2014_x86_64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (123) hide show
  1. warp/__init__.py +5 -0
  2. warp/autograd.py +414 -191
  3. warp/bin/warp-clang.so +0 -0
  4. warp/bin/warp.so +0 -0
  5. warp/build.py +40 -12
  6. warp/build_dll.py +13 -6
  7. warp/builtins.py +1076 -480
  8. warp/codegen.py +240 -119
  9. warp/config.py +1 -1
  10. warp/context.py +298 -84
  11. warp/examples/assets/square_cloth.usd +0 -0
  12. warp/examples/benchmarks/benchmark_gemm.py +27 -18
  13. warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
  14. warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
  15. warp/examples/core/example_torch.py +18 -34
  16. warp/examples/fem/example_apic_fluid.py +1 -0
  17. warp/examples/fem/example_mixed_elasticity.py +1 -1
  18. warp/examples/optim/example_bounce.py +1 -1
  19. warp/examples/optim/example_cloth_throw.py +1 -1
  20. warp/examples/optim/example_diffray.py +4 -15
  21. warp/examples/optim/example_drone.py +1 -1
  22. warp/examples/optim/example_softbody_properties.py +392 -0
  23. warp/examples/optim/example_trajectory.py +1 -3
  24. warp/examples/optim/example_walker.py +5 -0
  25. warp/examples/sim/example_cartpole.py +0 -2
  26. warp/examples/sim/example_cloth_self_contact.py +260 -0
  27. warp/examples/sim/example_granular_collision_sdf.py +4 -5
  28. warp/examples/sim/example_jacobian_ik.py +0 -2
  29. warp/examples/sim/example_quadruped.py +5 -2
  30. warp/examples/tile/example_tile_cholesky.py +79 -0
  31. warp/examples/tile/example_tile_convolution.py +2 -2
  32. warp/examples/tile/example_tile_fft.py +2 -2
  33. warp/examples/tile/example_tile_filtering.py +3 -3
  34. warp/examples/tile/example_tile_matmul.py +4 -4
  35. warp/examples/tile/example_tile_mlp.py +12 -12
  36. warp/examples/tile/example_tile_nbody.py +180 -0
  37. warp/examples/tile/example_tile_walker.py +319 -0
  38. warp/math.py +147 -0
  39. warp/native/array.h +12 -0
  40. warp/native/builtin.h +0 -1
  41. warp/native/bvh.cpp +149 -70
  42. warp/native/bvh.cu +287 -68
  43. warp/native/bvh.h +195 -85
  44. warp/native/clang/clang.cpp +5 -1
  45. warp/native/cuda_util.cpp +35 -0
  46. warp/native/cuda_util.h +5 -0
  47. warp/native/exports.h +40 -40
  48. warp/native/intersect.h +17 -0
  49. warp/native/mat.h +41 -0
  50. warp/native/mathdx.cpp +19 -0
  51. warp/native/mesh.cpp +25 -8
  52. warp/native/mesh.cu +153 -101
  53. warp/native/mesh.h +482 -403
  54. warp/native/quat.h +40 -0
  55. warp/native/solid_angle.h +7 -0
  56. warp/native/sort.cpp +85 -0
  57. warp/native/sort.cu +34 -0
  58. warp/native/sort.h +3 -1
  59. warp/native/spatial.h +11 -0
  60. warp/native/tile.h +1185 -664
  61. warp/native/tile_reduce.h +8 -6
  62. warp/native/vec.h +41 -0
  63. warp/native/warp.cpp +8 -1
  64. warp/native/warp.cu +263 -40
  65. warp/native/warp.h +19 -5
  66. warp/optim/linear.py +22 -4
  67. warp/render/render_opengl.py +124 -59
  68. warp/sim/__init__.py +6 -1
  69. warp/sim/collide.py +270 -26
  70. warp/sim/integrator_euler.py +25 -7
  71. warp/sim/integrator_featherstone.py +154 -35
  72. warp/sim/integrator_vbd.py +842 -40
  73. warp/sim/model.py +111 -53
  74. warp/stubs.py +248 -115
  75. warp/tape.py +28 -30
  76. warp/tests/aux_test_module_unload.py +15 -0
  77. warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
  78. warp/tests/test_array.py +74 -0
  79. warp/tests/test_assert.py +242 -0
  80. warp/tests/test_codegen.py +14 -61
  81. warp/tests/test_collision.py +2 -2
  82. warp/tests/test_examples.py +9 -0
  83. warp/tests/test_grad_debug.py +87 -2
  84. warp/tests/test_hash_grid.py +1 -1
  85. warp/tests/test_ipc.py +116 -0
  86. warp/tests/test_mat.py +138 -167
  87. warp/tests/test_math.py +47 -1
  88. warp/tests/test_matmul.py +11 -7
  89. warp/tests/test_matmul_lite.py +4 -4
  90. warp/tests/test_mesh.py +84 -60
  91. warp/tests/test_mesh_query_aabb.py +165 -0
  92. warp/tests/test_mesh_query_point.py +328 -286
  93. warp/tests/test_mesh_query_ray.py +134 -121
  94. warp/tests/test_mlp.py +2 -2
  95. warp/tests/test_operators.py +43 -0
  96. warp/tests/test_overwrite.py +2 -2
  97. warp/tests/test_quat.py +77 -0
  98. warp/tests/test_reload.py +29 -0
  99. warp/tests/test_sim_grad_bounce_linear.py +204 -0
  100. warp/tests/test_static.py +16 -0
  101. warp/tests/test_tape.py +25 -0
  102. warp/tests/test_tile.py +134 -191
  103. warp/tests/test_tile_load.py +356 -0
  104. warp/tests/test_tile_mathdx.py +61 -8
  105. warp/tests/test_tile_mlp.py +17 -17
  106. warp/tests/test_tile_reduce.py +24 -18
  107. warp/tests/test_tile_shared_memory.py +66 -17
  108. warp/tests/test_tile_view.py +165 -0
  109. warp/tests/test_torch.py +35 -0
  110. warp/tests/test_utils.py +36 -24
  111. warp/tests/test_vec.py +110 -0
  112. warp/tests/unittest_suites.py +29 -4
  113. warp/tests/unittest_utils.py +30 -11
  114. warp/thirdparty/unittest_parallel.py +2 -2
  115. warp/types.py +409 -99
  116. warp/utils.py +9 -5
  117. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/METADATA +68 -44
  118. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/RECORD +121 -110
  119. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/WHEEL +1 -1
  120. warp/examples/benchmarks/benchmark_tile.py +0 -179
  121. warp/native/tile_gemm.h +0 -341
  122. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/LICENSE.md +0 -0
  123. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/top_level.txt +0 -0
warp/tests/test_tile.py CHANGED
@@ -27,8 +27,8 @@ def tile_copy_1d_kernel(A: wp.array(dtype=float), B: wp.array(dtype=float)):
27
27
  # tile index
28
28
  i = wp.tid()
29
29
 
30
- a = wp.tile_load(A, i, n=TILE_N)
31
- wp.tile_store(B, i, a)
30
+ a = wp.tile_load(A, shape=TILE_N, offset=i * TILE_N)
31
+ wp.tile_store(B, a, offset=i * TILE_N)
32
32
 
33
33
 
34
34
  def test_tile_copy_1d(test, device):
@@ -66,8 +66,8 @@ def tile_copy_2d_kernel(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)):
66
66
  # tile index
67
67
  i, j = wp.tid()
68
68
 
69
- a = wp.tile_load(A, i, j, m=TILE_M, n=TILE_N)
70
- wp.tile_store(B, i, j, a)
69
+ a = wp.tile_load(A, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
70
+ wp.tile_store(B, a, offset=(i * TILE_M, j * TILE_N))
71
71
 
72
72
 
73
73
  def test_tile_copy_2d(test, device):
@@ -111,11 +111,11 @@ def tile_unary_map(input: wp.array2d(dtype=float), output: wp.array2d(dtype=floa
111
111
  # tile index
112
112
  i, j = wp.tid()
113
113
 
114
- a = wp.tile_load(input, i, j, m=TILE_M, n=TILE_N)
114
+ a = wp.tile_load(input, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
115
115
 
116
116
  sa = wp.tile_map(wp.sin, a)
117
117
 
118
- wp.tile_store(output, i, j, sa)
118
+ wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
119
119
 
120
120
 
121
121
  def test_tile_unary_map(test, device):
@@ -163,12 +163,12 @@ def tile_binary_map(
163
163
  # tile index
164
164
  i, j = wp.tid()
165
165
 
166
- a = wp.tile_load(input_a, i, j, m=TILE_M, n=TILE_N)
167
- b = wp.tile_load(input_b, i, j, m=TILE_M, n=TILE_N)
166
+ a = wp.tile_load(input_a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
167
+ b = wp.tile_load(input_b, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
168
168
 
169
169
  sa = wp.tile_map(binary_func, a, b)
170
170
 
171
- wp.tile_store(output, i, j, sa)
171
+ wp.tile_store(output, sa, offset=(i * TILE_M, j * TILE_N))
172
172
 
173
173
 
174
174
  def test_tile_binary_map(test, device):
@@ -215,14 +215,14 @@ def test_tile_grouped_gemm(test, device):
215
215
  # output tile index
216
216
  i = wp.tid()
217
217
 
218
- a = wp.tile_load(A[i], 0, 0, m=TILE_M, n=TILE_K)
219
- b = wp.tile_load(B[i], 0, 0, m=TILE_K, n=TILE_N)
218
+ a = wp.tile_load(A[i], shape=(TILE_M, TILE_K))
219
+ b = wp.tile_load(B[i], shape=(TILE_K, TILE_N))
220
220
 
221
- sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
221
+ sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float32)
222
222
 
223
223
  wp.tile_matmul(a, b, sum)
224
224
 
225
- wp.tile_store(C[i], 0, 0, sum)
225
+ wp.tile_store(C[i], sum)
226
226
 
227
227
  batch_count = 56
228
228
 
@@ -245,7 +245,7 @@ def test_tile_grouped_gemm(test, device):
245
245
  )
246
246
 
247
247
  # TODO: 32 mismatched elements
248
- assert_np_equal(C_wp.numpy(), C)
248
+ assert_np_equal(C_wp.numpy(), C, 1e-6)
249
249
 
250
250
 
251
251
  @unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support")
@@ -255,7 +255,7 @@ def test_tile_gemm(test, device):
255
255
  # output tile index
256
256
  i, j = wp.tid()
257
257
 
258
- sum = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float32)
258
+ sum = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float32)
259
259
 
260
260
  M = A.shape[0]
261
261
  N = B.shape[1]
@@ -264,13 +264,13 @@ def test_tile_gemm(test, device):
264
264
  count = int(K / TILE_K)
265
265
 
266
266
  for k in range(0, count):
267
- a = wp.tile_load(A, i, k, m=TILE_M, n=TILE_K)
268
- b = wp.tile_load(B, k, j, m=TILE_K, n=TILE_N)
267
+ a = wp.tile_load(A, shape=(TILE_M, TILE_K), offset=(i * TILE_M, k * TILE_K))
268
+ b = wp.tile_load(B, shape=(TILE_K, TILE_N), offset=(k * TILE_K, j * TILE_N))
269
269
 
270
270
  # sum += a*b
271
271
  wp.tile_matmul(a, b, sum)
272
272
 
273
- wp.tile_store(C, i, j, sum)
273
+ wp.tile_store(C, sum, offset=(i * TILE_M, j * TILE_N))
274
274
 
275
275
  M = TILE_M * 7
276
276
  K = TILE_K * 6
@@ -309,7 +309,7 @@ def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=floa
309
309
  # output tile index
310
310
  i = wp.tid()
311
311
 
312
- a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N)
312
+ a = wp.tile_load(input[i], shape=(TILE_M, TILE_N))
313
313
 
314
314
  # neg
315
315
  b = -a
@@ -323,7 +323,7 @@ def tile_operators(input: wp.array3d(dtype=float), output: wp.array3d(dtype=floa
323
323
  # add tiles
324
324
  e = a + d
325
325
 
326
- wp.tile_store(output[i], 0, 0, e)
326
+ wp.tile_store(output[i], e)
327
327
 
328
328
 
329
329
  def test_tile_operators(test, device):
@@ -358,10 +358,10 @@ def tile_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dtype=float
358
358
  # output tile index
359
359
  i = wp.tid()
360
360
 
361
- a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N)
361
+ a = wp.tile_load(input[i], shape=(TILE_M, TILE_N))
362
362
  s = wp.tile_sum(a) * 0.5
363
363
 
364
- wp.tile_store(output, i, s)
364
+ wp.tile_store(output, s, offset=i)
365
365
 
366
366
 
367
367
  def test_tile_sum(test, device):
@@ -442,47 +442,94 @@ def test_tile_sum_launch(test, device):
442
442
 
443
443
 
444
444
  @wp.kernel
445
- def tile_extract_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
446
- # output tile index
447
- i = wp.tid()
445
+ def test_tile_extract_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
446
+ i, j, x, y = wp.tid()
448
447
 
449
- t = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N)
448
+ tile = wp.tile_load(a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
450
449
 
451
- # perform a scalar copy, extracting each
452
- # tile element individually
453
- for i in range(TILE_M):
454
- for j in range(TILE_N):
455
- output[i, j] = t[i, j]
450
+ # compute sum of array sub tile
451
+ wp.atomic_add(b, i, j, wp.tile_extract(tile, x, y))
456
452
 
457
453
 
458
454
  def test_tile_extract(test, device):
459
- M = TILE_M
460
- N = TILE_N
455
+ block_dim = 16
461
456
 
462
- rng = np.random.default_rng(42)
463
- input = rng.random((M, N), dtype=np.float32)
457
+ input = np.arange(TILE_M * TILE_N * 4).reshape((TILE_M * 2, TILE_N * 2))
464
458
 
465
- input_wp = wp.array(input, requires_grad=True, device=device)
466
- output_wp = wp.zeros_like(input_wp, requires_grad=True, device=device)
459
+ a = wp.array(input, dtype=float, requires_grad=True, device=device)
460
+ b = wp.zeros((2, 2), dtype=float, requires_grad=True, device=device)
467
461
 
468
462
  with wp.Tape() as tape:
469
- wp.launch_tiled(tile_extract_kernel, dim=[1], inputs=[input_wp, output_wp], block_dim=TILE_DIM, device=device)
463
+ wp.launch(
464
+ test_tile_extract_kernel, dim=[2, 2, TILE_M, TILE_N], inputs=[a, b], block_dim=block_dim, device=device
465
+ )
470
466
 
471
- assert_array_equal(output_wp, input_wp)
467
+ # compute sum of each sub-block
468
+ sums = input.reshape(2, input.shape[0] // 2, 2, input.shape[1] // 2).sum(axis=(1, 3))
472
469
 
473
- output_wp.grad.fill_(1.0)
470
+ assert_np_equal(b.numpy(), sums)
471
+
472
+ b.grad.fill_(1.0)
474
473
 
475
474
  tape.backward()
476
475
 
477
- assert_np_equal(input_wp.grad.numpy(), np.ones_like(input))
476
+ expected_grad = np.ones_like(input)
477
+ assert_np_equal(a.grad.numpy(), expected_grad)
478
+
479
+
480
+ @wp.kernel
481
+ def test_tile_extract_repeated_kernel(a: wp.array2d(dtype=float), b: wp.array2d(dtype=float)):
482
+ i, j, x, y = wp.tid()
483
+
484
+ tile = wp.tile_load(a, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
485
+
486
+ # each thread extracts the first element of the sub-tile
487
+ # and accumulates the value onto the output
488
+ wp.atomic_add(b, i, j, wp.tile_extract(tile, 0, 0))
489
+
490
+
491
+ def test_tile_extract_repeated(test, device):
492
+ block_dim = 16
493
+
494
+ input = np.arange(TILE_M * TILE_N * 4).reshape((TILE_M * 2, TILE_N * 2))
495
+
496
+ a = wp.array(input, dtype=float, requires_grad=True, device=device)
497
+ b = wp.zeros((2, 2), dtype=float, requires_grad=True, device=device)
498
+
499
+ with wp.Tape() as tape:
500
+ wp.launch(
501
+ test_tile_extract_repeated_kernel,
502
+ dim=[2, 2, TILE_M, TILE_N],
503
+ inputs=[a, b],
504
+ block_dim=block_dim,
505
+ device=device,
506
+ )
507
+
508
+ # each thread adds the first element to the output
509
+ scale = TILE_M * TILE_N
510
+ sums = np.array([[input[0, 0], input[0, TILE_N]], [input[TILE_M, 0], input[TILE_M, TILE_N]]]) * scale
511
+
512
+ assert_np_equal(b.numpy(), sums)
513
+
514
+ b.grad.fill_(1.0)
515
+
516
+ tape.backward()
517
+
518
+ expected_grad = np.zeros_like(input)
519
+ expected_grad[0, 0] = scale
520
+ expected_grad[0, TILE_N] = scale
521
+ expected_grad[TILE_M, 0] = scale
522
+ expected_grad[TILE_M, TILE_N] = scale
523
+
524
+ assert_np_equal(a.grad.numpy(), expected_grad)
478
525
 
479
526
 
480
527
  @wp.kernel
481
528
  def test_tile_transpose_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
482
- x = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N)
529
+ x = wp.tile_load(input, shape=(TILE_M, TILE_N))
483
530
  y = wp.tile_transpose(x)
484
531
 
485
- wp.tile_store(output, 0, 0, y)
532
+ wp.tile_store(output, y)
486
533
 
487
534
 
488
535
  def test_tile_transpose(test, device):
@@ -499,13 +546,13 @@ def test_tile_transpose(test, device):
499
546
  def test_tile_transpose_matmul(test, device):
500
547
  @wp.kernel
501
548
  def test_tile_transpose_matmul_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
502
- x = wp.tile_load(input, 0, 0, m=TILE_M, n=TILE_N)
549
+ x = wp.tile_load(input, shape=(TILE_M, TILE_N))
503
550
  y = wp.tile_transpose(x)
504
551
 
505
- z = wp.tile_zeros(dtype=float, m=TILE_N, n=TILE_N)
552
+ z = wp.tile_zeros(dtype=float, shape=(TILE_N, TILE_N))
506
553
  wp.tile_matmul(y, x, z)
507
554
 
508
- wp.tile_store(output, 0, 0, z)
555
+ wp.tile_store(output, z)
509
556
 
510
557
  rng = np.random.default_rng(42)
511
558
  input = wp.array(rng.random((TILE_M, TILE_N), dtype=np.float32), device=device)
@@ -520,13 +567,13 @@ def test_tile_transpose_matmul(test, device):
520
567
  def test_tile_broadcast_add_kernel(
521
568
  input_a: wp.array2d(dtype=float), input_b: wp.array(dtype=float), output: wp.array2d(dtype=float)
522
569
  ):
523
- a = wp.tile_load(input_a, 0, 0, m=10, n=10)
524
- b = wp.tile_load(input_b, 0, n=10)
570
+ a = wp.tile_load(input_a, shape=(10, 10))
571
+ b = wp.tile_load(input_b, shape=10)
525
572
 
526
- c = wp.tile_broadcast(b, 10, 10)
573
+ c = wp.tile_broadcast(b, shape=(10, 10))
527
574
  d = a + c
528
575
 
529
- wp.tile_store(output, 0, 0, d)
576
+ wp.tile_store(output, d)
530
577
 
531
578
 
532
579
  def test_tile_broadcast_add(test, device):
@@ -544,13 +591,13 @@ def test_tile_broadcast_add(test, device):
544
591
 
545
592
  @wp.kernel
546
593
  def test_tile_broadcast_grad_kernel(a: wp.array(dtype=float), b: wp.array2d(dtype=float)):
547
- x = wp.tile_load(a, i=0, n=5)
548
- y = wp.tile_broadcast(x, m=5, n=5)
594
+ x = wp.tile_load(a, shape=5)
595
+ y = wp.tile_broadcast(x, shape=(5, 5))
549
596
 
550
- w = wp.tile_ones(dtype=float, m=5, n=5)
597
+ w = wp.tile_ones(dtype=float, shape=(5, 5))
551
598
  z = w + y
552
599
 
553
- wp.tile_store(b, 0, 0, z)
600
+ wp.tile_store(b, z)
554
601
 
555
602
 
556
603
  def test_tile_broadcast_grad(test, device):
@@ -567,153 +614,49 @@ def test_tile_broadcast_grad(test, device):
567
614
  assert_np_equal(a.grad.numpy(), np.ones(5) * 5.0)
568
615
 
569
616
 
570
- TILE_VIEW_M = 16
571
- TILE_VIEW_N = 128
572
-
573
-
574
617
  @wp.kernel
575
- def test_tile_view_kernel(src: wp.array2d(dtype=float), dst: wp.array2d(dtype=float)):
576
- # load whole source into local memory
577
- a = wp.tile_load(src, 0, 0, TILE_VIEW_M, TILE_VIEW_N)
578
-
579
- # copy the source array row by row
580
- for i in range(TILE_VIEW_M):
581
- # create a view on original array and store
582
- row = a[i]
583
- wp.tile_store(dst, i, 0, row)
584
-
585
-
586
- def test_tile_view(test, device):
587
- rng = np.random.default_rng(42)
588
-
589
- a = wp.array(rng.random((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
590
- b = wp.array(np.zeros((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
618
+ def tile_len_kernel(
619
+ a: wp.array(dtype=float, ndim=2),
620
+ out: wp.array(dtype=int),
621
+ ):
622
+ x = wp.tile_load(a, shape=(TILE_M, TILE_N))
591
623
 
592
- with wp.Tape() as tape:
593
- wp.launch_tiled(test_tile_view_kernel, dim=[1], inputs=[a, b], block_dim=32, device=device)
624
+ length = wp.static(len(x))
625
+ wp.expect_eq(wp.static(len(x)), TILE_M)
626
+ out[0] = wp.static(len(x))
594
627
 
595
- assert_np_equal(b.numpy(), a.numpy())
596
628
 
597
- b.grad = wp.ones_like(b, device=device)
598
- tape.backward()
629
+ def test_tile_len(test, device):
630
+ a = wp.zeros((TILE_M, TILE_N), dtype=float, device=device)
631
+ out = wp.empty(1, dtype=int, device=device)
632
+ wp.launch_tiled(
633
+ tile_len_kernel,
634
+ dim=(1,),
635
+ inputs=(a,),
636
+ outputs=(out,),
637
+ block_dim=32,
638
+ device=device,
639
+ )
599
640
 
600
- assert_np_equal(a.grad.numpy(), np.ones_like(a.numpy()))
641
+ test.assertEqual(out.numpy()[0], TILE_M)
601
642
 
602
643
 
603
644
  @wp.kernel
604
- def test_tile_assign_kernel(src: wp.array2d(dtype=float), dst: wp.array2d(dtype=float)):
605
- # load whole source into local memory
606
- a = wp.tile_load(src, 0, 0, m=TILE_VIEW_M, n=TILE_VIEW_N)
607
- b = wp.tile_zeros(dtype=float, m=TILE_VIEW_M, n=TILE_VIEW_N)
608
-
609
- # copy the source array row by row
610
- for i in range(TILE_VIEW_M):
611
- # create views onto source and dest rows
612
- row_src = a[i]
613
- row_dst = b[i]
614
-
615
- # copy onto dest row
616
- wp.tile_assign(row_dst, 0, 0, row_src)
617
-
618
- wp.tile_store(dst, 0, 0, b)
619
-
620
-
621
- def test_tile_assign(test, device):
622
- rng = np.random.default_rng(42)
623
-
624
- a = wp.array(rng.random((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
625
- b = wp.array(np.zeros((TILE_VIEW_M, TILE_VIEW_N), dtype=np.float32), requires_grad=True, device=device)
626
-
627
- with wp.Tape() as tape:
628
- wp.launch_tiled(test_tile_assign_kernel, dim=[1], inputs=[a, b], block_dim=32, device=device)
629
-
630
- assert_np_equal(b.numpy(), a.numpy())
631
-
632
- b.grad = wp.ones_like(b, device=device)
633
- tape.backward()
634
-
635
- assert_np_equal(a.grad.numpy(), np.ones_like(a.numpy()))
636
-
637
-
638
- # #-----------------------------------------
639
- # # center of mass computation
640
-
641
- # start = offset[i]
642
- # end = offset[i+1]
643
-
644
- # com = wp.tile_zeros(dtype=wp.vec3, M=1)
645
-
646
- # # load chunks of indices
647
- # for i in range(start, end, N):
648
-
649
- # count = wp.min(N, end-i)
650
-
651
- # idx = wp.tile_load(indices, i, N, max_col=count)
652
- # p = wp.tile_load(points, idx, max_col=count)
645
+ def test_tile_print_kernel():
646
+ # shared tile
647
+ a = wp.tile_ones(shape=(4, 3), dtype=float, storage="shared")
648
+ # register tile
649
+ b = wp.tile_ones(shape=(4, 3), dtype=float)
653
650
 
654
- # com += wp.tile_sum(p)
651
+ print(a)
652
+ print(b)
655
653
 
656
654
 
657
- # wp.tile_store(out[i], com)
655
+ def test_tile_print(test, device):
656
+ wp.launch_tiled(test_tile_print_kernel, dim=1, inputs=[], block_dim=64, device=device)
657
+ wp.synchronize()
658
658
 
659
659
 
660
- # #-------------------------------------------
661
- # # compute deformation gradient
662
-
663
- # i =
664
- # j =
665
- # k =
666
- # l =
667
-
668
- # f = wp.tile(F) # generate a block size tile of feature vectors
669
-
670
- # # layer 1
671
- # w1 = wp.tile_load(weights)
672
- # b1 = wp.tile_load(bias)
673
-
674
- # z = wp.tile_matmul(w1, f) + b1
675
- # z = wp.tile_map(relu, z)
676
-
677
- # # layer 2
678
- # w2 = wp.tile_load(weights)
679
- # b2 = wp.tile_load(bias)
680
-
681
- # z = wp.tile_matmul(w2, z) + b2
682
- # z = wp.tile_map(relu, z)
683
-
684
- # o = wp.untile(f)
685
-
686
-
687
- # #----------------------------------
688
- # # MLP with helper function for linear layers
689
- # # where shape is only partially known
690
- # # at compile time, and the other dims
691
- # # are inferred from the input vector
692
-
693
- # f = wp.tile(F)
694
-
695
- # z = wp.tile_linear(weights1, bias1, f, hidden=16)
696
- # z = wp.tile_map(relu, z)
697
-
698
- # z = wp.tile_linear(weights2, bias2, f, hidden=8)
699
- # z = wp.tile_map(relu, z)
700
-
701
- # z = wp.tile_linear(weights3, bias3, f, hidden=4)
702
- # z = wp.tile_map(relu, z)
703
-
704
- # o = wp.untile(z)
705
-
706
-
707
- # #----------------------------------
708
- # # softmax
709
-
710
- # def softmax(z: Any):
711
-
712
- # e = wp.tile_map(wp.exp, z)
713
- # s = wp.tile_sum(e, dim=0)
714
-
715
- # return z/s[0]
716
-
717
660
  devices = get_cuda_test_devices()
718
661
 
719
662
 
@@ -733,11 +676,11 @@ add_function_test(TestTile, "test_tile_operators", test_tile_operators, devices=
733
676
  add_function_test(TestTile, "test_tile_sum", test_tile_sum, devices=devices)
734
677
  add_function_test(TestTile, "test_tile_sum_launch", test_tile_sum_launch, devices=devices)
735
678
  add_function_test(TestTile, "test_tile_extract", test_tile_extract, devices=devices)
679
+ add_function_test(TestTile, "test_tile_extract_repeated", test_tile_extract_repeated, devices=devices)
736
680
  add_function_test(TestTile, "test_tile_broadcast_add", test_tile_broadcast_add, devices=devices)
737
681
  add_function_test(TestTile, "test_tile_broadcast_grad", test_tile_broadcast_grad, devices=devices)
738
- add_function_test(TestTile, "test_tile_view", test_tile_view, devices=devices)
739
- add_function_test(TestTile, "test_tile_assign", test_tile_assign, devices=devices)
740
-
682
+ add_function_test(TestTile, "test_tile_len", test_tile_len, devices=devices)
683
+ add_function_test(TestTile, "test_tile_print", test_tile_print, devices=devices, check_output=False)
741
684
 
742
685
  if __name__ == "__main__":
743
686
  wp.clear_kernel_cache()