warp-lang 1.5.1__py3-none-win_amd64.whl → 1.6.0__py3-none-win_amd64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (123) hide show
  1. warp/__init__.py +5 -0
  2. warp/autograd.py +414 -191
  3. warp/bin/warp-clang.dll +0 -0
  4. warp/bin/warp.dll +0 -0
  5. warp/build.py +40 -12
  6. warp/build_dll.py +13 -6
  7. warp/builtins.py +1076 -480
  8. warp/codegen.py +240 -119
  9. warp/config.py +1 -1
  10. warp/context.py +298 -84
  11. warp/examples/assets/square_cloth.usd +0 -0
  12. warp/examples/benchmarks/benchmark_gemm.py +27 -18
  13. warp/examples/benchmarks/benchmark_interop_paddle.py +3 -3
  14. warp/examples/benchmarks/benchmark_interop_torch.py +3 -3
  15. warp/examples/core/example_torch.py +18 -34
  16. warp/examples/fem/example_apic_fluid.py +1 -0
  17. warp/examples/fem/example_mixed_elasticity.py +1 -1
  18. warp/examples/optim/example_bounce.py +1 -1
  19. warp/examples/optim/example_cloth_throw.py +1 -1
  20. warp/examples/optim/example_diffray.py +4 -15
  21. warp/examples/optim/example_drone.py +1 -1
  22. warp/examples/optim/example_softbody_properties.py +392 -0
  23. warp/examples/optim/example_trajectory.py +1 -3
  24. warp/examples/optim/example_walker.py +5 -0
  25. warp/examples/sim/example_cartpole.py +0 -2
  26. warp/examples/sim/example_cloth_self_contact.py +260 -0
  27. warp/examples/sim/example_granular_collision_sdf.py +4 -5
  28. warp/examples/sim/example_jacobian_ik.py +0 -2
  29. warp/examples/sim/example_quadruped.py +5 -2
  30. warp/examples/tile/example_tile_cholesky.py +79 -0
  31. warp/examples/tile/example_tile_convolution.py +2 -2
  32. warp/examples/tile/example_tile_fft.py +2 -2
  33. warp/examples/tile/example_tile_filtering.py +3 -3
  34. warp/examples/tile/example_tile_matmul.py +4 -4
  35. warp/examples/tile/example_tile_mlp.py +12 -12
  36. warp/examples/tile/example_tile_nbody.py +180 -0
  37. warp/examples/tile/example_tile_walker.py +319 -0
  38. warp/math.py +147 -0
  39. warp/native/array.h +12 -0
  40. warp/native/builtin.h +0 -1
  41. warp/native/bvh.cpp +149 -70
  42. warp/native/bvh.cu +287 -68
  43. warp/native/bvh.h +195 -85
  44. warp/native/clang/clang.cpp +5 -1
  45. warp/native/cuda_util.cpp +35 -0
  46. warp/native/cuda_util.h +5 -0
  47. warp/native/exports.h +40 -40
  48. warp/native/intersect.h +17 -0
  49. warp/native/mat.h +41 -0
  50. warp/native/mathdx.cpp +19 -0
  51. warp/native/mesh.cpp +25 -8
  52. warp/native/mesh.cu +153 -101
  53. warp/native/mesh.h +482 -403
  54. warp/native/quat.h +40 -0
  55. warp/native/solid_angle.h +7 -0
  56. warp/native/sort.cpp +85 -0
  57. warp/native/sort.cu +34 -0
  58. warp/native/sort.h +3 -1
  59. warp/native/spatial.h +11 -0
  60. warp/native/tile.h +1185 -664
  61. warp/native/tile_reduce.h +8 -6
  62. warp/native/vec.h +41 -0
  63. warp/native/warp.cpp +8 -1
  64. warp/native/warp.cu +263 -40
  65. warp/native/warp.h +19 -5
  66. warp/optim/linear.py +22 -4
  67. warp/render/render_opengl.py +124 -59
  68. warp/sim/__init__.py +6 -1
  69. warp/sim/collide.py +270 -26
  70. warp/sim/integrator_euler.py +25 -7
  71. warp/sim/integrator_featherstone.py +154 -35
  72. warp/sim/integrator_vbd.py +842 -40
  73. warp/sim/model.py +111 -53
  74. warp/stubs.py +248 -115
  75. warp/tape.py +28 -30
  76. warp/tests/aux_test_module_unload.py +15 -0
  77. warp/tests/{test_sim_grad.py → flaky_test_sim_grad.py} +104 -63
  78. warp/tests/test_array.py +74 -0
  79. warp/tests/test_assert.py +242 -0
  80. warp/tests/test_codegen.py +14 -61
  81. warp/tests/test_collision.py +2 -2
  82. warp/tests/test_examples.py +9 -0
  83. warp/tests/test_grad_debug.py +87 -2
  84. warp/tests/test_hash_grid.py +1 -1
  85. warp/tests/test_ipc.py +116 -0
  86. warp/tests/test_mat.py +138 -167
  87. warp/tests/test_math.py +47 -1
  88. warp/tests/test_matmul.py +11 -7
  89. warp/tests/test_matmul_lite.py +4 -4
  90. warp/tests/test_mesh.py +84 -60
  91. warp/tests/test_mesh_query_aabb.py +165 -0
  92. warp/tests/test_mesh_query_point.py +328 -286
  93. warp/tests/test_mesh_query_ray.py +134 -121
  94. warp/tests/test_mlp.py +2 -2
  95. warp/tests/test_operators.py +43 -0
  96. warp/tests/test_overwrite.py +2 -2
  97. warp/tests/test_quat.py +77 -0
  98. warp/tests/test_reload.py +29 -0
  99. warp/tests/test_sim_grad_bounce_linear.py +204 -0
  100. warp/tests/test_static.py +16 -0
  101. warp/tests/test_tape.py +25 -0
  102. warp/tests/test_tile.py +134 -191
  103. warp/tests/test_tile_load.py +356 -0
  104. warp/tests/test_tile_mathdx.py +61 -8
  105. warp/tests/test_tile_mlp.py +17 -17
  106. warp/tests/test_tile_reduce.py +24 -18
  107. warp/tests/test_tile_shared_memory.py +66 -17
  108. warp/tests/test_tile_view.py +165 -0
  109. warp/tests/test_torch.py +35 -0
  110. warp/tests/test_utils.py +36 -24
  111. warp/tests/test_vec.py +110 -0
  112. warp/tests/unittest_suites.py +29 -4
  113. warp/tests/unittest_utils.py +30 -11
  114. warp/thirdparty/unittest_parallel.py +2 -2
  115. warp/types.py +409 -99
  116. warp/utils.py +9 -5
  117. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/METADATA +68 -44
  118. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/RECORD +121 -110
  119. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/WHEEL +1 -1
  120. warp/examples/benchmarks/benchmark_tile.py +0 -179
  121. warp/native/tile_gemm.h +0 -341
  122. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/LICENSE.md +0 -0
  123. {warp_lang-1.5.1.dist-info → warp_lang-1.6.0.dist-info}/top_level.txt +0 -0
@@ -0,0 +1,356 @@
1
+ # Copyright (c) 2024 NVIDIA CORPORATION. All rights reserved.
2
+ # NVIDIA CORPORATION and its licensors retain all intellectual property
3
+ # and proprietary rights in and to this software, related documentation
4
+ # and any modifications thereto. Any use, reproduction, disclosure or
5
+ # distribution of this software and related documentation without an express
6
+ # license agreement from NVIDIA CORPORATION is strictly prohibited.
7
+
8
+ import unittest
9
+
10
+ import numpy as np
11
+
12
+ import warp as wp
13
+ from warp.tests.unittest_utils import *
14
+
15
# value passed as block_dim to wp.launch_tiled() by the tests in this file
TILE_DIM = 64

# tile extents along the first, second, third and fourth array dimension
TILE_M = wp.constant(16)
TILE_N = wp.constant(8)
TILE_O = wp.constant(8)
TILE_P = wp.constant(6)

# per-dimension element offset used by the offset load/store variants
TILE_OFFSET = 5
23
+
24
+
25
# Loads a TILE_M-element tile from `input`, once from the start of the array
# and once starting at TILE_OFFSET, then writes the results to the outputs.
# Only full0 and padded0 are stored; the remaining loads are never read back
# (presumably they exist to exercise the alternative argument spellings).
@wp.kernel
def tile_load_1d_kernel(
    input: wp.array1d(dtype=float),
    out_full: wp.array1d(dtype=float),
    out_padded: wp.array1d(dtype=float),
    out_offset: wp.array1d(dtype=float),
):
    # shape given positionally, as a keyword scalar, and as a keyword tuple
    full0 = wp.tile_load(input, TILE_M)
    full1 = wp.tile_load(input, shape=TILE_M)
    full2 = wp.tile_load(input, shape=(TILE_M,))

    # same three spellings, with the source offset TILE_OFFSET added
    padded0 = wp.tile_load(input, TILE_M, TILE_OFFSET)
    padded1 = wp.tile_load(input, shape=TILE_M, offset=TILE_OFFSET)
    padded2 = wp.tile_load(input, shape=(TILE_M,), offset=(TILE_OFFSET,))

    wp.tile_store(out_full, full0)
    wp.tile_store(out_padded, padded0)
    # store the un-offset tile at a destination offset
    wp.tile_store(out_offset, full0, offset=(TILE_OFFSET,))
43
+
44
+
45
# 2-D variant: loads a (TILE_M, TILE_N) tile from `input` with and without a
# TILE_OFFSET source offset in every dimension, and stores the results.
@wp.kernel
def tile_load_2d_kernel(
    input: wp.array2d(dtype=float),
    out_full: wp.array2d(dtype=float),
    out_padded: wp.array2d(dtype=float),
    out_offset: wp.array2d(dtype=float),
):
    full0 = wp.tile_load(input, shape=(TILE_M, TILE_N))
    padded0 = wp.tile_load(input, shape=(TILE_M, TILE_N), offset=(TILE_OFFSET, TILE_OFFSET))

    wp.tile_store(out_full, full0)
    wp.tile_store(out_padded, padded0)
    # store the un-offset tile at a destination offset
    wp.tile_store(out_offset, full0, offset=(TILE_OFFSET, TILE_OFFSET))
58
+
59
+
60
# 3-D variant: loads a (TILE_M, TILE_N, TILE_O) tile from `input` with and
# without a TILE_OFFSET source offset in every dimension, and stores the results.
@wp.kernel
def tile_load_3d_kernel(
    input: wp.array3d(dtype=float),
    out_full: wp.array3d(dtype=float),
    out_padded: wp.array3d(dtype=float),
    out_offset: wp.array3d(dtype=float),
):
    full0 = wp.tile_load(input, shape=(TILE_M, TILE_N, TILE_O))
    padded0 = wp.tile_load(input, shape=(TILE_M, TILE_N, TILE_O), offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET))

    wp.tile_store(out_full, full0)
    wp.tile_store(out_padded, padded0)
    # store the un-offset tile at a destination offset
    wp.tile_store(out_offset, full0, offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET))
73
+
74
+
75
# 4-D variant: loads a (TILE_M, TILE_N, TILE_O, TILE_P) tile from `input` with
# and without a TILE_OFFSET source offset in every dimension, and stores the results.
@wp.kernel
def tile_load_4d_kernel(
    input: wp.array4d(dtype=float),
    out_full: wp.array4d(dtype=float),
    out_padded: wp.array4d(dtype=float),
    out_offset: wp.array4d(dtype=float),
):
    full0 = wp.tile_load(input, shape=(TILE_M, TILE_N, TILE_O, TILE_P))
    padded0 = wp.tile_load(
        input, shape=(TILE_M, TILE_N, TILE_O, TILE_P), offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET, TILE_OFFSET)
    )

    wp.tile_store(out_full, full0)
    wp.tile_store(out_padded, padded0)
    # store the un-offset tile at a destination offset
    wp.tile_store(out_offset, full0, offset=(TILE_OFFSET, TILE_OFFSET, TILE_OFFSET, TILE_OFFSET))
90
+
91
+
92
def test_tile_load(kernel, ndim):
    """Build a device test for one of the ``tile_load_*d_kernel`` kernels.

    Args:
        kernel: Kernel taking ``(input, out_full, out_padded, out_offset)``.
        ndim: Number of leading tile extents (TILE_M, TILE_N, TILE_O, TILE_P)
            that make up the array shape.

    Returns:
        A ``(test, device)`` callable suitable for ``add_function_test``.
    """

    def test_run(test, device):
        dims = [TILE_M, TILE_N, TILE_O, TILE_P][:ndim]

        source = wp.array(
            np.random.default_rng(42).random(dims), dtype=float, requires_grad=True, device=device
        )
        out_full, out_padded, out_offset = (wp.zeros(dims, dtype=float, device=device) for _ in range(3))

        tape = wp.Tape()
        with tape:
            wp.launch_tiled(
                kernel,
                dim=[1],
                inputs=[source, out_full, out_padded, out_offset],
                block_dim=TILE_DIM,
                device=device,
            )

        # per-dimension slices: everything from TILE_OFFSET onwards, and
        # everything except the last TILE_OFFSET entries
        shifted = tuple(slice(TILE_OFFSET, extent) for extent in dims)
        trimmed = tuple(slice(None, extent - TILE_OFFSET) for extent in dims)

        expected_full = source.numpy()

        # offset load: the tail of the input lands at the front, the rest stays zero
        expected_padded = np.zeros_like(expected_full)
        expected_padded[trimmed] = expected_full[shifted]

        # offset store: the front of the input lands at the tail, the rest stays zero
        expected_offset = np.zeros_like(expected_full)
        expected_offset[shifted] = expected_full[trimmed]

        checks = (
            (out_full, expected_full),
            (out_padded, expected_padded),
            (out_offset, expected_offset),
        )
        for actual, expected in checks:
            assert_np_equal(actual.numpy(), expected)

        # seed only the un-offset output and expect a gradient of one everywhere
        out_full.grad = wp.ones_like(out_full)
        tape.backward()

        grad = source.grad.numpy()
        assert_np_equal(grad, np.ones_like(grad))

    return test_run
134
+
135
+
136
+ # ----------------------------------------------------------------------------------------
137
+
138
# extent of the tile along every dimension in the element-access tests below
TILE_SIZE = 4
139
+
140
+
141
# Loads a TILE_SIZE-element tile from `input` and copies the element at this
# thread's index from the tile to `output`.
@wp.kernel
def tile_extract_1d_kernel(input: wp.array1d(dtype=float), output: wp.array1d(dtype=float)):
    i = wp.tid()

    t = wp.tile_load(input, shape=TILE_SIZE)

    # read a single element back out of the tile
    output[i] = t[i]
148
+
149
+
150
# Loads a (TILE_SIZE, TILE_SIZE) tile from `input` and copies the element at
# this thread's (i, j) index from the tile to `output`.
@wp.kernel
def tile_extract_2d_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
    i, j = wp.tid()

    t = wp.tile_load(input, shape=(TILE_SIZE, TILE_SIZE))

    # read a single element back out of the tile
    output[i, j] = t[i, j]
157
+
158
+
159
# Loads a TILE_SIZE^3 tile from `input` and copies the element at this
# thread's (i, j, k) index from the tile to `output`.
@wp.kernel
def tile_extract_3d_kernel(input: wp.array3d(dtype=float), output: wp.array3d(dtype=float)):
    i, j, k = wp.tid()

    t = wp.tile_load(input, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE))

    # read a single element back out of the tile
    output[i, j, k] = t[i, j, k]
166
+
167
+
168
# Loads a TILE_SIZE^4 tile from `input` and copies the element at this
# thread's (i, j, k, l) index from the tile to `output`.
@wp.kernel
def tile_extract_4d_kernel(input: wp.array4d(dtype=float), output: wp.array4d(dtype=float)):
    i, j, k, l = wp.tid()

    t = wp.tile_load(input, shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE, TILE_SIZE))

    # read a single element back out of the tile
    output[i, j, k, l] = t[i, j, k, l]
175
+
176
+
177
def test_tile_extract(kernel, ndim):
    """Build a device test for one of the ``tile_extract_*d_kernel`` kernels.

    Args:
        kernel: Kernel taking ``(input, output)`` arrays of rank ``ndim``.
        ndim: Rank of the arrays; every dimension has extent TILE_SIZE.

    Returns:
        A ``(test, device)`` callable suitable for ``add_function_test``.
    """
    dims = (TILE_SIZE,) * ndim

    def test_run(test, device):
        source = wp.array(
            np.random.default_rng(42).random(dims), dtype=float, requires_grad=True, device=device
        )
        result = wp.zeros_like(source)

        tape = wp.Tape()
        with tape:
            wp.launch(kernel, dim=dims, inputs=[source, result], block_dim=1024, device=device)

        # forward: every element must come back unchanged
        assert_np_equal(result.numpy(), source.numpy())

        # backward: a unit output adjoint must produce a unit input gradient
        result.grad = wp.ones_like(result)
        tape.backward()

        assert_np_equal(source.grad.numpy(), np.ones_like(source.numpy()))

    return test_run
203
+
204
+
205
+ # ----------------------------------------------------------------------------------------
206
+
207
# NOTE: rebinds TILE_SIZE to the same value it was given earlier in this file;
# the tile-assign kernels and test below read it.
TILE_SIZE = 4
208
+
209
+
210
# Creates a zero-filled TILE_SIZE-element tile, writes input[i] * 2 into the
# element at this thread's index, and copies that element to `output`.
@wp.kernel
def tile_assign_1d_kernel(input: wp.array1d(dtype=float), output: wp.array1d(dtype=float)):
    i = wp.tid()

    t = wp.tile_zeros(shape=(TILE_SIZE,), dtype=float)

    # assign to tile
    t[i] = input[i] * 2.0

    # read the assigned element back
    output[i] = t[i]
220
+
221
+
222
# Creates a zero-filled (TILE_SIZE, TILE_SIZE) tile, writes input[i, j] * 2
# into the element at this thread's index, and copies that element to `output`.
@wp.kernel
def tile_assign_2d_kernel(input: wp.array2d(dtype=float), output: wp.array2d(dtype=float)):
    i, j = wp.tid()

    t = wp.tile_zeros(shape=(TILE_SIZE, TILE_SIZE), dtype=float)

    # assign to tile
    t[i, j] = input[i, j] * 2.0

    # read the assigned element back
    output[i, j] = t[i, j]
232
+
233
+
234
# Creates a zero-filled TILE_SIZE^3 tile, writes input[i, j, k] * 2 into the
# element at this thread's index, and copies that element to `output`.
@wp.kernel
def tile_assign_3d_kernel(input: wp.array3d(dtype=float), output: wp.array3d(dtype=float)):
    i, j, k = wp.tid()

    t = wp.tile_zeros(shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE), dtype=float)

    # assign to tile
    t[i, j, k] = input[i, j, k] * 2.0

    # read the assigned element back
    output[i, j, k] = t[i, j, k]
244
+
245
+
246
# Creates a zero-filled TILE_SIZE^4 tile, writes input[i, j, k, l] * 2 into
# the element at this thread's index, and copies that element to `output`.
@wp.kernel
def tile_assign_4d_kernel(input: wp.array4d(dtype=float), output: wp.array4d(dtype=float)):
    i, j, k, l = wp.tid()

    t = wp.tile_zeros(shape=(TILE_SIZE, TILE_SIZE, TILE_SIZE, TILE_SIZE), dtype=float)

    # assign to tile
    t[i, j, k, l] = input[i, j, k, l] * 2.0

    # read the assigned element back
    output[i, j, k, l] = t[i, j, k, l]
256
+
257
+
258
def test_tile_assign(kernel, ndim):
    """Build a device test for one of the ``tile_assign_*d_kernel`` kernels.

    Args:
        kernel: Kernel taking ``(input, output)`` arrays of rank ``ndim``.
        ndim: Rank of the arrays; every dimension has extent TILE_SIZE.

    Returns:
        A ``(test, device)`` callable suitable for ``add_function_test``.
        It checks the forward result only (output == 2 * input).
    """
    shape = (TILE_SIZE,) * ndim

    def test_run(test, device):
        rng = np.random.default_rng(42)

        # named `source` rather than `input` to avoid shadowing the builtin
        source = wp.array(rng.random(shape), dtype=float, requires_grad=True, device=device)
        output = wp.zeros_like(source)

        # The launch is still recorded on a tape, as before, but the tape is
        # not bound to a name because no backward pass is run here.
        with wp.Tape():
            wp.launch(
                kernel,
                dim=shape,
                inputs=[source, output],
                block_dim=1024,
                device=device,
            )

        assert_np_equal(output.numpy(), source.numpy() * 2.0)

    return test_run
279
+
280
+
281
+ # ----------------------------------------------------------------------------------------
282
+
283
+
284
# Copies one (TILE_M, TILE_N) tile from A to the same location in B; the tile
# to copy is selected by the (i, j) index returned by wp.tid().
@wp.kernel
def tile_load_fortran_kernel(A: wp.array2d(dtype=float), B: wp.array2d(dtype=float)):
    # tile index
    i, j = wp.tid()

    # element offset of tile (i, j) is the tile index scaled by the tile extents
    a = wp.tile_load(A, shape=(TILE_M, TILE_N), offset=(i * TILE_M, j * TILE_N))
    wp.tile_store(B, t=a, offset=(i * TILE_M, j * TILE_N))
291
+
292
+
293
def test_tile_load_fortran(test, device):
    """Check tile load/store on arrays created from column-major NumPy data.

    A and B are random (7 * TILE_M, 5 * TILE_N) float32 arrays converted with
    ``np.asfortranarray``. The kernel copies A into B one tile at a time, so
    afterwards B must equal A, and after seeding ``B.grad`` with ones the
    gradient of A must equal ``B.grad``.

    Args:
        test: First argument supplied by the test harness (unused here).
        device: Device on which the arrays are created and the kernel runs.
    """
    rng = np.random.default_rng(42)

    M = TILE_M * 7
    N = TILE_N * 5

    A = rng.random((M, N), dtype=np.float32)
    B = rng.random((M, N), dtype=np.float32)

    # convert to column major layout
    A = np.asfortranarray(A)
    B = np.asfortranarray(B)

    A_wp = wp.array(A, requires_grad=True, device=device)
    B_wp = wp.array(B, requires_grad=True, device=device)

    with wp.Tape() as tape:
        wp.launch_tiled(
            tile_load_fortran_kernel,
            # one tile per grid cell; floor division keeps the count an exact
            # integer instead of round-tripping through a float
            dim=[M // TILE_M, N // TILE_N],
            inputs=[A_wp, B_wp],
            block_dim=TILE_DIM,
            device=device,
        )

    # verify forward pass
    assert_array_equal(B_wp, A_wp)

    # verify backward pass
    B_wp.grad = wp.ones_like(B_wp, device=device)
    tape.backward()

    assert_array_equal(B_wp.grad, A_wp.grad)
326
+
327
+
328
# devices every test below is registered for (helper name suggests CUDA devices only — confirm)
devices = get_cuda_test_devices()
329
+
330
+
331
class TestTileLoad(unittest.TestCase):
    """Empty test case; the tile tests are attached by the add_function_test() calls below."""
333
+
334
+
335
# Register one test per kernel, in the same order as before: the four load
# tests, then the four extract tests, then the four assign tests. The factory
# is called immediately, so each generated test keeps its own kernel and rank.
for _ndim, _kernel in enumerate(
    (tile_load_1d_kernel, tile_load_2d_kernel, tile_load_3d_kernel, tile_load_4d_kernel), start=1
):
    add_function_test(TestTileLoad, f"test_tile_load_{_ndim}d", test_tile_load(_kernel, _ndim), devices=devices)

for _ndim, _kernel in enumerate(
    (tile_extract_1d_kernel, tile_extract_2d_kernel, tile_extract_3d_kernel, tile_extract_4d_kernel), start=1
):
    add_function_test(
        TestTileLoad, f"test_tile_extract_{_ndim}d", test_tile_extract(_kernel, _ndim), devices=devices
    )

for _ndim, _kernel in enumerate(
    (tile_assign_1d_kernel, tile_assign_2d_kernel, tile_assign_3d_kernel, tile_assign_4d_kernel), start=1
):
    add_function_test(TestTileLoad, f"test_tile_assign_{_ndim}d", test_tile_assign(_kernel, _ndim), devices=devices)

# the column-major test is a plain (test, device) function, not a factory
add_function_test(TestTileLoad, "test_tile_load_fortran", test_tile_load_fortran, devices=devices)
352
+
353
+
354
# When run as a script: clear Warp's kernel cache first (presumably so the
# kernels above are rebuilt rather than reused), then run the tests verbosely,
# stopping at the first failure.
if __name__ == "__main__":
    wp.clear_kernel_cache()
    unittest.main(verbosity=2, failfast=True)
@@ -30,11 +30,11 @@ def tile_math_matmul_kernel(
30
30
  ga: wp.array2d(dtype=wp.float16), gb: wp.array2d(dtype=wp.float32), gc: wp.array2d(dtype=wp.float64)
31
31
  ):
32
32
  i, j = wp.tid()
33
- a = wp.tile_load(ga, i, j, m=TILE_M, n=TILE_K)
34
- b = wp.tile_load(gb, i, j, m=TILE_K, n=TILE_N)
35
- c = wp.tile_zeros(m=TILE_M, n=TILE_N, dtype=wp.float64)
33
+ a = wp.tile_load(ga, shape=(TILE_M, TILE_K), offset=(i * TILE_M, j * TILE_K))
34
+ b = wp.tile_load(gb, shape=(TILE_K, TILE_N), offset=(i * TILE_K, j * TILE_N))
35
+ c = wp.tile_zeros(shape=(TILE_M, TILE_N), dtype=wp.float64)
36
36
  wp.tile_matmul(a, b, c)
37
- wp.tile_store(gc, i, j, c)
37
+ wp.tile_store(gc, c, offset=(i * TILE_M, j * TILE_N))
38
38
 
39
39
 
40
40
  def test_tile_math_matmul(test, device):
@@ -71,17 +71,17 @@ def test_tile_math_matmul(test, device):
71
71
  @wp.kernel()
72
72
  def tile_math_fft_kernel_vec2f(gx: wp.array2d(dtype=wp.vec2f), gy: wp.array2d(dtype=wp.vec2f)):
73
73
  i, j = wp.tid()
74
- xy = wp.tile_load(gx, i, j, m=FFT_SIZE_FP32, n=FFT_SIZE_FP32)
74
+ xy = wp.tile_load(gx, shape=(FFT_SIZE_FP32, FFT_SIZE_FP32))
75
75
  wp.tile_fft(xy)
76
- wp.tile_store(gy, i, j, xy)
76
+ wp.tile_store(gy, xy)
77
77
 
78
78
 
79
79
  @wp.kernel()
80
80
  def tile_math_fft_kernel_vec2d(gx: wp.array2d(dtype=wp.vec2d), gy: wp.array2d(dtype=wp.vec2d)):
81
81
  i, j = wp.tid()
82
- xy = wp.tile_load(gx, i, j, m=FFT_SIZE_FP64, n=FFT_SIZE_FP64)
82
+ xy = wp.tile_load(gx, shape=(FFT_SIZE_FP64, FFT_SIZE_FP64))
83
83
  wp.tile_fft(xy)
84
- wp.tile_store(gy, i, j, xy)
84
+ wp.tile_store(gy, xy)
85
85
 
86
86
 
87
87
  def test_tile_math_fft(test, device, wp_dtype):
@@ -114,6 +114,56 @@ def test_tile_math_fft(test, device, wp_dtype):
114
114
  # TODO: implement and test backward pass
115
115
 
116
116
 
117
# Factorizes A + diag(D) with a tile Cholesky decomposition and uses the
# factor to solve for y given x; writes the factor to gL and the solution to gy.
@wp.kernel()
def tile_math_cholesky(
    gA: wp.array2d(dtype=wp.float64),
    gD: wp.array1d(dtype=wp.float64),
    gL: wp.array2d(dtype=wp.float64),
    gx: wp.array1d(dtype=wp.float64),
    gy: wp.array1d(dtype=wp.float64),
):
    # NOTE(review): i and j are read but not used in the rest of the kernel
    i, j = wp.tid()
    # Load A, D & x
    a = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
    d = wp.tile_load(gD, shape=TILE_M, storage="shared")
    x = wp.tile_load(gx, shape=TILE_M, storage="shared")
    # Compute L st LL^T = A + diag(D)
    b = wp.tile_diag_add(a, d)
    l = wp.tile_cholesky(b)
    # Solve for y in LL^T y = x
    y = wp.tile_cholesky_solve(l, x)
    # Store L & y
    wp.tile_store(gL, l)
    wp.tile_store(gy, y)
138
+
139
+
140
def test_tile_math_cholesky(test, device):
    """Compare the tile Cholesky factorization and solve against NumPy.

    The system matrix is an all-ones (TILE_M, TILE_M) matrix plus 8 on the
    diagonal and the right-hand side is ``arange(TILE_M)``. The factor L and
    the solution y written by the kernel are checked against
    ``np.linalg.cholesky`` and ``np.linalg.solve``.

    Args:
        test: First argument supplied by the test harness (unused here).
        device: Device on which the arrays are created and the kernel runs.

    Raises:
        AssertionError: If L or y differ from the NumPy reference beyond the
            default ``np.allclose`` tolerances (rtol=1e-5, atol=1e-8).
    """
    A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
    D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
    L_h = np.zeros_like(A_h)
    X_h = np.arange(TILE_M, dtype=np.float64)
    Y_h = np.zeros_like(X_h)

    # NumPy reference for the factor and the solution
    A_np = A_h + np.diag(D_h)
    L_np = np.linalg.cholesky(A_np)
    Y_np = np.linalg.solve(A_np, X_h)

    # matrices are 2-D, while D, x and y are 1-D to match the kernel signature
    A_wp = wp.array2d(A_h, requires_grad=True, dtype=wp.float64, device=device)
    D_wp = wp.array1d(D_h, requires_grad=True, dtype=wp.float64, device=device)
    L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
    X_wp = wp.array1d(X_h, requires_grad=True, dtype=wp.float64, device=device)
    Y_wp = wp.array1d(Y_h, requires_grad=True, dtype=wp.float64, device=device)

    wp.launch_tiled(
        tile_math_cholesky, dim=[1, 1], inputs=[A_wp, D_wp, L_wp, X_wp, Y_wp], block_dim=TILE_DIM, device=device
    )
    # wait on the device under test rather than whichever device is current
    wp.synchronize_device(device)

    # separate checks, so a failure reports which result is wrong and by how
    # much; tolerances match the np.allclose defaults used previously
    np.testing.assert_allclose(Y_wp.numpy(), Y_np, rtol=1e-5, atol=1e-8)
    np.testing.assert_allclose(L_wp.numpy(), L_np, rtol=1e-5, atol=1e-8)

    # TODO: implement and test backward pass
165
+
166
+
117
167
  devices = get_cuda_test_devices()
118
168
 
119
169
 
@@ -124,6 +174,9 @@ class TestTileMathDx(unittest.TestCase):
124
174
 
125
175
  # check_output=False so we can enable libmathdx's logging without failing the tests
126
176
  add_function_test(TestTileMathDx, "test_tile_math_matmul", test_tile_math_matmul, devices=devices, check_output=False)
177
+ add_function_test(
178
+ TestTileMathDx, "test_tile_math_cholesky", test_tile_math_cholesky, devices=devices, check_output=False
179
+ )
127
180
  add_function_test(
128
181
  TestTileMathDx,
129
182
  "test_tile_math_fft_vec2f",
@@ -114,23 +114,23 @@ def test_multi_layer_nn(test, device):
114
114
  f = wp.tile(local)
115
115
 
116
116
  # input layer
117
- w0 = wp.tile_load(weights_0, 0, 0, m=DIM_HID, n=DIM_IN)
118
- b0 = wp.tile_load(bias_0, 0, 0, m=DIM_HID, n=1)
119
- z = wp.tile_map(relu, wp.tile_matmul(w0, f) + wp.tile_broadcast(b0, m=DIM_HID, n=NUM_THREADS))
117
+ w0 = wp.tile_load(weights_0, shape=(DIM_HID, DIM_IN))
118
+ b0 = wp.tile_load(bias_0, shape=(DIM_HID, 1))
119
+ z = wp.tile_map(relu, wp.tile_matmul(w0, f) + wp.tile_broadcast(b0, shape=(DIM_HID, NUM_THREADS)))
120
120
 
121
121
  # hidden layer
122
- w1 = wp.tile_load(weights_1, 0, 0, m=DIM_HID, n=DIM_HID)
123
- b1 = wp.tile_load(bias_1, 0, 0, m=DIM_HID, n=1)
124
- z = wp.tile_map(relu, wp.tile_matmul(w1, z) + wp.tile_broadcast(b1, m=DIM_HID, n=NUM_THREADS))
122
+ w1 = wp.tile_load(weights_1, shape=(DIM_HID, DIM_HID))
123
+ b1 = wp.tile_load(bias_1, shape=(DIM_HID, 1))
124
+ z = wp.tile_map(relu, wp.tile_matmul(w1, z) + wp.tile_broadcast(b1, shape=(DIM_HID, NUM_THREADS)))
125
125
 
126
- w2 = wp.tile_load(weights_2, 0, 0, m=DIM_HID, n=DIM_HID)
127
- b2 = wp.tile_load(bias_2, 0, 0, m=DIM_HID, n=1)
128
- z = wp.tile_map(relu, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, m=DIM_HID, n=NUM_THREADS))
126
+ w2 = wp.tile_load(weights_2, shape=(DIM_HID, DIM_HID))
127
+ b2 = wp.tile_load(bias_2, shape=(DIM_HID, 1))
128
+ z = wp.tile_map(relu, wp.tile_matmul(w2, z) + wp.tile_broadcast(b2, shape=(DIM_HID, NUM_THREADS)))
129
129
 
130
130
  # output layer
131
- w3 = wp.tile_load(weights_3, 0, 0, m=DIM_OUT, n=DIM_HID)
132
- b3 = wp.tile_load(bias_3, 0, 0, m=DIM_OUT, n=1)
133
- o = wp.tile_map(relu, wp.tile_matmul(w3, z) + wp.tile_broadcast(b3, m=DIM_OUT, n=NUM_THREADS))
131
+ w3 = wp.tile_load(weights_3, shape=(DIM_OUT, DIM_HID))
132
+ b3 = wp.tile_load(bias_3, shape=(DIM_OUT, 1))
133
+ o = wp.tile_map(relu, wp.tile_matmul(w3, z) + wp.tile_broadcast(b3, shape=(DIM_OUT, NUM_THREADS)))
134
134
 
135
135
  # untile back to SIMT
136
136
  output = wp.untile(o)
@@ -292,14 +292,14 @@ def test_single_layer_nn(test, device):
292
292
  ):
293
293
  i = wp.tid()
294
294
 
295
- f = wp.tile_load(input, 0, i, m=DIM_IN, n=NUM_THREADS)
295
+ f = wp.tile_load(input, shape=(DIM_IN, NUM_THREADS), offset=(0, i * NUM_THREADS))
296
296
 
297
- w = wp.tile_load(weights, 0, 0, DIM_OUT, DIM_IN)
298
- b = wp.tile_load(bias, 0, 0, m=DIM_OUT, n=1)
297
+ w = wp.tile_load(weights, shape=(DIM_OUT, DIM_IN))
298
+ b = wp.tile_load(bias, shape=(DIM_OUT, 1))
299
299
 
300
- o = wp.tile_map(relu, wp.tile_matmul(w, f) + wp.tile_broadcast(b, m=DIM_OUT, n=NUM_THREADS))
300
+ o = wp.tile_map(relu, wp.tile_matmul(w, f) + wp.tile_broadcast(b, shape=(DIM_OUT, NUM_THREADS)))
301
301
 
302
- wp.tile_store(out, 0, i, o)
302
+ wp.tile_store(out, o, offset=(0, i * NUM_THREADS))
303
303
 
304
304
  with wp.ScopedDevice(device):
305
305
  rng = np.random.default_rng(45)
@@ -28,13 +28,13 @@ def tile_sum_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float
28
28
  n = input.shape[1]
29
29
  count = int(n / TILE_DIM)
30
30
 
31
- s = wp.tile_zeros(m=1, n=1, dtype=float)
31
+ s = wp.tile_zeros(shape=1, dtype=float)
32
32
 
33
33
  for j in range(count):
34
- a = wp.tile_load(input, i, j, m=1, n=TILE_DIM)
34
+ a = wp.tile_load(input[i], shape=TILE_DIM, offset=j * TILE_DIM)
35
35
  s += wp.tile_sum(a) * 0.5
36
36
 
37
- wp.tile_store(output, i, s)
37
+ wp.tile_store(output, s, offset=i)
38
38
 
39
39
 
40
40
  def test_tile_reduce_sum(test, device):
@@ -70,10 +70,10 @@ def tile_min_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float
70
70
  # output tile index
71
71
  i = wp.tid()
72
72
 
73
- a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM)
73
+ a = wp.tile_load(input[i], shape=TILE_DIM)
74
74
  m = wp.tile_min(a)
75
75
 
76
- wp.tile_store(output, i, m)
76
+ wp.tile_store(output, m, offset=i)
77
77
 
78
78
 
79
79
  def test_tile_reduce_min(test, device):
@@ -103,10 +103,10 @@ def tile_max_kernel(input: wp.array2d(dtype=float), output: wp.array(dtype=float
103
103
  # output tile index
104
104
  i = wp.tid()
105
105
 
106
- a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM)
106
+ a = wp.tile_load(input[i], shape=TILE_DIM)
107
107
  m = wp.tile_max(a)
108
108
 
109
- wp.tile_store(output, i, m)
109
+ wp.tile_store(output, m, offset=i)
110
110
 
111
111
 
112
112
  def test_tile_reduce_max(test, device):
@@ -136,10 +136,10 @@ def tile_reduce_custom_kernel(input: wp.array2d(dtype=float), output: wp.array(d
136
136
  # output tile index
137
137
  i = wp.tid()
138
138
 
139
- a = wp.tile_load(input, i, 0, m=1, n=TILE_DIM)
139
+ a = wp.tile_load(input[i], shape=TILE_DIM)
140
140
  m = wp.tile_reduce(wp.mul, a)
141
141
 
142
- wp.tile_store(output, i, m)
142
+ wp.tile_store(output, m, offset=i)
143
143
 
144
144
 
145
145
  def test_tile_reduce_custom(test, device):
@@ -173,10 +173,10 @@ def tile_grouped_sum_kernel(input: wp.array3d(dtype=float), output: wp.array(dty
173
173
  # output tile index
174
174
  i = wp.tid()
175
175
 
176
- a = wp.tile_load(input[i], 0, 0, m=TILE_M, n=TILE_N)
176
+ a = wp.tile_load(input[i], shape=(TILE_M, TILE_N))
177
177
  s = wp.tile_sum(a) * 0.5
178
178
 
179
- wp.tile_store(output, i, s)
179
+ wp.tile_store(output, s, offset=i)
180
180
 
181
181
 
182
182
  def test_tile_reduce_grouped_sum(test, device):
@@ -217,7 +217,7 @@ def tile_reduce_simt_kernel(output: wp.array(dtype=int)):
217
217
  s = wp.tile_sum(t) # sum over block
218
218
 
219
219
  # update global sum
220
- wp.tile_atomic_add(output, 0, 0, s)
220
+ wp.tile_atomic_add(output, s)
221
221
 
222
222
 
223
223
  def test_tile_reduce_simt(test, device):
@@ -310,10 +310,10 @@ def test_tile_untile_vector(test, device):
310
310
  def tile_ones_kernel(out: wp.array(dtype=float)):
311
311
  i = wp.tid()
312
312
 
313
- t = wp.tile_ones(dtype=float, m=16, n=16)
313
+ t = wp.tile_ones(dtype=float, shape=(16, 16))
314
314
  s = wp.tile_sum(t)
315
315
 
316
- wp.tile_store(out, 0, s)
316
+ wp.tile_store(out, s)
317
317
 
318
318
 
319
319
  def test_tile_ones(test, device):
@@ -332,16 +332,20 @@ def tile_arange_kernel(out: wp.array2d(dtype=int)):
332
332
  a = wp.tile_arange(17, dtype=int)
333
333
  b = wp.tile_arange(5, 23, dtype=int)
334
334
  c = wp.tile_arange(0, 34, 2, dtype=int)
335
+ d = wp.tile_arange(-1, 16, dtype=int)
336
+ e = wp.tile_arange(17, 0, -1, dtype=int)
335
337
 
336
- wp.tile_store(out, 0, 0, a)
337
- wp.tile_store(out, 1, 0, b)
338
- wp.tile_store(out, 2, 0, c)
338
+ wp.tile_store(out[0], a)
339
+ wp.tile_store(out[1], b)
340
+ wp.tile_store(out[2], c)
341
+ wp.tile_store(out[3], d)
342
+ wp.tile_store(out[4], e)
339
343
 
340
344
 
341
345
  def test_tile_arange(test, device):
342
346
  N = 17
343
347
 
344
- output = wp.zeros(shape=(3, N), dtype=int, device=device)
348
+ output = wp.zeros(shape=(5, N), dtype=int, device=device)
345
349
 
346
350
  with wp.Tape() as tape:
347
351
  wp.launch_tiled(tile_arange_kernel, dim=[1], inputs=[output], block_dim=TILE_DIM, device=device)
@@ -349,6 +353,8 @@ def test_tile_arange(test, device):
349
353
  assert_np_equal(output.numpy()[0], np.arange(17))
350
354
  assert_np_equal(output.numpy()[1], np.arange(5, 22))
351
355
  assert_np_equal(output.numpy()[2], np.arange(0, 34, 2))
356
+ assert_np_equal(output.numpy()[3], np.arange(-1, 16))
357
+ assert_np_equal(output.numpy()[4], np.arange(17, 0, -1))
352
358
 
353
359
 
354
360
  devices = get_cuda_test_devices()