warp-lang 1.8.1__py3-none-macosx_10_13_universal2.whl → 1.9.1__py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (141) hide show
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +1904 -114
  3. warp/bin/libwarp-clang.dylib +0 -0
  4. warp/bin/libwarp.dylib +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +331 -101
  7. warp/builtins.py +1244 -160
  8. warp/codegen.py +317 -206
  9. warp/config.py +1 -1
  10. warp/context.py +1465 -789
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_kernel.py +2 -1
  18. warp/fabric.py +1 -1
  19. warp/fem/cache.py +27 -19
  20. warp/fem/domain.py +2 -2
  21. warp/fem/field/nodal_field.py +2 -2
  22. warp/fem/field/virtual.py +264 -166
  23. warp/fem/geometry/geometry.py +5 -5
  24. warp/fem/integrate.py +129 -51
  25. warp/fem/space/restriction.py +4 -0
  26. warp/fem/space/shape/tet_shape_function.py +3 -10
  27. warp/jax_experimental/custom_call.py +25 -2
  28. warp/jax_experimental/ffi.py +22 -1
  29. warp/jax_experimental/xla_ffi.py +16 -7
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +99 -4
  32. warp/native/builtin.h +86 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +8 -2
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +41 -10
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +2 -2
  48. warp/native/mat.h +1910 -116
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +4 -2
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +331 -14
  59. warp/native/range.h +7 -1
  60. warp/native/reduce.cpp +10 -10
  61. warp/native/reduce.cu +13 -14
  62. warp/native/runlength_encode.cpp +2 -2
  63. warp/native/runlength_encode.cu +5 -5
  64. warp/native/scan.cpp +3 -3
  65. warp/native/scan.cu +4 -4
  66. warp/native/sort.cpp +10 -10
  67. warp/native/sort.cu +40 -31
  68. warp/native/sort.h +2 -0
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +13 -13
  71. warp/native/spatial.h +366 -17
  72. warp/native/temp_buffer.h +2 -2
  73. warp/native/tile.h +471 -82
  74. warp/native/vec.h +328 -14
  75. warp/native/volume.cpp +54 -54
  76. warp/native/volume.cu +1 -1
  77. warp/native/volume.h +2 -1
  78. warp/native/volume_builder.cu +30 -37
  79. warp/native/warp.cpp +150 -149
  80. warp/native/warp.cu +377 -216
  81. warp/native/warp.h +227 -226
  82. warp/optim/linear.py +736 -271
  83. warp/render/imgui_manager.py +289 -0
  84. warp/render/render_opengl.py +99 -18
  85. warp/render/render_usd.py +1 -0
  86. warp/sim/graph_coloring.py +2 -2
  87. warp/sparse.py +558 -175
  88. warp/tests/aux_test_module_aot.py +7 -0
  89. warp/tests/cuda/test_async.py +3 -3
  90. warp/tests/cuda/test_conditional_captures.py +101 -0
  91. warp/tests/geometry/test_hash_grid.py +38 -0
  92. warp/tests/geometry/test_marching_cubes.py +233 -12
  93. warp/tests/interop/test_jax.py +608 -28
  94. warp/tests/sim/test_coloring.py +6 -6
  95. warp/tests/test_array.py +58 -5
  96. warp/tests/test_codegen.py +4 -3
  97. warp/tests/test_context.py +8 -15
  98. warp/tests/test_enum.py +136 -0
  99. warp/tests/test_examples.py +2 -2
  100. warp/tests/test_fem.py +49 -6
  101. warp/tests/test_fixedarray.py +229 -0
  102. warp/tests/test_func.py +18 -15
  103. warp/tests/test_future_annotations.py +7 -5
  104. warp/tests/test_linear_solvers.py +30 -0
  105. warp/tests/test_map.py +15 -1
  106. warp/tests/test_mat.py +1518 -378
  107. warp/tests/test_mat_assign_copy.py +178 -0
  108. warp/tests/test_mat_constructors.py +574 -0
  109. warp/tests/test_module_aot.py +287 -0
  110. warp/tests/test_print.py +69 -0
  111. warp/tests/test_quat.py +140 -34
  112. warp/tests/test_quat_assign_copy.py +145 -0
  113. warp/tests/test_reload.py +2 -1
  114. warp/tests/test_sparse.py +71 -0
  115. warp/tests/test_spatial.py +140 -34
  116. warp/tests/test_spatial_assign_copy.py +160 -0
  117. warp/tests/test_struct.py +43 -3
  118. warp/tests/test_tuple.py +96 -0
  119. warp/tests/test_types.py +61 -20
  120. warp/tests/test_vec.py +179 -34
  121. warp/tests/test_vec_assign_copy.py +143 -0
  122. warp/tests/tile/test_tile.py +245 -18
  123. warp/tests/tile/test_tile_cholesky.py +605 -0
  124. warp/tests/tile/test_tile_load.py +169 -0
  125. warp/tests/tile/test_tile_mathdx.py +2 -558
  126. warp/tests/tile/test_tile_matmul.py +1 -1
  127. warp/tests/tile/test_tile_mlp.py +1 -1
  128. warp/tests/tile/test_tile_shared_memory.py +5 -5
  129. warp/tests/unittest_suites.py +6 -0
  130. warp/tests/walkthrough_debug.py +1 -1
  131. warp/thirdparty/unittest_parallel.py +108 -9
  132. warp/types.py +571 -267
  133. warp/utils.py +68 -86
  134. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/METADATA +29 -69
  135. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/RECORD +138 -128
  136. warp/native/marching.cpp +0 -19
  137. warp/native/marching.cu +0 -514
  138. warp/native/marching.h +0 -19
  139. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/WHEEL +0 -0
  140. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/licenses/LICENSE.md +0 -0
  141. {warp_lang-1.8.1.dist-info → warp_lang-1.9.1.dist-info}/top_level.txt +0 -0
@@ -21,7 +21,7 @@ import numpy as np
21
21
  import warp as wp
22
22
  from warp.tests.unittest_utils import *
23
23
 
24
- wp.init() # For wp.context.runtime.core.is_mathdx_enabled()
24
+ wp.init() # For wp.context.runtime.core.wp_is_mathdx_enabled()
25
25
 
26
26
  TILE_M = wp.constant(8)
27
27
  TILE_N = wp.constant(4)
@@ -45,7 +45,6 @@ def tile_math_matmul_kernel(
45
45
  wp.tile_store(gc, c, offset=(i * TILE_M, j * TILE_N))
46
46
 
47
47
 
48
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
49
48
  def test_tile_math_matmul(test, device):
50
49
  rng = np.random.default_rng(42)
51
50
 
@@ -93,7 +92,7 @@ def tile_math_fft_kernel_vec2d(gx: wp.array2d(dtype=wp.vec2d), gy: wp.array2d(dt
93
92
  wp.tile_store(gy, xy)
94
93
 
95
94
 
96
- @unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support")
95
+ @unittest.skipUnless(wp.context.runtime.core.wp_is_mathdx_enabled(), "Warp was not built with MathDx support")
97
96
  def test_tile_math_fft(test, device, wp_dtype):
98
97
  np_real_dtype = {wp.vec2f: np.float32, wp.vec2d: np.float64}[wp_dtype]
99
98
  np_cplx_dtype = {wp.vec2f: np.complex64, wp.vec2d: np.complex128}[wp_dtype]
@@ -124,503 +123,6 @@ def test_tile_math_fft(test, device, wp_dtype):
124
123
  # TODO: implement and test backward pass
125
124
 
126
125
 
127
- @wp.kernel()
128
- def tile_math_cholesky(
129
- gA: wp.array2d(dtype=wp.float64),
130
- gD: wp.array1d(dtype=wp.float64),
131
- gL: wp.array2d(dtype=wp.float64),
132
- gy: wp.array1d(dtype=wp.float64),
133
- gx: wp.array1d(dtype=wp.float64),
134
- ):
135
- i, j = wp.tid()
136
- # Load A, D & y
137
- a = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
138
- d = wp.tile_load(gD, shape=TILE_M, storage="shared")
139
- y = wp.tile_load(gy, shape=TILE_M, storage="shared")
140
- # Ensure tile_diag_add() and tile_cholesky_solve() work with transposed matrices
141
- a_t = wp.tile_transpose(a)
142
- # Compute L st LL^T = A^T + diag(D)
143
- b = wp.tile_diag_add(a_t, d)
144
- l = wp.tile_cholesky(b)
145
- # Solve for y in LL^T x = y
146
- x = wp.tile_cholesky_solve(l, y)
147
- # Store L & y
148
- wp.tile_store(gL, l)
149
- wp.tile_store(gx, x)
150
-
151
-
152
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
153
- def test_tile_math_cholesky(test, device):
154
- A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
155
- D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
156
- L_h = np.zeros_like(A_h)
157
- Y_h = np.arange(TILE_M, dtype=np.float64)
158
- X_h = np.zeros_like(Y_h)
159
-
160
- A_np = A_h.T + np.diag(D_h)
161
- L_np = np.linalg.cholesky(A_np)
162
- X_np = np.linalg.solve(A_np, Y_h)
163
-
164
- A_wp = wp.array2d(A_h, requires_grad=True, dtype=wp.float64, device=device)
165
- D_wp = wp.array2d(D_h, requires_grad=True, dtype=wp.float64, device=device)
166
- L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
167
- Y_wp = wp.array2d(Y_h, requires_grad=True, dtype=wp.float64, device=device)
168
- X_wp = wp.array2d(X_h, requires_grad=True, dtype=wp.float64, device=device)
169
-
170
- wp.launch_tiled(
171
- tile_math_cholesky, dim=[1, 1], inputs=[A_wp, D_wp, L_wp, Y_wp, X_wp], block_dim=TILE_DIM, device=device
172
- )
173
- wp.synchronize_device(device)
174
-
175
- np.testing.assert_allclose(X_wp.numpy(), X_np)
176
- np.testing.assert_allclose(L_wp.numpy(), L_np)
177
-
178
- # TODO: implement and test backward pass
179
-
180
-
181
- @wp.kernel()
182
- def tile_math_cholesky_multiple_rhs(
183
- gA: wp.array2d(dtype=wp.float64),
184
- gD: wp.array1d(dtype=wp.float64),
185
- gL: wp.array2d(dtype=wp.float64),
186
- gy: wp.array2d(dtype=wp.float64),
187
- gx: wp.array2d(dtype=wp.float64),
188
- gz: wp.array2d(dtype=wp.float64),
189
- ):
190
- i, j = wp.tid()
191
- # Load A, D & y
192
- a = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
193
- d = wp.tile_load(gD, shape=TILE_M, storage="shared")
194
- y = wp.tile_load(gy, shape=(TILE_M, TILE_M), storage="shared")
195
- # Ensure tile_diag_add() and tile_cholesky_solve() work with transposed matrices
196
- a_t = wp.tile_transpose(a)
197
- # Compute L st LL^T = A.T + diag(D)
198
- b = wp.tile_diag_add(a_t, d)
199
- l = wp.tile_cholesky(b)
200
- # Solve for y in LL^T x = y.T
201
- y_t = wp.tile_transpose(y)
202
- x = wp.tile_cholesky_solve(l, y_t)
203
- # Ensure matmul receives correct layout information
204
- z = wp.tile_matmul(x, x)
205
- # Store L & y
206
- wp.tile_store(gL, l)
207
- wp.tile_store(gx, x)
208
- wp.tile_store(gz, z)
209
-
210
-
211
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
212
- def test_tile_math_cholesky_multiple_rhs(test, device):
213
- A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
214
- D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
215
- L_h = np.zeros_like(A_h)
216
- Y_h = np.arange((TILE_M, TILE_M), dtype=np.float64)
217
- X_h = np.zeros_like(Y_h)
218
- Z_h = np.zeros_like(Y_h)
219
-
220
- A_np = A_h.T + np.diag(D_h)
221
- L_np = np.linalg.cholesky(A_np)
222
- X_np = np.linalg.solve(A_np, Y_h.T)
223
- Z_np = X_np @ X_np
224
-
225
- A_wp = wp.array2d(A_h, requires_grad=True, dtype=wp.float64, device=device)
226
- D_wp = wp.array2d(D_h, requires_grad=True, dtype=wp.float64, device=device)
227
- L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
228
- Y_wp = wp.array2d(Y_h, requires_grad=True, dtype=wp.float64, device=device)
229
- X_wp = wp.array2d(X_h, requires_grad=True, dtype=wp.float64, device=device)
230
- Z_wp = wp.array2d(Z_h, requires_grad=True, dtype=wp.float64, device=device)
231
-
232
- wp.launch_tiled(
233
- tile_math_cholesky_multiple_rhs,
234
- dim=[1, 1],
235
- inputs=[A_wp, D_wp, L_wp, Y_wp, X_wp, Z_wp],
236
- block_dim=TILE_DIM,
237
- device=device,
238
- )
239
- wp.synchronize_device(device)
240
-
241
- np.testing.assert_allclose(L_wp.numpy(), L_np)
242
- np.testing.assert_allclose(X_wp.numpy(), X_np)
243
- np.testing.assert_allclose(Z_wp.numpy(), Z_np)
244
-
245
- # TODO: implement and test backward pass
246
-
247
-
248
- @wp.kernel
249
- def tile_math_forward_substitution(
250
- gL: wp.array2d(dtype=wp.float64), gx: wp.array1d(dtype=wp.float64), gz: wp.array1d(dtype=wp.float64)
251
- ):
252
- i, j = wp.tid()
253
- # Load L & x
254
- L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
255
- x = wp.tile_load(gx, shape=TILE_M, storage="shared")
256
- # Solve for z in Lz = x
257
- # Transpose because we loaded an upper triangular matrix
258
- z = wp.tile_lower_solve(wp.tile_transpose(L), x)
259
- # Store z
260
- wp.tile_store(gz, z)
261
-
262
-
263
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
264
- def test_tile_math_forward_substitution(test, device):
265
- # Create test data
266
- rng = np.random.default_rng(42)
267
- L_h = np.triu(rng.random((TILE_M, TILE_M))) # Upper triangular matrix
268
- x_h = rng.random(TILE_M)
269
- z_h = np.zeros_like(x_h)
270
-
271
- # Compute reference solution using numpy
272
- z_np = np.linalg.solve(L_h.T, x_h)
273
-
274
- # Create Warp arrays
275
- L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
276
- x_wp = wp.array1d(x_h, requires_grad=True, dtype=wp.float64, device=device)
277
- z_wp = wp.array1d(z_h, requires_grad=True, dtype=wp.float64, device=device)
278
-
279
- # Run kernel
280
- wp.launch_tiled(
281
- tile_math_forward_substitution, dim=[1, 1], inputs=[L_wp, x_wp, z_wp], block_dim=TILE_DIM, device=device
282
- )
283
- wp.synchronize_device(device)
284
-
285
- # Verify results
286
- np.testing.assert_allclose(z_wp.numpy(), z_np)
287
-
288
- # TODO: implement and test backward pass
289
-
290
-
291
- @wp.kernel
292
- def tile_math_back_substitution(
293
- gL: wp.array2d(dtype=wp.float64), gx: wp.array1d(dtype=wp.float64), gz: wp.array1d(dtype=wp.float64)
294
- ):
295
- i, j = wp.tid()
296
- # Load L & x
297
- L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
298
- x = wp.tile_load(gx, shape=TILE_M, storage="shared")
299
- # Solve for z in L^T z = x
300
- # Transpose because we loaded a lower triangular matrix
301
- z = wp.tile_upper_solve(wp.tile_transpose(L), x)
302
- # Store z
303
- wp.tile_store(gz, z)
304
-
305
-
306
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
307
- def test_tile_math_back_substitution(test, device):
308
- # Create test data
309
- rng = np.random.default_rng(42)
310
- L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
311
- x_h = rng.random(TILE_M)
312
- z_h = np.zeros_like(x_h)
313
-
314
- # Compute reference solution using numpy
315
- z_np = np.linalg.solve(L_h.T, x_h)
316
-
317
- # Create Warp arrays
318
- L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
319
- x_wp = wp.array1d(x_h, requires_grad=True, dtype=wp.float64, device=device)
320
- z_wp = wp.array1d(z_h, requires_grad=True, dtype=wp.float64, device=device)
321
-
322
- # Run kernel
323
- wp.launch_tiled(
324
- tile_math_back_substitution, dim=[1, 1], inputs=[L_wp, x_wp, z_wp], block_dim=TILE_DIM, device=device
325
- )
326
- wp.synchronize_device(device)
327
-
328
- # Verify results
329
- np.testing.assert_allclose(z_wp.numpy(), z_np)
330
-
331
- # TODO: implement and test backward pass
332
-
333
-
334
- @wp.kernel
335
- def tile_math_forward_substitution_multiple_rhs(
336
- gL: wp.array2d(dtype=wp.float64),
337
- gx: wp.array2d(dtype=wp.float64),
338
- gz: wp.array2d(dtype=wp.float64),
339
- gc: wp.array2d(dtype=wp.float64),
340
- ):
341
- i, j = wp.tid()
342
- # Load L & x
343
- L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
344
- x = wp.tile_load(gx, shape=(TILE_M, TILE_M), storage="shared")
345
- # Solve for z in Lz = x.T
346
- x_t = wp.tile_transpose(x)
347
- z = wp.tile_lower_solve(L, x_t)
348
- # Ensure matmul receives correct layout information
349
- c = wp.tile_matmul(z, z)
350
- # Store z and c
351
- wp.tile_store(gz, z)
352
- wp.tile_store(gc, c)
353
-
354
-
355
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
356
- def test_tile_math_forward_substitution_multiple_rhs(test, device):
357
- # Create test data
358
- rng = np.random.default_rng(42)
359
- L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
360
- x_h = rng.random((TILE_M, TILE_M)) # Multiple right-hand sides
361
- z_h = np.zeros_like(x_h)
362
- c_h = np.zeros_like(x_h)
363
-
364
- # Compute reference solution using numpy
365
- z_np = np.linalg.solve(L_h, x_h.T)
366
- c_np = z_np @ z_np
367
-
368
- # Create Warp arrays
369
- L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
370
- x_wp = wp.array2d(x_h, requires_grad=True, dtype=wp.float64, device=device)
371
- z_wp = wp.array2d(z_h, requires_grad=True, dtype=wp.float64, device=device)
372
- c_wp = wp.array2d(c_h, requires_grad=True, dtype=wp.float64, device=device)
373
-
374
- # Run kernel
375
- wp.launch_tiled(
376
- tile_math_forward_substitution_multiple_rhs,
377
- dim=[1, 1],
378
- inputs=[L_wp, x_wp, z_wp, c_wp],
379
- block_dim=TILE_DIM,
380
- device=device,
381
- )
382
- wp.synchronize_device()
383
-
384
- # Verify results
385
- assert np.allclose(z_wp.numpy(), z_np)
386
- assert np.allclose(c_wp.numpy(), c_np)
387
-
388
- # TODO: implement and test backward pass
389
-
390
-
391
- @wp.kernel
392
- def tile_math_back_substitution_multiple_rhs(
393
- gL: wp.array2d(dtype=wp.float64),
394
- gx: wp.array2d(dtype=wp.float64),
395
- gz: wp.array2d(dtype=wp.float64),
396
- gc: wp.array2d(dtype=wp.float64),
397
- ):
398
- i, j = wp.tid()
399
- # Load L & x
400
- L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
401
- x = wp.tile_load(gx, shape=(TILE_M, TILE_M), storage="shared")
402
- # Solve for z in L^T z = x.T
403
- x_t = wp.tile_transpose(x)
404
- z = wp.tile_upper_solve(wp.tile_transpose(L), x_t)
405
- # Ensure matmul receives correct layout information
406
- c = wp.tile_matmul(z, z)
407
- # Store z and c
408
- wp.tile_store(gz, z)
409
- wp.tile_store(gc, c)
410
-
411
-
412
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
413
- def test_tile_math_back_substitution_multiple_rhs(test, device):
414
- # Create test data
415
- rng = np.random.default_rng(42)
416
- L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
417
- x_h = rng.random((TILE_M, TILE_M)) # Multiple right-hand sides
418
- z_h = np.zeros_like(x_h)
419
- c_h = np.zeros_like(x_h)
420
-
421
- # Compute reference solution using numpy
422
- z_np = np.linalg.solve(L_h.T, x_h.T)
423
- c_np = z_np @ z_np
424
-
425
- # Create Warp arrays
426
- L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
427
- x_wp = wp.array2d(x_h, requires_grad=True, dtype=wp.float64, device=device)
428
- z_wp = wp.array2d(z_h, requires_grad=True, dtype=wp.float64, device=device)
429
- c_wp = wp.array2d(c_h, requires_grad=True, dtype=wp.float64, device=device)
430
-
431
- # Run kernel
432
- wp.launch_tiled(
433
- tile_math_back_substitution_multiple_rhs,
434
- dim=[1, 1],
435
- inputs=[L_wp, x_wp, z_wp, c_wp],
436
- block_dim=TILE_DIM,
437
- device=device,
438
- )
439
- wp.synchronize_device()
440
-
441
- # Verify results
442
- assert np.allclose(z_wp.numpy(), z_np)
443
- assert np.allclose(c_wp.numpy(), c_np)
444
-
445
- # TODO: implement and test backward pass
446
-
447
-
448
- # tests a complex composition of most libmathdx calls
449
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
450
- def test_tile_math_block_cholesky(test, device):
451
- BLOCK_SIZE = wp.constant(TILE_M // 2)
452
-
453
- @wp.kernel(module="unique")
454
- def block_cholesky_kernel(
455
- A: wp.array2d(dtype=float),
456
- L: wp.array2d(dtype=float),
457
- ):
458
- """
459
- Computes the Cholesky factorization of a symmetric positive definite matrix A in blocks.
460
- It returns a lower-triangular matrix L such that A = L L^T.
461
- """
462
-
463
- # Process the matrix in blocks along its leading dimension.
464
- for k in range(0, TILE_M, BLOCK_SIZE):
465
- end = k + BLOCK_SIZE
466
-
467
- # Load current diagonal block A[k:end, k:end]
468
- # and update with contributions from previously computed blocks.
469
- A_kk_tile = wp.tile_load(A, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, k), storage="shared")
470
-
471
- for j in range(0, k, BLOCK_SIZE):
472
- L_block = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, j))
473
- L_block_T = wp.tile_transpose(L_block)
474
- L_L_T_block = wp.tile_matmul(L_block, L_block_T)
475
- A_kk_tile -= L_L_T_block
476
-
477
- # Compute the Cholesky factorization for the block
478
- # print(A_kk_tile)
479
- L_kk_tile = wp.tile_cholesky(A_kk_tile)
480
- wp.tile_store(L, L_kk_tile, offset=(k, k))
481
-
482
- # Process the blocks below the current block
483
- for i in range(end, TILE_M, BLOCK_SIZE):
484
- A_ik_tile = wp.tile_load(A, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, k), storage="shared")
485
-
486
- for j in range(0, k, BLOCK_SIZE):
487
- L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, j))
488
- L_2_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, j))
489
- L_T_tile = wp.tile_transpose(L_2_tile)
490
- L_L_T_tile = wp.tile_matmul(L_tile, L_T_tile)
491
- A_ik_tile -= L_L_T_tile
492
-
493
- A_ik_T_tile = wp.tile_transpose(A_ik_tile)
494
- sol_T_tile = wp.tile_lower_solve(L_kk_tile, A_ik_T_tile)
495
- sol_tile = wp.tile_transpose(sol_T_tile)
496
-
497
- wp.tile_store(L, sol_tile, offset=(i, k))
498
-
499
- @wp.kernel(module="unique")
500
- def block_cholesky_solve_kernel(
501
- L: wp.array2d(dtype=float),
502
- b: wp.array2d(dtype=float),
503
- scratch: wp.array2d(dtype=float),
504
- x: wp.array2d(dtype=float),
505
- ):
506
- """
507
- Solves A x = b given the Cholesky factor L (A = L L^T) using
508
- blocked forward and backward substitution.
509
- """
510
-
511
- # Forward substitution: solve L y = b
512
- for i in range(0, TILE_M, BLOCK_SIZE):
513
- i_end = i + BLOCK_SIZE
514
- rhs_tile = wp.tile_load(b, shape=(BLOCK_SIZE, 1), offset=(i, 0))
515
- for j in range(0, i, BLOCK_SIZE):
516
- L_block = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, j))
517
- y_block = wp.tile_load(scratch, shape=(BLOCK_SIZE, 1), offset=(j, 0))
518
- Ly_block = wp.tile_matmul(L_block, y_block)
519
- rhs_tile -= Ly_block
520
- L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, i))
521
- y_tile = wp.tile_lower_solve(L_tile, rhs_tile)
522
- wp.tile_store(scratch, y_tile, offset=(i, 0))
523
-
524
- # Backward substitution: solve L^T x = y
525
- for i in range(TILE_M - BLOCK_SIZE, -1, -BLOCK_SIZE):
526
- i_start = i
527
- i_end = i_start + BLOCK_SIZE
528
- rhs_tile = wp.tile_load(scratch, shape=(BLOCK_SIZE, 1), offset=(i_start, 0))
529
- for j in range(i_end, TILE_M, BLOCK_SIZE):
530
- L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(j, i_start))
531
- L_T_tile = wp.tile_transpose(L_tile)
532
- x_tile = wp.tile_load(x, shape=(BLOCK_SIZE, 1), offset=(j, 0))
533
- L_T_x_tile = wp.tile_matmul(L_T_tile, x_tile)
534
- rhs_tile -= L_T_x_tile
535
- L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i_start, i_start))
536
- x_tile = wp.tile_upper_solve(wp.tile_transpose(L_tile), rhs_tile)
537
- wp.tile_store(x, x_tile, offset=(i_start, 0))
538
-
539
- # check block cholesky decomposition
540
-
541
- rng = np.random.default_rng(42)
542
-
543
- M = np.array(rng.random((TILE_M, TILE_M)), dtype=float)
544
-
545
- A_np = M.T @ M + np.eye(TILE_M, TILE_M)
546
- L_np = np.linalg.cholesky(A_np)
547
-
548
- A_wp = wp.array2d(A_np, dtype=float, device=device)
549
- L_wp = wp.zeros_like(A_wp)
550
-
551
- wp.launch_tiled(block_cholesky_kernel, dim=1, inputs=[A_wp], outputs=[L_wp], block_dim=TILE_DIM, device=device)
552
-
553
- # check block cholesky solve
554
-
555
- assert_np_equal(L_wp.numpy(), L_np, tol=1e-6)
556
-
557
- b_np = np.array(rng.random((TILE_M, 1)), dtype=float)
558
- b_wp = wp.array(b_np, dtype=float, device=device)
559
-
560
- scratch = wp.zeros_like(b_wp)
561
-
562
- x_np = np.linalg.solve(L_np.T, np.linalg.solve(L_np, b_np))
563
- x_wp = wp.zeros_like(b_wp)
564
-
565
- wp.launch_tiled(
566
- block_cholesky_solve_kernel,
567
- dim=1,
568
- inputs=[L_wp, b_wp, scratch],
569
- outputs=[x_wp],
570
- block_dim=TILE_DIM,
571
- device=device,
572
- )
573
-
574
- assert_np_equal(x_wp.numpy(), x_np, tol=1e-6)
575
-
576
-
577
- @wp.kernel
578
- def test_tile_lower_solve(L: wp.array2d(dtype=float), y: wp.array(dtype=float), x: wp.array(dtype=float)):
579
- L_tile = wp.tile_load(L, shape=(TILE_M, TILE_M))
580
- y_tile = wp.tile_load(x, shape=(TILE_M,))
581
- sol = wp.tile_lower_solve(L_tile, y_tile)
582
- wp.tile_store(x, sol)
583
-
584
-
585
- @wp.kernel
586
- def test_tile_upper_solve(L: wp.array2d(dtype=float), y: wp.array(dtype=float), x: wp.array(dtype=float)):
587
- L_tile = wp.tile_load(L, shape=(TILE_M, TILE_M))
588
- y_tile = wp.tile_load(x, shape=(TILE_M,))
589
- sol = wp.tile_upper_solve(L_tile, y_tile)
590
- wp.tile_store(x, sol)
591
-
592
-
593
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
594
- def test_tile_math_singular_matrices(test, device):
595
- rng = np.random.default_rng(42)
596
- L_np = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
597
- L_np[-1, -1] = 0.0 # Make it singular
598
- y_np = rng.random(TILE_M)
599
-
600
- L_wp = wp.array2d(L_np, dtype=float, device=device)
601
- y_wp = wp.array(y_np, dtype=float, device=device)
602
- x_wp = wp.zeros_like(y_wp)
603
-
604
- wp.launch_tiled(
605
- test_tile_lower_solve, dim=1, inputs=[L_wp, y_wp], outputs=[x_wp], block_dim=TILE_DIM, device=device
606
- )
607
-
608
- assert np.isnan(x_wp.numpy()).any()
609
-
610
- L_np = np.triu(rng.random((TILE_M, TILE_M))) # Upper triangular matrix
611
- L_np[-1, -1] = 0.0 # Make it singular
612
-
613
- L_wp = wp.array2d(L_np, dtype=float, device=device)
614
- y_wp = wp.array(y_np, dtype=float, device=device)
615
- x_wp = wp.zeros_like(y_wp)
616
-
617
- wp.launch_tiled(
618
- test_tile_upper_solve, dim=1, inputs=[L_wp, y_wp], outputs=[x_wp], block_dim=TILE_DIM, device=device
619
- )
620
-
621
- assert np.isnan(x_wp.numpy()).any()
622
-
623
-
624
126
  all_devices = get_test_devices()
625
127
  cuda_devices = get_cuda_test_devices()
626
128
 
@@ -633,16 +135,6 @@ class TestTileMathDx(unittest.TestCase):
633
135
  add_function_test(
634
136
  TestTileMathDx, "test_tile_math_matmul", test_tile_math_matmul, devices=all_devices, check_output=False
635
137
  )
636
- add_function_test(
637
- TestTileMathDx, "test_tile_math_cholesky", test_tile_math_cholesky, devices=all_devices, check_output=False
638
- )
639
- add_function_test(
640
- TestTileMathDx,
641
- "tile_math_cholesky_multiple_rhs",
642
- tile_math_cholesky_multiple_rhs,
643
- devices=all_devices,
644
- check_output=False,
645
- )
646
138
  add_function_test(
647
139
  TestTileMathDx,
648
140
  "test_tile_math_fft_vec2f",
@@ -658,54 +150,6 @@ add_function_test(
658
150
  check_output=False,
659
151
  )
660
152
 
661
- add_function_test(
662
- TestTileMathDx,
663
- "test_tile_math_forward_substitution",
664
- test_tile_math_forward_substitution,
665
- devices=cuda_devices,
666
- check_output=False,
667
- )
668
-
669
- add_function_test(
670
- TestTileMathDx,
671
- "test_tile_math_back_substitution",
672
- test_tile_math_back_substitution,
673
- devices=cuda_devices,
674
- check_output=False,
675
- )
676
-
677
- add_function_test(
678
- TestTileMathDx,
679
- "test_tile_math_forward_substitution_multiple_rhs",
680
- test_tile_math_forward_substitution_multiple_rhs,
681
- devices=cuda_devices,
682
- check_output=False,
683
- )
684
-
685
- add_function_test(
686
- TestTileMathDx,
687
- "test_tile_math_back_substitution_multiple_rhs",
688
- test_tile_math_back_substitution_multiple_rhs,
689
- devices=cuda_devices,
690
- check_output=False,
691
- )
692
-
693
- add_function_test(
694
- TestTileMathDx,
695
- "test_tile_math_block_cholesky",
696
- test_tile_math_block_cholesky,
697
- devices=cuda_devices,
698
- check_output=False,
699
- )
700
-
701
- add_function_test(
702
- TestTileMathDx,
703
- "test_tile_math_singular_matrices",
704
- test_tile_math_singular_matrices,
705
- devices=cuda_devices,
706
- check_output=False,
707
- )
708
-
709
153
 
710
154
  if __name__ == "__main__":
711
155
  wp.clear_kernel_cache()
@@ -159,7 +159,7 @@ def test_tile_transpose_matmul(test, device):
159
159
  test_tile_transpose_matmul_kernel, dim=[1], inputs=[input, output], block_dim=TILE_DIM, device=device
160
160
  )
161
161
 
162
- assert_np_equal(output.numpy(), input.numpy().T @ input.numpy())
162
+ assert_np_equal(output.numpy(), input.numpy().T @ input.numpy(), 1e-6)
163
163
 
164
164
 
165
165
  class TestTileMatmul(unittest.TestCase):
@@ -43,7 +43,7 @@ def create_array(rng, dim_in, dim_hid, dtype=float):
43
43
  def test_multi_layer_nn(test, device):
44
44
  import torch as tc
45
45
 
46
- if device.is_cuda and not wp.context.runtime.core.is_mathdx_enabled():
46
+ if device.is_cuda and not wp.context.runtime.core.wp_is_mathdx_enabled():
47
47
  test.skipTest("Skipping test on CUDA device without MathDx (tolerance)")
48
48
 
49
49
  NUM_FREQ = wp.constant(8)
@@ -110,13 +110,13 @@ def test_tile_shared_mem_graph(test, device):
110
110
 
111
111
  out = wp.empty((DIM_M, DIM_N), dtype=float, device=device)
112
112
 
113
- compute.module.load(device)
113
+ # preload the unique module
114
+ wp.load_module(compute.module, device=device, block_dim=BLOCK_DIM)
114
115
 
115
- wp.capture_begin(device, force_module_load=False)
116
- wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
117
- graph = wp.capture_end(device)
116
+ with wp.ScopedCapture(device, force_module_load=False) as capture:
117
+ wp.launch_tiled(compute, dim=[1], inputs=[out], block_dim=BLOCK_DIM, device=device)
118
118
 
119
- wp.capture_launch(graph)
119
+ wp.capture_launch(capture.graph)
120
120
 
121
121
  # check output
122
122
  assert_np_equal(out.numpy(), np.ones((DIM_M, DIM_N)) * 3.0)
@@ -164,6 +164,7 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader)
164
164
  from warp.tests.test_linear_solvers import TestLinearSolvers
165
165
  from warp.tests.test_lvalue import TestLValue
166
166
  from warp.tests.test_mat import TestMat
167
+ from warp.tests.test_mat_constructors import TestMatConstructors
167
168
  from warp.tests.test_mat_lite import TestMatLite
168
169
  from warp.tests.test_mat_scalar_ops import TestMatScalarOps
169
170
  from warp.tests.test_math import TestMath
@@ -198,6 +199,7 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader)
198
199
  from warp.tests.test_vec_scalar_ops import TestVecScalarOps
199
200
  from warp.tests.test_verify_fp import TestVerifyFP
200
201
  from warp.tests.tile.test_tile import TestTile
202
+ from warp.tests.tile.test_tile_cholesky import TestTileCholesky
201
203
  from warp.tests.tile.test_tile_load import TestTileLoad
202
204
  from warp.tests.tile.test_tile_mathdx import TestTileMathDx
203
205
  from warp.tests.tile.test_tile_matmul import TestTileMatmul
@@ -261,6 +263,7 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader)
261
263
  TestLValue,
262
264
  TestMarchingCubes,
263
265
  TestMat,
266
+ TestMatConstructors,
264
267
  TestMatLite,
265
268
  TestMatScalarOps,
266
269
  TestMath,
@@ -298,6 +301,7 @@ def default_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader)
298
301
  TestStruct,
299
302
  TestTape,
300
303
  TestTile,
304
+ TestTileCholesky,
301
305
  TestTileLoad,
302
306
  TestTileMathDx,
303
307
  TestTileMatmul,
@@ -360,6 +364,7 @@ def kit_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader):
360
364
  from warp.tests.test_lvalue import TestLValue
361
365
  from warp.tests.test_mat_lite import TestMatLite
362
366
  from warp.tests.test_math import TestMath
367
+ from warp.tests.test_module_aot import TestModuleAOT
363
368
  from warp.tests.test_module_hashing import TestModuleHashing
364
369
  from warp.tests.test_modules_lite import TestModuleLite
365
370
  from warp.tests.test_noise import TestNoise
@@ -406,6 +411,7 @@ def kit_suite(test_loader: unittest.TestLoader = unittest.defaultTestLoader):
406
411
  TestMeshQueryAABBMethods,
407
412
  TestMeshQueryPoint,
408
413
  TestMeshQueryRay,
414
+ TestModuleAOT,
409
415
  TestModuleHashing,
410
416
  TestModuleLite,
411
417
  TestNoise,
@@ -68,7 +68,7 @@ wp.init()
68
68
  wp.config.mode = "debug"
69
69
 
70
70
  # Make sure Warp was built with `build_lib.py --mode=debug`
71
- assert wp.context.runtime.core.is_debug_enabled(), "Warp must be built in debug mode to enable debugging kernels"
71
+ assert wp.context.runtime.core.wp_is_debug_enabled(), "Warp must be built in debug mode to enable debugging kernels"
72
72
 
73
73
 
74
74
  @wp.kernel