warp-lang 1.8.0-py3-none-macosx_10_13_universal2.whl → 1.9.0-py3-none-macosx_10_13_universal2.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release: this version of warp-lang might be problematic.

Files changed (153)
  1. warp/__init__.py +282 -103
  2. warp/__init__.pyi +482 -110
  3. warp/bin/libwarp-clang.dylib +0 -0
  4. warp/bin/libwarp.dylib +0 -0
  5. warp/build.py +93 -30
  6. warp/build_dll.py +48 -63
  7. warp/builtins.py +955 -137
  8. warp/codegen.py +327 -209
  9. warp/config.py +1 -1
  10. warp/context.py +1363 -800
  11. warp/examples/core/example_marching_cubes.py +1 -0
  12. warp/examples/core/example_render_opengl.py +100 -3
  13. warp/examples/fem/example_apic_fluid.py +98 -52
  14. warp/examples/fem/example_convection_diffusion_dg.py +25 -4
  15. warp/examples/fem/example_diffusion_mgpu.py +8 -3
  16. warp/examples/fem/utils.py +68 -22
  17. warp/examples/interop/example_jax_callable.py +34 -4
  18. warp/examples/interop/example_jax_kernel.py +27 -1
  19. warp/fabric.py +1 -1
  20. warp/fem/cache.py +27 -19
  21. warp/fem/domain.py +2 -2
  22. warp/fem/field/nodal_field.py +2 -2
  23. warp/fem/field/virtual.py +266 -166
  24. warp/fem/geometry/geometry.py +5 -5
  25. warp/fem/integrate.py +200 -91
  26. warp/fem/space/restriction.py +4 -0
  27. warp/fem/space/shape/tet_shape_function.py +3 -10
  28. warp/jax_experimental/custom_call.py +1 -1
  29. warp/jax_experimental/ffi.py +203 -54
  30. warp/marching_cubes.py +708 -0
  31. warp/native/array.h +103 -8
  32. warp/native/builtin.h +90 -9
  33. warp/native/bvh.cpp +64 -28
  34. warp/native/bvh.cu +58 -58
  35. warp/native/bvh.h +2 -2
  36. warp/native/clang/clang.cpp +7 -7
  37. warp/native/coloring.cpp +13 -3
  38. warp/native/crt.cpp +2 -2
  39. warp/native/crt.h +3 -5
  40. warp/native/cuda_util.cpp +42 -11
  41. warp/native/cuda_util.h +10 -4
  42. warp/native/exports.h +1842 -1908
  43. warp/native/fabric.h +2 -1
  44. warp/native/hashgrid.cpp +37 -37
  45. warp/native/hashgrid.cu +2 -2
  46. warp/native/initializer_array.h +1 -1
  47. warp/native/intersect.h +4 -4
  48. warp/native/mat.h +1913 -119
  49. warp/native/mathdx.cpp +43 -43
  50. warp/native/mesh.cpp +24 -24
  51. warp/native/mesh.cu +26 -26
  52. warp/native/mesh.h +5 -3
  53. warp/native/nanovdb/GridHandle.h +179 -12
  54. warp/native/nanovdb/HostBuffer.h +8 -7
  55. warp/native/nanovdb/NanoVDB.h +517 -895
  56. warp/native/nanovdb/NodeManager.h +323 -0
  57. warp/native/nanovdb/PNanoVDB.h +2 -2
  58. warp/native/quat.h +337 -16
  59. warp/native/rand.h +7 -7
  60. warp/native/range.h +7 -1
  61. warp/native/reduce.cpp +10 -10
  62. warp/native/reduce.cu +13 -14
  63. warp/native/runlength_encode.cpp +2 -2
  64. warp/native/runlength_encode.cu +5 -5
  65. warp/native/scan.cpp +3 -3
  66. warp/native/scan.cu +4 -4
  67. warp/native/sort.cpp +10 -10
  68. warp/native/sort.cu +22 -22
  69. warp/native/sparse.cpp +8 -8
  70. warp/native/sparse.cu +14 -14
  71. warp/native/spatial.h +366 -17
  72. warp/native/svd.h +23 -8
  73. warp/native/temp_buffer.h +2 -2
  74. warp/native/tile.h +303 -70
  75. warp/native/tile_radix_sort.h +5 -1
  76. warp/native/tile_reduce.h +16 -25
  77. warp/native/tuple.h +2 -2
  78. warp/native/vec.h +385 -18
  79. warp/native/volume.cpp +54 -54
  80. warp/native/volume.cu +1 -1
  81. warp/native/volume.h +2 -1
  82. warp/native/volume_builder.cu +30 -37
  83. warp/native/warp.cpp +150 -149
  84. warp/native/warp.cu +337 -193
  85. warp/native/warp.h +227 -226
  86. warp/optim/linear.py +736 -271
  87. warp/render/imgui_manager.py +289 -0
  88. warp/render/render_opengl.py +137 -57
  89. warp/render/render_usd.py +0 -1
  90. warp/sim/collide.py +1 -2
  91. warp/sim/graph_coloring.py +2 -2
  92. warp/sim/integrator_vbd.py +10 -2
  93. warp/sparse.py +559 -176
  94. warp/tape.py +2 -0
  95. warp/tests/aux_test_module_aot.py +7 -0
  96. warp/tests/cuda/test_async.py +3 -3
  97. warp/tests/cuda/test_conditional_captures.py +101 -0
  98. warp/tests/geometry/test_marching_cubes.py +233 -12
  99. warp/tests/sim/test_cloth.py +89 -6
  100. warp/tests/sim/test_coloring.py +82 -7
  101. warp/tests/test_array.py +56 -5
  102. warp/tests/test_assert.py +53 -0
  103. warp/tests/test_atomic_cas.py +127 -114
  104. warp/tests/test_codegen.py +3 -2
  105. warp/tests/test_context.py +8 -15
  106. warp/tests/test_enum.py +136 -0
  107. warp/tests/test_examples.py +2 -2
  108. warp/tests/test_fem.py +45 -2
  109. warp/tests/test_fixedarray.py +229 -0
  110. warp/tests/test_func.py +18 -15
  111. warp/tests/test_future_annotations.py +7 -5
  112. warp/tests/test_linear_solvers.py +30 -0
  113. warp/tests/test_map.py +1 -1
  114. warp/tests/test_mat.py +1540 -378
  115. warp/tests/test_mat_assign_copy.py +178 -0
  116. warp/tests/test_mat_constructors.py +574 -0
  117. warp/tests/test_module_aot.py +287 -0
  118. warp/tests/test_print.py +69 -0
  119. warp/tests/test_quat.py +162 -34
  120. warp/tests/test_quat_assign_copy.py +145 -0
  121. warp/tests/test_reload.py +2 -1
  122. warp/tests/test_sparse.py +103 -0
  123. warp/tests/test_spatial.py +140 -34
  124. warp/tests/test_spatial_assign_copy.py +160 -0
  125. warp/tests/test_static.py +48 -0
  126. warp/tests/test_struct.py +43 -3
  127. warp/tests/test_tape.py +38 -0
  128. warp/tests/test_types.py +0 -20
  129. warp/tests/test_vec.py +216 -441
  130. warp/tests/test_vec_assign_copy.py +143 -0
  131. warp/tests/test_vec_constructors.py +325 -0
  132. warp/tests/tile/test_tile.py +206 -152
  133. warp/tests/tile/test_tile_cholesky.py +605 -0
  134. warp/tests/tile/test_tile_load.py +169 -0
  135. warp/tests/tile/test_tile_mathdx.py +2 -558
  136. warp/tests/tile/test_tile_matmul.py +179 -0
  137. warp/tests/tile/test_tile_mlp.py +1 -1
  138. warp/tests/tile/test_tile_reduce.py +100 -11
  139. warp/tests/tile/test_tile_shared_memory.py +16 -16
  140. warp/tests/tile/test_tile_sort.py +59 -55
  141. warp/tests/unittest_suites.py +16 -0
  142. warp/tests/walkthrough_debug.py +1 -1
  143. warp/thirdparty/unittest_parallel.py +108 -9
  144. warp/types.py +554 -264
  145. warp/utils.py +68 -86
  146. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/METADATA +28 -65
  147. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/RECORD +150 -138
  148. warp/native/marching.cpp +0 -19
  149. warp/native/marching.cu +0 -514
  150. warp/native/marching.h +0 -19
  151. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/WHEEL +0 -0
  152. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/licenses/LICENSE.md +0 -0
  153. {warp_lang-1.8.0.dist-info → warp_lang-1.9.0.dist-info}/top_level.txt +0 -0
warp/tests/tile/test_tile_mathdx.py

@@ -21,7 +21,7 @@ import numpy as np
  import warp as wp
  from warp.tests.unittest_utils import *

- wp.init() # For wp.context.runtime.core.is_mathdx_enabled()
+ wp.init() # For wp.context.runtime.core.wp_is_mathdx_enabled()

  TILE_M = wp.constant(8)
  TILE_N = wp.constant(4)
@@ -45,7 +45,6 @@ def tile_math_matmul_kernel(
      wp.tile_store(gc, c, offset=(i * TILE_M, j * TILE_N))


- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
  def test_tile_math_matmul(test, device):
      rng = np.random.default_rng(42)

@@ -93,7 +92,7 @@ def tile_math_fft_kernel_vec2d(gx: wp.array2d(dtype=wp.vec2d), gy: wp.array2d(dt
      wp.tile_store(gy, xy)


- @unittest.skipUnless(wp.context.runtime.core.is_mathdx_enabled(), "Warp was not built with MathDx support")
+ @unittest.skipUnless(wp.context.runtime.core.wp_is_mathdx_enabled(), "Warp was not built with MathDx support")
  def test_tile_math_fft(test, device, wp_dtype):
      np_real_dtype = {wp.vec2f: np.float32, wp.vec2d: np.float64}[wp_dtype]
      np_cplx_dtype = {wp.vec2f: np.complex64, wp.vec2d: np.complex128}[wp_dtype]
@@ -124,503 +123,6 @@ def test_tile_math_fft(test, device, wp_dtype):
      # TODO: implement and test backward pass


- @wp.kernel()
- def tile_math_cholesky(
-     gA: wp.array2d(dtype=wp.float64),
-     gD: wp.array1d(dtype=wp.float64),
-     gL: wp.array2d(dtype=wp.float64),
-     gy: wp.array1d(dtype=wp.float64),
-     gx: wp.array1d(dtype=wp.float64),
- ):
-     i, j = wp.tid()
-     # Load A, D & y
-     a = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
-     d = wp.tile_load(gD, shape=TILE_M, storage="shared")
-     y = wp.tile_load(gy, shape=TILE_M, storage="shared")
-     # Ensure tile_diag_add() and tile_cholesky_solve() work with transposed matrices
-     a_t = wp.tile_transpose(a)
-     # Compute L st LL^T = A^T + diag(D)
-     b = wp.tile_diag_add(a_t, d)
-     l = wp.tile_cholesky(b)
-     # Solve for y in LL^T x = y
-     x = wp.tile_cholesky_solve(l, y)
-     # Store L & y
-     wp.tile_store(gL, l)
-     wp.tile_store(gx, x)
-
-
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
- def test_tile_math_cholesky(test, device):
-     A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
-     D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
-     L_h = np.zeros_like(A_h)
-     Y_h = np.arange(TILE_M, dtype=np.float64)
-     X_h = np.zeros_like(Y_h)
-
-     A_np = A_h.T + np.diag(D_h)
-     L_np = np.linalg.cholesky(A_np)
-     X_np = np.linalg.solve(A_np, Y_h)
-
-     A_wp = wp.array2d(A_h, requires_grad=True, dtype=wp.float64, device=device)
-     D_wp = wp.array2d(D_h, requires_grad=True, dtype=wp.float64, device=device)
-     L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
-     Y_wp = wp.array2d(Y_h, requires_grad=True, dtype=wp.float64, device=device)
-     X_wp = wp.array2d(X_h, requires_grad=True, dtype=wp.float64, device=device)
-
-     wp.launch_tiled(
-         tile_math_cholesky, dim=[1, 1], inputs=[A_wp, D_wp, L_wp, Y_wp, X_wp], block_dim=TILE_DIM, device=device
-     )
-     wp.synchronize_device(device)
-
-     np.testing.assert_allclose(X_wp.numpy(), X_np)
-     np.testing.assert_allclose(L_wp.numpy(), L_np)
-
-     # TODO: implement and test backward pass
-
-
- @wp.kernel()
- def tile_math_cholesky_multiple_rhs(
-     gA: wp.array2d(dtype=wp.float64),
-     gD: wp.array1d(dtype=wp.float64),
-     gL: wp.array2d(dtype=wp.float64),
-     gy: wp.array2d(dtype=wp.float64),
-     gx: wp.array2d(dtype=wp.float64),
-     gz: wp.array2d(dtype=wp.float64),
- ):
-     i, j = wp.tid()
-     # Load A, D & y
-     a = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
-     d = wp.tile_load(gD, shape=TILE_M, storage="shared")
-     y = wp.tile_load(gy, shape=(TILE_M, TILE_M), storage="shared")
-     # Ensure tile_diag_add() and tile_cholesky_solve() work with transposed matrices
-     a_t = wp.tile_transpose(a)
-     # Compute L st LL^T = A.T + diag(D)
-     b = wp.tile_diag_add(a_t, d)
-     l = wp.tile_cholesky(b)
-     # Solve for y in LL^T x = y.T
-     y_t = wp.tile_transpose(y)
-     x = wp.tile_cholesky_solve(l, y_t)
-     # Ensure matmul receives correct layout information
-     z = wp.tile_matmul(x, x)
-     # Store L & y
-     wp.tile_store(gL, l)
-     wp.tile_store(gx, x)
-     wp.tile_store(gz, z)
-
-
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
- def test_tile_math_cholesky_multiple_rhs(test, device):
-     A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
-     D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
-     L_h = np.zeros_like(A_h)
-     Y_h = np.arange((TILE_M, TILE_M), dtype=np.float64)
-     X_h = np.zeros_like(Y_h)
-     Z_h = np.zeros_like(Y_h)
-
-     A_np = A_h.T + np.diag(D_h)
-     L_np = np.linalg.cholesky(A_np)
-     X_np = np.linalg.solve(A_np, Y_h.T)
-     Z_np = X_np @ X_np
-
-     A_wp = wp.array2d(A_h, requires_grad=True, dtype=wp.float64, device=device)
-     D_wp = wp.array2d(D_h, requires_grad=True, dtype=wp.float64, device=device)
-     L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
-     Y_wp = wp.array2d(Y_h, requires_grad=True, dtype=wp.float64, device=device)
-     X_wp = wp.array2d(X_h, requires_grad=True, dtype=wp.float64, device=device)
-     Z_wp = wp.array2d(Z_h, requires_grad=True, dtype=wp.float64, device=device)
-
-     wp.launch_tiled(
-         tile_math_cholesky_multiple_rhs,
-         dim=[1, 1],
-         inputs=[A_wp, D_wp, L_wp, Y_wp, X_wp, Z_wp],
-         block_dim=TILE_DIM,
-         device=device,
-     )
-     wp.synchronize_device(device)
-
-     np.testing.assert_allclose(L_wp.numpy(), L_np)
-     np.testing.assert_allclose(X_wp.numpy(), X_np)
-     np.testing.assert_allclose(Z_wp.numpy(), Z_np)
-
-     # TODO: implement and test backward pass
-
-
- @wp.kernel
- def tile_math_forward_substitution(
-     gL: wp.array2d(dtype=wp.float64), gx: wp.array1d(dtype=wp.float64), gz: wp.array1d(dtype=wp.float64)
- ):
-     i, j = wp.tid()
-     # Load L & x
-     L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
-     x = wp.tile_load(gx, shape=TILE_M, storage="shared")
-     # Solve for z in Lz = x
-     # Transpose because we loaded an upper triangular matrix
-     z = wp.tile_lower_solve(wp.tile_transpose(L), x)
-     # Store z
-     wp.tile_store(gz, z)
-
-
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
- def test_tile_math_forward_substitution(test, device):
-     # Create test data
-     rng = np.random.default_rng(42)
-     L_h = np.triu(rng.random((TILE_M, TILE_M))) # Upper triangular matrix
-     x_h = rng.random(TILE_M)
-     z_h = np.zeros_like(x_h)
-
-     # Compute reference solution using numpy
-     z_np = np.linalg.solve(L_h.T, x_h)
-
-     # Create Warp arrays
-     L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
-     x_wp = wp.array1d(x_h, requires_grad=True, dtype=wp.float64, device=device)
-     z_wp = wp.array1d(z_h, requires_grad=True, dtype=wp.float64, device=device)
-
-     # Run kernel
-     wp.launch_tiled(
-         tile_math_forward_substitution, dim=[1, 1], inputs=[L_wp, x_wp, z_wp], block_dim=TILE_DIM, device=device
-     )
-     wp.synchronize_device(device)
-
-     # Verify results
-     np.testing.assert_allclose(z_wp.numpy(), z_np)
-
-     # TODO: implement and test backward pass
-
-
- @wp.kernel
- def tile_math_back_substitution(
-     gL: wp.array2d(dtype=wp.float64), gx: wp.array1d(dtype=wp.float64), gz: wp.array1d(dtype=wp.float64)
- ):
-     i, j = wp.tid()
-     # Load L & x
-     L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
-     x = wp.tile_load(gx, shape=TILE_M, storage="shared")
-     # Solve for z in L^T z = x
-     # Transpose because we loaded a lower triangular matrix
-     z = wp.tile_upper_solve(wp.tile_transpose(L), x)
-     # Store z
-     wp.tile_store(gz, z)
-
-
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
- def test_tile_math_back_substitution(test, device):
-     # Create test data
-     rng = np.random.default_rng(42)
-     L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
-     x_h = rng.random(TILE_M)
-     z_h = np.zeros_like(x_h)
-
-     # Compute reference solution using numpy
-     z_np = np.linalg.solve(L_h.T, x_h)
-
-     # Create Warp arrays
-     L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
-     x_wp = wp.array1d(x_h, requires_grad=True, dtype=wp.float64, device=device)
-     z_wp = wp.array1d(z_h, requires_grad=True, dtype=wp.float64, device=device)
-
-     # Run kernel
-     wp.launch_tiled(
-         tile_math_back_substitution, dim=[1, 1], inputs=[L_wp, x_wp, z_wp], block_dim=TILE_DIM, device=device
-     )
-     wp.synchronize_device(device)
-
-     # Verify results
-     np.testing.assert_allclose(z_wp.numpy(), z_np)
-
-     # TODO: implement and test backward pass
-
-
- @wp.kernel
- def tile_math_forward_substitution_multiple_rhs(
-     gL: wp.array2d(dtype=wp.float64),
-     gx: wp.array2d(dtype=wp.float64),
-     gz: wp.array2d(dtype=wp.float64),
-     gc: wp.array2d(dtype=wp.float64),
- ):
-     i, j = wp.tid()
-     # Load L & x
-     L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
-     x = wp.tile_load(gx, shape=(TILE_M, TILE_M), storage="shared")
-     # Solve for z in Lz = x.T
-     x_t = wp.tile_transpose(x)
-     z = wp.tile_lower_solve(L, x_t)
-     # Ensure matmul receives correct layout information
-     c = wp.tile_matmul(z, z)
-     # Store z and c
-     wp.tile_store(gz, z)
-     wp.tile_store(gc, c)
-
-
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
- def test_tile_math_forward_substitution_multiple_rhs(test, device):
-     # Create test data
-     rng = np.random.default_rng(42)
-     L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
-     x_h = rng.random((TILE_M, TILE_M)) # Multiple right-hand sides
-     z_h = np.zeros_like(x_h)
-     c_h = np.zeros_like(x_h)
-
-     # Compute reference solution using numpy
-     z_np = np.linalg.solve(L_h, x_h.T)
-     c_np = z_np @ z_np
-
-     # Create Warp arrays
-     L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
-     x_wp = wp.array2d(x_h, requires_grad=True, dtype=wp.float64, device=device)
-     z_wp = wp.array2d(z_h, requires_grad=True, dtype=wp.float64, device=device)
-     c_wp = wp.array2d(c_h, requires_grad=True, dtype=wp.float64, device=device)
-
-     # Run kernel
-     wp.launch_tiled(
-         tile_math_forward_substitution_multiple_rhs,
-         dim=[1, 1],
-         inputs=[L_wp, x_wp, z_wp, c_wp],
-         block_dim=TILE_DIM,
-         device=device,
-     )
-     wp.synchronize_device()
-
-     # Verify results
-     assert np.allclose(z_wp.numpy(), z_np)
-     assert np.allclose(c_wp.numpy(), c_np)
-
-     # TODO: implement and test backward pass
-
-
- @wp.kernel
- def tile_math_back_substitution_multiple_rhs(
-     gL: wp.array2d(dtype=wp.float64),
-     gx: wp.array2d(dtype=wp.float64),
-     gz: wp.array2d(dtype=wp.float64),
-     gc: wp.array2d(dtype=wp.float64),
- ):
-     i, j = wp.tid()
-     # Load L & x
-     L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
-     x = wp.tile_load(gx, shape=(TILE_M, TILE_M), storage="shared")
-     # Solve for z in L^T z = x.T
-     x_t = wp.tile_transpose(x)
-     z = wp.tile_upper_solve(wp.tile_transpose(L), x_t)
-     # Ensure matmul receives correct layout information
-     c = wp.tile_matmul(z, z)
-     # Store z and c
-     wp.tile_store(gz, z)
-     wp.tile_store(gc, c)
-
-
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
- def test_tile_math_back_substitution_multiple_rhs(test, device):
-     # Create test data
-     rng = np.random.default_rng(42)
-     L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
-     x_h = rng.random((TILE_M, TILE_M)) # Multiple right-hand sides
-     z_h = np.zeros_like(x_h)
-     c_h = np.zeros_like(x_h)
-
-     # Compute reference solution using numpy
-     z_np = np.linalg.solve(L_h.T, x_h.T)
-     c_np = z_np @ z_np
-
-     # Create Warp arrays
-     L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
-     x_wp = wp.array2d(x_h, requires_grad=True, dtype=wp.float64, device=device)
-     z_wp = wp.array2d(z_h, requires_grad=True, dtype=wp.float64, device=device)
-     c_wp = wp.array2d(c_h, requires_grad=True, dtype=wp.float64, device=device)
-
-     # Run kernel
-     wp.launch_tiled(
-         tile_math_back_substitution_multiple_rhs,
-         dim=[1, 1],
-         inputs=[L_wp, x_wp, z_wp, c_wp],
-         block_dim=TILE_DIM,
-         device=device,
-     )
-     wp.synchronize_device()
-
-     # Verify results
-     assert np.allclose(z_wp.numpy(), z_np)
-     assert np.allclose(c_wp.numpy(), c_np)
-
-     # TODO: implement and test backward pass
-
-
- # tests a complex composition of most libmathdx calls
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
- def test_tile_math_block_cholesky(test, device):
-     BLOCK_SIZE = wp.constant(TILE_M // 2)
-
-     @wp.kernel
-     def block_cholesky_kernel(
-         A: wp.array2d(dtype=float),
-         L: wp.array2d(dtype=float),
-     ):
-         """
-         Computes the Cholesky factorization of a symmetric positive definite matrix A in blocks.
-         It returns a lower-triangular matrix L such that A = L L^T.
-         """
-
-         # Process the matrix in blocks along its leading dimension.
-         for k in range(0, TILE_M, BLOCK_SIZE):
-             end = k + BLOCK_SIZE
-
-             # Load current diagonal block A[k:end, k:end]
-             # and update with contributions from previously computed blocks.
-             A_kk_tile = wp.tile_load(A, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, k), storage="shared")
-
-             for j in range(0, k, BLOCK_SIZE):
-                 L_block = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, j))
-                 L_block_T = wp.tile_transpose(L_block)
-                 L_L_T_block = wp.tile_matmul(L_block, L_block_T)
-                 A_kk_tile -= L_L_T_block
-
-             # Compute the Cholesky factorization for the block
-             # print(A_kk_tile)
-             L_kk_tile = wp.tile_cholesky(A_kk_tile)
-             wp.tile_store(L, L_kk_tile, offset=(k, k))
-
-             # Process the blocks below the current block
-             for i in range(end, TILE_M, BLOCK_SIZE):
-                 A_ik_tile = wp.tile_load(A, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, k), storage="shared")
-
-                 for j in range(0, k, BLOCK_SIZE):
-                     L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, j))
-                     L_2_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, j))
-                     L_T_tile = wp.tile_transpose(L_2_tile)
-                     L_L_T_tile = wp.tile_matmul(L_tile, L_T_tile)
-                     A_ik_tile -= L_L_T_tile
-
-                 A_ik_T_tile = wp.tile_transpose(A_ik_tile)
-                 sol_T_tile = wp.tile_lower_solve(L_kk_tile, A_ik_T_tile)
-                 sol_tile = wp.tile_transpose(sol_T_tile)
-
-                 wp.tile_store(L, sol_tile, offset=(i, k))
-
-     @wp.kernel
-     def block_cholesky_solve_kernel(
-         L: wp.array2d(dtype=float),
-         b: wp.array2d(dtype=float),
-         scratch: wp.array2d(dtype=float),
-         x: wp.array2d(dtype=float),
-     ):
-         """
-         Solves A x = b given the Cholesky factor L (A = L L^T) using
-         blocked forward and backward substitution.
-         """
-
-         # Forward substitution: solve L y = b
-         for i in range(0, TILE_M, BLOCK_SIZE):
-             i_end = i + BLOCK_SIZE
-             rhs_tile = wp.tile_load(b, shape=(BLOCK_SIZE, 1), offset=(i, 0))
-             for j in range(0, i, BLOCK_SIZE):
-                 L_block = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, j))
-                 y_block = wp.tile_load(scratch, shape=(BLOCK_SIZE, 1), offset=(j, 0))
-                 Ly_block = wp.tile_matmul(L_block, y_block)
-                 rhs_tile -= Ly_block
-             L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, i))
-             y_tile = wp.tile_lower_solve(L_tile, rhs_tile)
-             wp.tile_store(scratch, y_tile, offset=(i, 0))
-
-         # Backward substitution: solve L^T x = y
-         for i in range(TILE_M - BLOCK_SIZE, -1, -BLOCK_SIZE):
-             i_start = i
-             i_end = i_start + BLOCK_SIZE
-             rhs_tile = wp.tile_load(scratch, shape=(BLOCK_SIZE, 1), offset=(i_start, 0))
-             for j in range(i_end, TILE_M, BLOCK_SIZE):
-                 L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(j, i_start))
-                 L_T_tile = wp.tile_transpose(L_tile)
-                 x_tile = wp.tile_load(x, shape=(BLOCK_SIZE, 1), offset=(j, 0))
-                 L_T_x_tile = wp.tile_matmul(L_T_tile, x_tile)
-                 rhs_tile -= L_T_x_tile
-             L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i_start, i_start))
-             x_tile = wp.tile_upper_solve(wp.tile_transpose(L_tile), rhs_tile)
-             wp.tile_store(x, x_tile, offset=(i_start, 0))
-
-     # check block cholesky decomposition
-
-     rng = np.random.default_rng(42)
-
-     M = np.array(rng.random((TILE_M, TILE_M)), dtype=float)
-
-     A_np = M.T @ M + np.eye(TILE_M, TILE_M)
-     L_np = np.linalg.cholesky(A_np)
-
-     A_wp = wp.array2d(A_np, dtype=float, device=device)
-     L_wp = wp.zeros_like(A_wp)
-
-     wp.launch_tiled(block_cholesky_kernel, dim=1, inputs=[A_wp], outputs=[L_wp], block_dim=TILE_DIM, device=device)
-
-     # check block cholesky solve
-
-     assert_np_equal(L_wp.numpy(), L_np, tol=1e-6)
-
-     b_np = np.array(rng.random((TILE_M, 1)), dtype=float)
-     b_wp = wp.array(b_np, dtype=float, device=device)
-
-     scratch = wp.zeros_like(b_wp)
-
-     x_np = np.linalg.solve(L_np.T, np.linalg.solve(L_np, b_np))
-     x_wp = wp.zeros_like(b_wp)
-
-     wp.launch_tiled(
-         block_cholesky_solve_kernel,
-         dim=1,
-         inputs=[L_wp, b_wp, scratch],
-         outputs=[x_wp],
-         block_dim=TILE_DIM,
-         device=device,
-     )
-
-     assert_np_equal(x_wp.numpy(), x_np, tol=1e-6)
-
-
- @wp.kernel
- def test_tile_lower_solve(L: wp.array2d(dtype=float), y: wp.array(dtype=float), x: wp.array(dtype=float)):
-     L_tile = wp.tile_load(L, shape=(TILE_M, TILE_M))
-     y_tile = wp.tile_load(x, shape=(TILE_M,))
-     sol = wp.tile_lower_solve(L_tile, y_tile)
-     wp.tile_store(x, sol)
-
-
- @wp.kernel
- def test_tile_upper_solve(L: wp.array2d(dtype=float), y: wp.array(dtype=float), x: wp.array(dtype=float)):
-     L_tile = wp.tile_load(L, shape=(TILE_M, TILE_M))
-     y_tile = wp.tile_load(x, shape=(TILE_M,))
-     sol = wp.tile_upper_solve(L_tile, y_tile)
-     wp.tile_store(x, sol)
-
-
- @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
- def test_tile_math_singular_matrices(test, device):
-     rng = np.random.default_rng(42)
-     L_np = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
-     L_np[-1, -1] = 0.0 # Make it singular
-     y_np = rng.random(TILE_M)
-
-     L_wp = wp.array2d(L_np, dtype=float, device=device)
-     y_wp = wp.array(y_np, dtype=float, device=device)
-     x_wp = wp.zeros_like(y_wp)
-
-     wp.launch_tiled(
-         test_tile_lower_solve, dim=1, inputs=[L_wp, y_wp], outputs=[x_wp], block_dim=TILE_DIM, device=device
-     )
-
-     assert np.isnan(x_wp.numpy()).any()
-
-     L_np = np.triu(rng.random((TILE_M, TILE_M))) # Upper triangular matrix
-     L_np[-1, -1] = 0.0 # Make it singular
-
-     L_wp = wp.array2d(L_np, dtype=float, device=device)
-     y_wp = wp.array(y_np, dtype=float, device=device)
-     x_wp = wp.zeros_like(y_wp)
-
-     wp.launch_tiled(
-         test_tile_upper_solve, dim=1, inputs=[L_wp, y_wp], outputs=[x_wp], block_dim=TILE_DIM, device=device
-     )
-
-     assert np.isnan(x_wp.numpy()).any()
-
-
  all_devices = get_test_devices()
  cuda_devices = get_cuda_test_devices()

@@ -633,16 +135,6 @@ class TestTileMathDx(unittest.TestCase):
  add_function_test(
      TestTileMathDx, "test_tile_math_matmul", test_tile_math_matmul, devices=all_devices, check_output=False
  )
- add_function_test(
-     TestTileMathDx, "test_tile_math_cholesky", test_tile_math_cholesky, devices=all_devices, check_output=False
- )
- add_function_test(
-     TestTileMathDx,
-     "tile_math_cholesky_multiple_rhs",
-     tile_math_cholesky_multiple_rhs,
-     devices=all_devices,
-     check_output=False,
- )
  add_function_test(
      TestTileMathDx,
      "test_tile_math_fft_vec2f",
@@ -658,54 +150,6 @@ add_function_test(
      check_output=False,
  )

- add_function_test(
-     TestTileMathDx,
-     "test_tile_math_forward_substitution",
-     test_tile_math_forward_substitution,
-     devices=cuda_devices,
-     check_output=False,
- )
-
- add_function_test(
-     TestTileMathDx,
-     "test_tile_math_back_substitution",
-     test_tile_math_back_substitution,
-     devices=cuda_devices,
-     check_output=False,
- )
-
- add_function_test(
-     TestTileMathDx,
-     "test_tile_math_forward_substitution_multiple_rhs",
-     test_tile_math_forward_substitution_multiple_rhs,
-     devices=cuda_devices,
-     check_output=False,
- )
-
- add_function_test(
-     TestTileMathDx,
-     "test_tile_math_back_substitution_multiple_rhs",
-     test_tile_math_back_substitution_multiple_rhs,
-     devices=cuda_devices,
-     check_output=False,
- )
-
- add_function_test(
-     TestTileMathDx,
-     "test_tile_math_block_cholesky",
-     test_tile_math_block_cholesky,
-     devices=cuda_devices,
-     check_output=False,
- )
-
- add_function_test(
-     TestTileMathDx,
-     "test_tile_math_singular_matrices",
-     test_tile_math_singular_matrices,
-     devices=cuda_devices,
-     check_output=False,
- )
-

  if __name__ == "__main__":
      wp.clear_kernel_cache()
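
Note on the rename visible in the hunks above: the tests now call wp.context.runtime.core.wp_is_mathdx_enabled() where 1.8.0 used is_mathdx_enabled(). The snippet below is a minimal sketch, not part of the diff, of how downstream test code that must run against both 1.8.0 and 1.9.0 could probe for whichever binding is present; the helper name core_has_mathdx is hypothetical.

import warp as wp

wp.init()


def core_has_mathdx() -> bool:
    # 1.9.0 exposes wp_is_mathdx_enabled() on the native core bindings,
    # 1.8.0 exposed is_mathdx_enabled(); try the new name first, then fall back.
    core = wp.context.runtime.core
    fn = getattr(core, "wp_is_mathdx_enabled", None) or getattr(core, "is_mathdx_enabled", None)
    return bool(fn()) if fn is not None else False


print("MathDx support:", core_has_mathdx())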