warp-lang 1.7.2rc1__py3-none-win_amd64.whl → 1.8.1__py3-none-win_amd64.whl

This diff shows the contents of publicly available package versions that have been released to one of the supported registries. It is provided for informational purposes only and reflects the changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. More details are available via the link on the original page.

Files changed (193)
  1. warp/__init__.py +3 -1
  2. warp/__init__.pyi +3489 -1
  3. warp/autograd.py +45 -122
  4. warp/bin/warp-clang.dll +0 -0
  5. warp/bin/warp.dll +0 -0
  6. warp/build.py +241 -252
  7. warp/build_dll.py +130 -26
  8. warp/builtins.py +1907 -384
  9. warp/codegen.py +272 -104
  10. warp/config.py +12 -1
  11. warp/constants.py +1 -1
  12. warp/context.py +770 -238
  13. warp/dlpack.py +1 -1
  14. warp/examples/benchmarks/benchmark_cloth.py +2 -2
  15. warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
  16. warp/examples/core/example_sample_mesh.py +1 -1
  17. warp/examples/core/example_spin_lock.py +93 -0
  18. warp/examples/core/example_work_queue.py +118 -0
  19. warp/examples/fem/example_adaptive_grid.py +5 -5
  20. warp/examples/fem/example_apic_fluid.py +1 -1
  21. warp/examples/fem/example_burgers.py +1 -1
  22. warp/examples/fem/example_convection_diffusion.py +9 -6
  23. warp/examples/fem/example_darcy_ls_optimization.py +489 -0
  24. warp/examples/fem/example_deformed_geometry.py +1 -1
  25. warp/examples/fem/example_diffusion.py +2 -2
  26. warp/examples/fem/example_diffusion_3d.py +1 -1
  27. warp/examples/fem/example_distortion_energy.py +1 -1
  28. warp/examples/fem/example_elastic_shape_optimization.py +387 -0
  29. warp/examples/fem/example_magnetostatics.py +5 -3
  30. warp/examples/fem/example_mixed_elasticity.py +5 -3
  31. warp/examples/fem/example_navier_stokes.py +11 -9
  32. warp/examples/fem/example_nonconforming_contact.py +5 -3
  33. warp/examples/fem/example_streamlines.py +8 -3
  34. warp/examples/fem/utils.py +9 -8
  35. warp/examples/interop/example_jax_callable.py +34 -4
  36. warp/examples/interop/example_jax_ffi_callback.py +2 -2
  37. warp/examples/interop/example_jax_kernel.py +27 -1
  38. warp/examples/optim/example_drone.py +1 -1
  39. warp/examples/sim/example_cloth.py +1 -1
  40. warp/examples/sim/example_cloth_self_contact.py +48 -54
  41. warp/examples/tile/example_tile_block_cholesky.py +502 -0
  42. warp/examples/tile/example_tile_cholesky.py +2 -1
  43. warp/examples/tile/example_tile_convolution.py +1 -1
  44. warp/examples/tile/example_tile_filtering.py +1 -1
  45. warp/examples/tile/example_tile_matmul.py +1 -1
  46. warp/examples/tile/example_tile_mlp.py +2 -0
  47. warp/fabric.py +7 -7
  48. warp/fem/__init__.py +5 -0
  49. warp/fem/adaptivity.py +1 -1
  50. warp/fem/cache.py +152 -63
  51. warp/fem/dirichlet.py +2 -2
  52. warp/fem/domain.py +136 -6
  53. warp/fem/field/field.py +141 -99
  54. warp/fem/field/nodal_field.py +85 -39
  55. warp/fem/field/virtual.py +99 -52
  56. warp/fem/geometry/adaptive_nanogrid.py +91 -86
  57. warp/fem/geometry/closest_point.py +13 -0
  58. warp/fem/geometry/deformed_geometry.py +102 -40
  59. warp/fem/geometry/element.py +56 -2
  60. warp/fem/geometry/geometry.py +323 -22
  61. warp/fem/geometry/grid_2d.py +157 -62
  62. warp/fem/geometry/grid_3d.py +116 -20
  63. warp/fem/geometry/hexmesh.py +86 -20
  64. warp/fem/geometry/nanogrid.py +166 -86
  65. warp/fem/geometry/partition.py +59 -25
  66. warp/fem/geometry/quadmesh.py +86 -135
  67. warp/fem/geometry/tetmesh.py +47 -119
  68. warp/fem/geometry/trimesh.py +77 -270
  69. warp/fem/integrate.py +181 -95
  70. warp/fem/linalg.py +25 -58
  71. warp/fem/operator.py +124 -27
  72. warp/fem/quadrature/pic_quadrature.py +36 -14
  73. warp/fem/quadrature/quadrature.py +40 -16
  74. warp/fem/space/__init__.py +1 -1
  75. warp/fem/space/basis_function_space.py +66 -46
  76. warp/fem/space/basis_space.py +17 -4
  77. warp/fem/space/dof_mapper.py +1 -1
  78. warp/fem/space/function_space.py +2 -2
  79. warp/fem/space/grid_2d_function_space.py +4 -1
  80. warp/fem/space/hexmesh_function_space.py +4 -2
  81. warp/fem/space/nanogrid_function_space.py +3 -1
  82. warp/fem/space/partition.py +11 -2
  83. warp/fem/space/quadmesh_function_space.py +4 -1
  84. warp/fem/space/restriction.py +5 -2
  85. warp/fem/space/shape/__init__.py +10 -8
  86. warp/fem/space/tetmesh_function_space.py +4 -1
  87. warp/fem/space/topology.py +52 -21
  88. warp/fem/space/trimesh_function_space.py +4 -1
  89. warp/fem/utils.py +53 -8
  90. warp/jax.py +1 -2
  91. warp/jax_experimental/ffi.py +210 -67
  92. warp/jax_experimental/xla_ffi.py +37 -24
  93. warp/math.py +171 -1
  94. warp/native/array.h +103 -4
  95. warp/native/builtin.h +182 -35
  96. warp/native/coloring.cpp +6 -2
  97. warp/native/cuda_util.cpp +1 -1
  98. warp/native/exports.h +118 -63
  99. warp/native/intersect.h +5 -5
  100. warp/native/mat.h +8 -13
  101. warp/native/mathdx.cpp +11 -5
  102. warp/native/matnn.h +1 -123
  103. warp/native/mesh.h +1 -1
  104. warp/native/quat.h +34 -6
  105. warp/native/rand.h +7 -7
  106. warp/native/sparse.cpp +121 -258
  107. warp/native/sparse.cu +181 -274
  108. warp/native/spatial.h +305 -17
  109. warp/native/svd.h +23 -8
  110. warp/native/tile.h +603 -73
  111. warp/native/tile_radix_sort.h +1112 -0
  112. warp/native/tile_reduce.h +239 -13
  113. warp/native/tile_scan.h +240 -0
  114. warp/native/tuple.h +189 -0
  115. warp/native/vec.h +10 -20
  116. warp/native/warp.cpp +36 -4
  117. warp/native/warp.cu +588 -52
  118. warp/native/warp.h +47 -74
  119. warp/optim/linear.py +5 -1
  120. warp/paddle.py +7 -8
  121. warp/py.typed +0 -0
  122. warp/render/render_opengl.py +110 -80
  123. warp/render/render_usd.py +124 -62
  124. warp/sim/__init__.py +9 -0
  125. warp/sim/collide.py +253 -80
  126. warp/sim/graph_coloring.py +8 -1
  127. warp/sim/import_mjcf.py +4 -3
  128. warp/sim/import_usd.py +11 -7
  129. warp/sim/integrator.py +5 -2
  130. warp/sim/integrator_euler.py +1 -1
  131. warp/sim/integrator_featherstone.py +1 -1
  132. warp/sim/integrator_vbd.py +761 -322
  133. warp/sim/integrator_xpbd.py +1 -1
  134. warp/sim/model.py +265 -260
  135. warp/sim/utils.py +10 -7
  136. warp/sparse.py +303 -166
  137. warp/tape.py +54 -51
  138. warp/tests/cuda/test_conditional_captures.py +1046 -0
  139. warp/tests/cuda/test_streams.py +1 -1
  140. warp/tests/geometry/test_volume.py +2 -2
  141. warp/tests/interop/test_dlpack.py +9 -9
  142. warp/tests/interop/test_jax.py +0 -1
  143. warp/tests/run_coverage_serial.py +1 -1
  144. warp/tests/sim/disabled_kinematics.py +2 -2
  145. warp/tests/sim/{test_vbd.py → test_cloth.py} +378 -112
  146. warp/tests/sim/test_collision.py +159 -51
  147. warp/tests/sim/test_coloring.py +91 -2
  148. warp/tests/test_array.py +254 -2
  149. warp/tests/test_array_reduce.py +2 -2
  150. warp/tests/test_assert.py +53 -0
  151. warp/tests/test_atomic_cas.py +312 -0
  152. warp/tests/test_codegen.py +142 -19
  153. warp/tests/test_conditional.py +47 -1
  154. warp/tests/test_ctypes.py +0 -20
  155. warp/tests/test_devices.py +8 -0
  156. warp/tests/test_fabricarray.py +4 -2
  157. warp/tests/test_fem.py +58 -25
  158. warp/tests/test_func.py +42 -1
  159. warp/tests/test_grad.py +1 -1
  160. warp/tests/test_lerp.py +1 -3
  161. warp/tests/test_map.py +481 -0
  162. warp/tests/test_mat.py +23 -24
  163. warp/tests/test_quat.py +28 -15
  164. warp/tests/test_rounding.py +10 -38
  165. warp/tests/test_runlength_encode.py +7 -7
  166. warp/tests/test_smoothstep.py +1 -1
  167. warp/tests/test_sparse.py +83 -2
  168. warp/tests/test_spatial.py +507 -1
  169. warp/tests/test_static.py +48 -0
  170. warp/tests/test_struct.py +2 -2
  171. warp/tests/test_tape.py +38 -0
  172. warp/tests/test_tuple.py +265 -0
  173. warp/tests/test_types.py +2 -2
  174. warp/tests/test_utils.py +24 -18
  175. warp/tests/test_vec.py +38 -408
  176. warp/tests/test_vec_constructors.py +325 -0
  177. warp/tests/tile/test_tile.py +438 -131
  178. warp/tests/tile/test_tile_mathdx.py +518 -14
  179. warp/tests/tile/test_tile_matmul.py +179 -0
  180. warp/tests/tile/test_tile_reduce.py +307 -5
  181. warp/tests/tile/test_tile_shared_memory.py +136 -7
  182. warp/tests/tile/test_tile_sort.py +121 -0
  183. warp/tests/unittest_suites.py +14 -6
  184. warp/types.py +462 -308
  185. warp/utils.py +647 -86
  186. {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/METADATA +20 -6
  187. {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/RECORD +190 -176
  188. warp/stubs.py +0 -3381
  189. warp/tests/sim/test_xpbd.py +0 -399
  190. warp/tests/test_mlp.py +0 -282
  191. {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/WHEEL +0 -0
  192. {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/licenses/LICENSE.md +0 -0
  193. {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/top_level.txt +0 -0
@@ -45,6 +45,7 @@ def tile_math_matmul_kernel(
45
45
  wp.tile_store(gc, c, offset=(i * TILE_M, j * TILE_N))
46
46
 
47
47
 
48
+ @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
48
49
  def test_tile_math_matmul(test, device):
49
50
  rng = np.random.default_rng(42)
50
51
 
@@ -128,51 +129,498 @@ def tile_math_cholesky(
128
129
  gA: wp.array2d(dtype=wp.float64),
129
130
  gD: wp.array1d(dtype=wp.float64),
130
131
  gL: wp.array2d(dtype=wp.float64),
131
- gx: wp.array1d(dtype=wp.float64),
132
132
  gy: wp.array1d(dtype=wp.float64),
133
+ gx: wp.array1d(dtype=wp.float64),
133
134
  ):
134
135
  i, j = wp.tid()
135
- # Load A, D & x
136
+ # Load A, D & y
136
137
  a = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
137
138
  d = wp.tile_load(gD, shape=TILE_M, storage="shared")
138
- x = wp.tile_load(gx, shape=TILE_M, storage="shared")
139
- # Compute L st LL^T = A + diag(D)
140
- b = wp.tile_diag_add(a, d)
139
+ y = wp.tile_load(gy, shape=TILE_M, storage="shared")
140
+ # Ensure tile_diag_add() and tile_cholesky_solve() work with transposed matrices
141
+ a_t = wp.tile_transpose(a)
142
+ # Compute L st LL^T = A^T + diag(D)
143
+ b = wp.tile_diag_add(a_t, d)
141
144
  l = wp.tile_cholesky(b)
142
- # Solve for y in LL^T y = x
143
- y = wp.tile_cholesky_solve(l, x)
145
+ # Solve for y in LL^T x = y
146
+ x = wp.tile_cholesky_solve(l, y)
144
147
  # Store L & y
145
148
  wp.tile_store(gL, l)
146
- wp.tile_store(gy, y)
149
+ wp.tile_store(gx, x)
147
150
 
148
151
 
152
+ @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
149
153
  def test_tile_math_cholesky(test, device):
150
154
  A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
151
155
  D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
152
156
  L_h = np.zeros_like(A_h)
153
- X_h = np.arange(TILE_M, dtype=np.float64)
154
- Y_h = np.zeros_like(X_h)
157
+ Y_h = np.arange(TILE_M, dtype=np.float64)
158
+ X_h = np.zeros_like(Y_h)
155
159
 
156
- A_np = A_h + np.diag(D_h)
160
+ A_np = A_h.T + np.diag(D_h)
157
161
  L_np = np.linalg.cholesky(A_np)
158
- Y_np = np.linalg.solve(A_np, X_h)
162
+ X_np = np.linalg.solve(A_np, Y_h)
159
163
 
160
164
  A_wp = wp.array2d(A_h, requires_grad=True, dtype=wp.float64, device=device)
161
165
  D_wp = wp.array2d(D_h, requires_grad=True, dtype=wp.float64, device=device)
162
166
  L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
167
+ Y_wp = wp.array2d(Y_h, requires_grad=True, dtype=wp.float64, device=device)
163
168
  X_wp = wp.array2d(X_h, requires_grad=True, dtype=wp.float64, device=device)
169
+
170
+ wp.launch_tiled(
171
+ tile_math_cholesky, dim=[1, 1], inputs=[A_wp, D_wp, L_wp, Y_wp, X_wp], block_dim=TILE_DIM, device=device
172
+ )
173
+ wp.synchronize_device(device)
174
+
175
+ np.testing.assert_allclose(X_wp.numpy(), X_np)
176
+ np.testing.assert_allclose(L_wp.numpy(), L_np)
177
+
178
+ # TODO: implement and test backward pass
179
+
180
+
181
+ @wp.kernel()
182
+ def tile_math_cholesky_multiple_rhs(
183
+ gA: wp.array2d(dtype=wp.float64),
184
+ gD: wp.array1d(dtype=wp.float64),
185
+ gL: wp.array2d(dtype=wp.float64),
186
+ gy: wp.array2d(dtype=wp.float64),
187
+ gx: wp.array2d(dtype=wp.float64),
188
+ gz: wp.array2d(dtype=wp.float64),
189
+ ):
190
+ i, j = wp.tid()
191
+ # Load A, D & y
192
+ a = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
193
+ d = wp.tile_load(gD, shape=TILE_M, storage="shared")
194
+ y = wp.tile_load(gy, shape=(TILE_M, TILE_M), storage="shared")
195
+ # Ensure tile_diag_add() and tile_cholesky_solve() work with transposed matrices
196
+ a_t = wp.tile_transpose(a)
197
+ # Compute L st LL^T = A.T + diag(D)
198
+ b = wp.tile_diag_add(a_t, d)
199
+ l = wp.tile_cholesky(b)
200
+ # Solve for y in LL^T x = y.T
201
+ y_t = wp.tile_transpose(y)
202
+ x = wp.tile_cholesky_solve(l, y_t)
203
+ # Ensure matmul receives correct layout information
204
+ z = wp.tile_matmul(x, x)
205
+ # Store L & y
206
+ wp.tile_store(gL, l)
207
+ wp.tile_store(gx, x)
208
+ wp.tile_store(gz, z)
209
+
210
+
211
+ @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
212
+ def test_tile_math_cholesky_multiple_rhs(test, device):
213
+ A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
214
+ D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
215
+ L_h = np.zeros_like(A_h)
216
+ Y_h = np.arange((TILE_M, TILE_M), dtype=np.float64)
217
+ X_h = np.zeros_like(Y_h)
218
+ Z_h = np.zeros_like(Y_h)
219
+
220
+ A_np = A_h.T + np.diag(D_h)
221
+ L_np = np.linalg.cholesky(A_np)
222
+ X_np = np.linalg.solve(A_np, Y_h.T)
223
+ Z_np = X_np @ X_np
224
+
225
+ A_wp = wp.array2d(A_h, requires_grad=True, dtype=wp.float64, device=device)
226
+ D_wp = wp.array2d(D_h, requires_grad=True, dtype=wp.float64, device=device)
227
+ L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
164
228
  Y_wp = wp.array2d(Y_h, requires_grad=True, dtype=wp.float64, device=device)
229
+ X_wp = wp.array2d(X_h, requires_grad=True, dtype=wp.float64, device=device)
230
+ Z_wp = wp.array2d(Z_h, requires_grad=True, dtype=wp.float64, device=device)
165
231
 
166
232
  wp.launch_tiled(
167
- tile_math_cholesky, dim=[1, 1], inputs=[A_wp, D_wp, L_wp, X_wp, Y_wp], block_dim=TILE_DIM, device=device
233
+ tile_math_cholesky_multiple_rhs,
234
+ dim=[1, 1],
235
+ inputs=[A_wp, D_wp, L_wp, Y_wp, X_wp, Z_wp],
236
+ block_dim=TILE_DIM,
237
+ device=device,
238
+ )
239
+ wp.synchronize_device(device)
240
+
241
+ np.testing.assert_allclose(L_wp.numpy(), L_np)
242
+ np.testing.assert_allclose(X_wp.numpy(), X_np)
243
+ np.testing.assert_allclose(Z_wp.numpy(), Z_np)
244
+
245
+ # TODO: implement and test backward pass
246
+
247
+
248
+ @wp.kernel
249
+ def tile_math_forward_substitution(
250
+ gL: wp.array2d(dtype=wp.float64), gx: wp.array1d(dtype=wp.float64), gz: wp.array1d(dtype=wp.float64)
251
+ ):
252
+ i, j = wp.tid()
253
+ # Load L & x
254
+ L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
255
+ x = wp.tile_load(gx, shape=TILE_M, storage="shared")
256
+ # Solve for z in Lz = x
257
+ # Transpose because we loaded an upper triangular matrix
258
+ z = wp.tile_lower_solve(wp.tile_transpose(L), x)
259
+ # Store z
260
+ wp.tile_store(gz, z)
261
+
262
+
263
+ @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
264
+ def test_tile_math_forward_substitution(test, device):
265
+ # Create test data
266
+ rng = np.random.default_rng(42)
267
+ L_h = np.triu(rng.random((TILE_M, TILE_M))) # Upper triangular matrix
268
+ x_h = rng.random(TILE_M)
269
+ z_h = np.zeros_like(x_h)
270
+
271
+ # Compute reference solution using numpy
272
+ z_np = np.linalg.solve(L_h.T, x_h)
273
+
274
+ # Create Warp arrays
275
+ L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
276
+ x_wp = wp.array1d(x_h, requires_grad=True, dtype=wp.float64, device=device)
277
+ z_wp = wp.array1d(z_h, requires_grad=True, dtype=wp.float64, device=device)
278
+
279
+ # Run kernel
280
+ wp.launch_tiled(
281
+ tile_math_forward_substitution, dim=[1, 1], inputs=[L_wp, x_wp, z_wp], block_dim=TILE_DIM, device=device
282
+ )
283
+ wp.synchronize_device(device)
284
+
285
+ # Verify results
286
+ np.testing.assert_allclose(z_wp.numpy(), z_np)
287
+
288
+ # TODO: implement and test backward pass
289
+
290
+
291
+ @wp.kernel
292
+ def tile_math_back_substitution(
293
+ gL: wp.array2d(dtype=wp.float64), gx: wp.array1d(dtype=wp.float64), gz: wp.array1d(dtype=wp.float64)
294
+ ):
295
+ i, j = wp.tid()
296
+ # Load L & x
297
+ L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
298
+ x = wp.tile_load(gx, shape=TILE_M, storage="shared")
299
+ # Solve for z in L^T z = x
300
+ # Transpose because we loaded a lower triangular matrix
301
+ z = wp.tile_upper_solve(wp.tile_transpose(L), x)
302
+ # Store z
303
+ wp.tile_store(gz, z)
304
+
305
+
306
+ @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
307
+ def test_tile_math_back_substitution(test, device):
308
+ # Create test data
309
+ rng = np.random.default_rng(42)
310
+ L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
311
+ x_h = rng.random(TILE_M)
312
+ z_h = np.zeros_like(x_h)
313
+
314
+ # Compute reference solution using numpy
315
+ z_np = np.linalg.solve(L_h.T, x_h)
316
+
317
+ # Create Warp arrays
318
+ L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
319
+ x_wp = wp.array1d(x_h, requires_grad=True, dtype=wp.float64, device=device)
320
+ z_wp = wp.array1d(z_h, requires_grad=True, dtype=wp.float64, device=device)
321
+
322
+ # Run kernel
323
+ wp.launch_tiled(
324
+ tile_math_back_substitution, dim=[1, 1], inputs=[L_wp, x_wp, z_wp], block_dim=TILE_DIM, device=device
325
+ )
326
+ wp.synchronize_device(device)
327
+
328
+ # Verify results
329
+ np.testing.assert_allclose(z_wp.numpy(), z_np)
330
+
331
+ # TODO: implement and test backward pass
332
+
333
+
334
+ @wp.kernel
335
+ def tile_math_forward_substitution_multiple_rhs(
336
+ gL: wp.array2d(dtype=wp.float64),
337
+ gx: wp.array2d(dtype=wp.float64),
338
+ gz: wp.array2d(dtype=wp.float64),
339
+ gc: wp.array2d(dtype=wp.float64),
340
+ ):
341
+ i, j = wp.tid()
342
+ # Load L & x
343
+ L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
344
+ x = wp.tile_load(gx, shape=(TILE_M, TILE_M), storage="shared")
345
+ # Solve for z in Lz = x.T
346
+ x_t = wp.tile_transpose(x)
347
+ z = wp.tile_lower_solve(L, x_t)
348
+ # Ensure matmul receives correct layout information
349
+ c = wp.tile_matmul(z, z)
350
+ # Store z and c
351
+ wp.tile_store(gz, z)
352
+ wp.tile_store(gc, c)
353
+
354
+
355
+ @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
356
+ def test_tile_math_forward_substitution_multiple_rhs(test, device):
357
+ # Create test data
358
+ rng = np.random.default_rng(42)
359
+ L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
360
+ x_h = rng.random((TILE_M, TILE_M)) # Multiple right-hand sides
361
+ z_h = np.zeros_like(x_h)
362
+ c_h = np.zeros_like(x_h)
363
+
364
+ # Compute reference solution using numpy
365
+ z_np = np.linalg.solve(L_h, x_h.T)
366
+ c_np = z_np @ z_np
367
+
368
+ # Create Warp arrays
369
+ L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
370
+ x_wp = wp.array2d(x_h, requires_grad=True, dtype=wp.float64, device=device)
371
+ z_wp = wp.array2d(z_h, requires_grad=True, dtype=wp.float64, device=device)
372
+ c_wp = wp.array2d(c_h, requires_grad=True, dtype=wp.float64, device=device)
373
+
374
+ # Run kernel
375
+ wp.launch_tiled(
376
+ tile_math_forward_substitution_multiple_rhs,
377
+ dim=[1, 1],
378
+ inputs=[L_wp, x_wp, z_wp, c_wp],
379
+ block_dim=TILE_DIM,
380
+ device=device,
168
381
  )
169
382
  wp.synchronize_device()
170
383
 
171
- assert np.allclose(Y_wp.numpy(), Y_np) and np.allclose(L_wp.numpy(), L_np)
384
+ # Verify results
385
+ assert np.allclose(z_wp.numpy(), z_np)
386
+ assert np.allclose(c_wp.numpy(), c_np)
172
387
 
173
388
  # TODO: implement and test backward pass
174
389
 
175
390
 
391
+ @wp.kernel
392
+ def tile_math_back_substitution_multiple_rhs(
393
+ gL: wp.array2d(dtype=wp.float64),
394
+ gx: wp.array2d(dtype=wp.float64),
395
+ gz: wp.array2d(dtype=wp.float64),
396
+ gc: wp.array2d(dtype=wp.float64),
397
+ ):
398
+ i, j = wp.tid()
399
+ # Load L & x
400
+ L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
401
+ x = wp.tile_load(gx, shape=(TILE_M, TILE_M), storage="shared")
402
+ # Solve for z in L^T z = x.T
403
+ x_t = wp.tile_transpose(x)
404
+ z = wp.tile_upper_solve(wp.tile_transpose(L), x_t)
405
+ # Ensure matmul receives correct layout information
406
+ c = wp.tile_matmul(z, z)
407
+ # Store z and c
408
+ wp.tile_store(gz, z)
409
+ wp.tile_store(gc, c)
410
+
411
+
412
+ @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
413
+ def test_tile_math_back_substitution_multiple_rhs(test, device):
414
+ # Create test data
415
+ rng = np.random.default_rng(42)
416
+ L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
417
+ x_h = rng.random((TILE_M, TILE_M)) # Multiple right-hand sides
418
+ z_h = np.zeros_like(x_h)
419
+ c_h = np.zeros_like(x_h)
420
+
421
+ # Compute reference solution using numpy
422
+ z_np = np.linalg.solve(L_h.T, x_h.T)
423
+ c_np = z_np @ z_np
424
+
425
+ # Create Warp arrays
426
+ L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
427
+ x_wp = wp.array2d(x_h, requires_grad=True, dtype=wp.float64, device=device)
428
+ z_wp = wp.array2d(z_h, requires_grad=True, dtype=wp.float64, device=device)
429
+ c_wp = wp.array2d(c_h, requires_grad=True, dtype=wp.float64, device=device)
430
+
431
+ # Run kernel
432
+ wp.launch_tiled(
433
+ tile_math_back_substitution_multiple_rhs,
434
+ dim=[1, 1],
435
+ inputs=[L_wp, x_wp, z_wp, c_wp],
436
+ block_dim=TILE_DIM,
437
+ device=device,
438
+ )
439
+ wp.synchronize_device()
440
+
441
+ # Verify results
442
+ assert np.allclose(z_wp.numpy(), z_np)
443
+ assert np.allclose(c_wp.numpy(), c_np)
444
+
445
+ # TODO: implement and test backward pass
446
+
447
+
448
+ # tests a complex composition of most libmathdx calls
449
+ @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
450
+ def test_tile_math_block_cholesky(test, device):
451
+ BLOCK_SIZE = wp.constant(TILE_M // 2)
452
+
453
+ @wp.kernel(module="unique")
454
+ def block_cholesky_kernel(
455
+ A: wp.array2d(dtype=float),
456
+ L: wp.array2d(dtype=float),
457
+ ):
458
+ """
459
+ Computes the Cholesky factorization of a symmetric positive definite matrix A in blocks.
460
+ It returns a lower-triangular matrix L such that A = L L^T.
461
+ """
462
+
463
+ # Process the matrix in blocks along its leading dimension.
464
+ for k in range(0, TILE_M, BLOCK_SIZE):
465
+ end = k + BLOCK_SIZE
466
+
467
+ # Load current diagonal block A[k:end, k:end]
468
+ # and update with contributions from previously computed blocks.
469
+ A_kk_tile = wp.tile_load(A, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, k), storage="shared")
470
+
471
+ for j in range(0, k, BLOCK_SIZE):
472
+ L_block = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, j))
473
+ L_block_T = wp.tile_transpose(L_block)
474
+ L_L_T_block = wp.tile_matmul(L_block, L_block_T)
475
+ A_kk_tile -= L_L_T_block
476
+
477
+ # Compute the Cholesky factorization for the block
478
+ # print(A_kk_tile)
479
+ L_kk_tile = wp.tile_cholesky(A_kk_tile)
480
+ wp.tile_store(L, L_kk_tile, offset=(k, k))
481
+
482
+ # Process the blocks below the current block
483
+ for i in range(end, TILE_M, BLOCK_SIZE):
484
+ A_ik_tile = wp.tile_load(A, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, k), storage="shared")
485
+
486
+ for j in range(0, k, BLOCK_SIZE):
487
+ L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, j))
488
+ L_2_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, j))
489
+ L_T_tile = wp.tile_transpose(L_2_tile)
490
+ L_L_T_tile = wp.tile_matmul(L_tile, L_T_tile)
491
+ A_ik_tile -= L_L_T_tile
492
+
493
+ A_ik_T_tile = wp.tile_transpose(A_ik_tile)
494
+ sol_T_tile = wp.tile_lower_solve(L_kk_tile, A_ik_T_tile)
495
+ sol_tile = wp.tile_transpose(sol_T_tile)
496
+
497
+ wp.tile_store(L, sol_tile, offset=(i, k))
498
+
499
+ @wp.kernel(module="unique")
500
+ def block_cholesky_solve_kernel(
501
+ L: wp.array2d(dtype=float),
502
+ b: wp.array2d(dtype=float),
503
+ scratch: wp.array2d(dtype=float),
504
+ x: wp.array2d(dtype=float),
505
+ ):
506
+ """
507
+ Solves A x = b given the Cholesky factor L (A = L L^T) using
508
+ blocked forward and backward substitution.
509
+ """
510
+
511
+ # Forward substitution: solve L y = b
512
+ for i in range(0, TILE_M, BLOCK_SIZE):
513
+ i_end = i + BLOCK_SIZE
514
+ rhs_tile = wp.tile_load(b, shape=(BLOCK_SIZE, 1), offset=(i, 0))
515
+ for j in range(0, i, BLOCK_SIZE):
516
+ L_block = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, j))
517
+ y_block = wp.tile_load(scratch, shape=(BLOCK_SIZE, 1), offset=(j, 0))
518
+ Ly_block = wp.tile_matmul(L_block, y_block)
519
+ rhs_tile -= Ly_block
520
+ L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, i))
521
+ y_tile = wp.tile_lower_solve(L_tile, rhs_tile)
522
+ wp.tile_store(scratch, y_tile, offset=(i, 0))
523
+
524
+ # Backward substitution: solve L^T x = y
525
+ for i in range(TILE_M - BLOCK_SIZE, -1, -BLOCK_SIZE):
526
+ i_start = i
527
+ i_end = i_start + BLOCK_SIZE
528
+ rhs_tile = wp.tile_load(scratch, shape=(BLOCK_SIZE, 1), offset=(i_start, 0))
529
+ for j in range(i_end, TILE_M, BLOCK_SIZE):
530
+ L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(j, i_start))
531
+ L_T_tile = wp.tile_transpose(L_tile)
532
+ x_tile = wp.tile_load(x, shape=(BLOCK_SIZE, 1), offset=(j, 0))
533
+ L_T_x_tile = wp.tile_matmul(L_T_tile, x_tile)
534
+ rhs_tile -= L_T_x_tile
535
+ L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i_start, i_start))
536
+ x_tile = wp.tile_upper_solve(wp.tile_transpose(L_tile), rhs_tile)
537
+ wp.tile_store(x, x_tile, offset=(i_start, 0))
538
+
539
+ # check block cholesky decomposition
540
+
541
+ rng = np.random.default_rng(42)
542
+
543
+ M = np.array(rng.random((TILE_M, TILE_M)), dtype=float)
544
+
545
+ A_np = M.T @ M + np.eye(TILE_M, TILE_M)
546
+ L_np = np.linalg.cholesky(A_np)
547
+
548
+ A_wp = wp.array2d(A_np, dtype=float, device=device)
549
+ L_wp = wp.zeros_like(A_wp)
550
+
551
+ wp.launch_tiled(block_cholesky_kernel, dim=1, inputs=[A_wp], outputs=[L_wp], block_dim=TILE_DIM, device=device)
552
+
553
+ # check block cholesky solve
554
+
555
+ assert_np_equal(L_wp.numpy(), L_np, tol=1e-6)
556
+
557
+ b_np = np.array(rng.random((TILE_M, 1)), dtype=float)
558
+ b_wp = wp.array(b_np, dtype=float, device=device)
559
+
560
+ scratch = wp.zeros_like(b_wp)
561
+
562
+ x_np = np.linalg.solve(L_np.T, np.linalg.solve(L_np, b_np))
563
+ x_wp = wp.zeros_like(b_wp)
564
+
565
+ wp.launch_tiled(
566
+ block_cholesky_solve_kernel,
567
+ dim=1,
568
+ inputs=[L_wp, b_wp, scratch],
569
+ outputs=[x_wp],
570
+ block_dim=TILE_DIM,
571
+ device=device,
572
+ )
573
+
574
+ assert_np_equal(x_wp.numpy(), x_np, tol=1e-6)
575
+
576
+
577
+ @wp.kernel
578
+ def test_tile_lower_solve(L: wp.array2d(dtype=float), y: wp.array(dtype=float), x: wp.array(dtype=float)):
579
+ L_tile = wp.tile_load(L, shape=(TILE_M, TILE_M))
580
+ y_tile = wp.tile_load(x, shape=(TILE_M,))
581
+ sol = wp.tile_lower_solve(L_tile, y_tile)
582
+ wp.tile_store(x, sol)
583
+
584
+
585
+ @wp.kernel
586
+ def test_tile_upper_solve(L: wp.array2d(dtype=float), y: wp.array(dtype=float), x: wp.array(dtype=float)):
587
+ L_tile = wp.tile_load(L, shape=(TILE_M, TILE_M))
588
+ y_tile = wp.tile_load(x, shape=(TILE_M,))
589
+ sol = wp.tile_upper_solve(L_tile, y_tile)
590
+ wp.tile_store(x, sol)
591
+
592
+
593
+ @unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
594
+ def test_tile_math_singular_matrices(test, device):
595
+ rng = np.random.default_rng(42)
596
+ L_np = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
597
+ L_np[-1, -1] = 0.0 # Make it singular
598
+ y_np = rng.random(TILE_M)
599
+
600
+ L_wp = wp.array2d(L_np, dtype=float, device=device)
601
+ y_wp = wp.array(y_np, dtype=float, device=device)
602
+ x_wp = wp.zeros_like(y_wp)
603
+
604
+ wp.launch_tiled(
605
+ test_tile_lower_solve, dim=1, inputs=[L_wp, y_wp], outputs=[x_wp], block_dim=TILE_DIM, device=device
606
+ )
607
+
608
+ assert np.isnan(x_wp.numpy()).any()
609
+
610
+ L_np = np.triu(rng.random((TILE_M, TILE_M))) # Upper triangular matrix
611
+ L_np[-1, -1] = 0.0 # Make it singular
612
+
613
+ L_wp = wp.array2d(L_np, dtype=float, device=device)
614
+ y_wp = wp.array(y_np, dtype=float, device=device)
615
+ x_wp = wp.zeros_like(y_wp)
616
+
617
+ wp.launch_tiled(
618
+ test_tile_upper_solve, dim=1, inputs=[L_wp, y_wp], outputs=[x_wp], block_dim=TILE_DIM, device=device
619
+ )
620
+
621
+ assert np.isnan(x_wp.numpy()).any()
622
+
623
+
176
624
  all_devices = get_test_devices()
177
625
  cuda_devices = get_cuda_test_devices()
178
626
 
@@ -188,6 +636,13 @@ add_function_test(
188
636
  add_function_test(
189
637
  TestTileMathDx, "test_tile_math_cholesky", test_tile_math_cholesky, devices=all_devices, check_output=False
190
638
  )
639
+ add_function_test(
640
+ TestTileMathDx,
641
+ "tile_math_cholesky_multiple_rhs",
642
+ tile_math_cholesky_multiple_rhs,
643
+ devices=all_devices,
644
+ check_output=False,
645
+ )
191
646
  add_function_test(
192
647
  TestTileMathDx,
193
648
  "test_tile_math_fft_vec2f",
@@ -203,6 +658,55 @@ add_function_test(
203
658
  check_output=False,
204
659
  )
205
660
 
661
+ add_function_test(
662
+ TestTileMathDx,
663
+ "test_tile_math_forward_substitution",
664
+ test_tile_math_forward_substitution,
665
+ devices=cuda_devices,
666
+ check_output=False,
667
+ )
668
+
669
+ add_function_test(
670
+ TestTileMathDx,
671
+ "test_tile_math_back_substitution",
672
+ test_tile_math_back_substitution,
673
+ devices=cuda_devices,
674
+ check_output=False,
675
+ )
676
+
677
+ add_function_test(
678
+ TestTileMathDx,
679
+ "test_tile_math_forward_substitution_multiple_rhs",
680
+ test_tile_math_forward_substitution_multiple_rhs,
681
+ devices=cuda_devices,
682
+ check_output=False,
683
+ )
684
+
685
+ add_function_test(
686
+ TestTileMathDx,
687
+ "test_tile_math_back_substitution_multiple_rhs",
688
+ test_tile_math_back_substitution_multiple_rhs,
689
+ devices=cuda_devices,
690
+ check_output=False,
691
+ )
692
+
693
+ add_function_test(
694
+ TestTileMathDx,
695
+ "test_tile_math_block_cholesky",
696
+ test_tile_math_block_cholesky,
697
+ devices=cuda_devices,
698
+ check_output=False,
699
+ )
700
+
701
+ add_function_test(
702
+ TestTileMathDx,
703
+ "test_tile_math_singular_matrices",
704
+ test_tile_math_singular_matrices,
705
+ devices=cuda_devices,
706
+ check_output=False,
707
+ )
708
+
709
+
206
710
  if __name__ == "__main__":
207
711
  wp.clear_kernel_cache()
208
712
  unittest.main(verbosity=2, failfast=True)