warp-lang 1.7.2rc1__py3-none-manylinux_2_34_aarch64.whl → 1.8.1__py3-none-manylinux_2_34_aarch64.whl
This diff shows the contents of publicly available package versions as released to one of the supported registries. It is provided for informational purposes only and reflects the changes between those package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +3 -1
- warp/__init__.pyi +3489 -1
- warp/autograd.py +45 -122
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +241 -252
- warp/build_dll.py +130 -26
- warp/builtins.py +1907 -384
- warp/codegen.py +272 -104
- warp/config.py +12 -1
- warp/constants.py +1 -1
- warp/context.py +770 -238
- warp/dlpack.py +1 -1
- warp/examples/benchmarks/benchmark_cloth.py +2 -2
- warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
- warp/examples/core/example_sample_mesh.py +1 -1
- warp/examples/core/example_spin_lock.py +93 -0
- warp/examples/core/example_work_queue.py +118 -0
- warp/examples/fem/example_adaptive_grid.py +5 -5
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +1 -1
- warp/examples/fem/example_convection_diffusion.py +9 -6
- warp/examples/fem/example_darcy_ls_optimization.py +489 -0
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion.py +2 -2
- warp/examples/fem/example_diffusion_3d.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_elastic_shape_optimization.py +387 -0
- warp/examples/fem/example_magnetostatics.py +5 -3
- warp/examples/fem/example_mixed_elasticity.py +5 -3
- warp/examples/fem/example_navier_stokes.py +11 -9
- warp/examples/fem/example_nonconforming_contact.py +5 -3
- warp/examples/fem/example_streamlines.py +8 -3
- warp/examples/fem/utils.py +9 -8
- warp/examples/interop/example_jax_callable.py +34 -4
- warp/examples/interop/example_jax_ffi_callback.py +2 -2
- warp/examples/interop/example_jax_kernel.py +27 -1
- warp/examples/optim/example_drone.py +1 -1
- warp/examples/sim/example_cloth.py +1 -1
- warp/examples/sim/example_cloth_self_contact.py +48 -54
- warp/examples/tile/example_tile_block_cholesky.py +502 -0
- warp/examples/tile/example_tile_cholesky.py +2 -1
- warp/examples/tile/example_tile_convolution.py +1 -1
- warp/examples/tile/example_tile_filtering.py +1 -1
- warp/examples/tile/example_tile_matmul.py +1 -1
- warp/examples/tile/example_tile_mlp.py +2 -0
- warp/fabric.py +7 -7
- warp/fem/__init__.py +5 -0
- warp/fem/adaptivity.py +1 -1
- warp/fem/cache.py +152 -63
- warp/fem/dirichlet.py +2 -2
- warp/fem/domain.py +136 -6
- warp/fem/field/field.py +141 -99
- warp/fem/field/nodal_field.py +85 -39
- warp/fem/field/virtual.py +99 -52
- warp/fem/geometry/adaptive_nanogrid.py +91 -86
- warp/fem/geometry/closest_point.py +13 -0
- warp/fem/geometry/deformed_geometry.py +102 -40
- warp/fem/geometry/element.py +56 -2
- warp/fem/geometry/geometry.py +323 -22
- warp/fem/geometry/grid_2d.py +157 -62
- warp/fem/geometry/grid_3d.py +116 -20
- warp/fem/geometry/hexmesh.py +86 -20
- warp/fem/geometry/nanogrid.py +166 -86
- warp/fem/geometry/partition.py +59 -25
- warp/fem/geometry/quadmesh.py +86 -135
- warp/fem/geometry/tetmesh.py +47 -119
- warp/fem/geometry/trimesh.py +77 -270
- warp/fem/integrate.py +181 -95
- warp/fem/linalg.py +25 -58
- warp/fem/operator.py +124 -27
- warp/fem/quadrature/pic_quadrature.py +36 -14
- warp/fem/quadrature/quadrature.py +40 -16
- warp/fem/space/__init__.py +1 -1
- warp/fem/space/basis_function_space.py +66 -46
- warp/fem/space/basis_space.py +17 -4
- warp/fem/space/dof_mapper.py +1 -1
- warp/fem/space/function_space.py +2 -2
- warp/fem/space/grid_2d_function_space.py +4 -1
- warp/fem/space/hexmesh_function_space.py +4 -2
- warp/fem/space/nanogrid_function_space.py +3 -1
- warp/fem/space/partition.py +11 -2
- warp/fem/space/quadmesh_function_space.py +4 -1
- warp/fem/space/restriction.py +5 -2
- warp/fem/space/shape/__init__.py +10 -8
- warp/fem/space/tetmesh_function_space.py +4 -1
- warp/fem/space/topology.py +52 -21
- warp/fem/space/trimesh_function_space.py +4 -1
- warp/fem/utils.py +53 -8
- warp/jax.py +1 -2
- warp/jax_experimental/ffi.py +210 -67
- warp/jax_experimental/xla_ffi.py +37 -24
- warp/math.py +171 -1
- warp/native/array.h +103 -4
- warp/native/builtin.h +182 -35
- warp/native/coloring.cpp +6 -2
- warp/native/cuda_util.cpp +1 -1
- warp/native/exports.h +118 -63
- warp/native/intersect.h +5 -5
- warp/native/mat.h +8 -13
- warp/native/mathdx.cpp +11 -5
- warp/native/matnn.h +1 -123
- warp/native/mesh.h +1 -1
- warp/native/quat.h +34 -6
- warp/native/rand.h +7 -7
- warp/native/sparse.cpp +121 -258
- warp/native/sparse.cu +181 -274
- warp/native/spatial.h +305 -17
- warp/native/svd.h +23 -8
- warp/native/tile.h +603 -73
- warp/native/tile_radix_sort.h +1112 -0
- warp/native/tile_reduce.h +239 -13
- warp/native/tile_scan.h +240 -0
- warp/native/tuple.h +189 -0
- warp/native/vec.h +10 -20
- warp/native/warp.cpp +36 -4
- warp/native/warp.cu +588 -52
- warp/native/warp.h +47 -74
- warp/optim/linear.py +5 -1
- warp/paddle.py +7 -8
- warp/py.typed +0 -0
- warp/render/render_opengl.py +110 -80
- warp/render/render_usd.py +124 -62
- warp/sim/__init__.py +9 -0
- warp/sim/collide.py +253 -80
- warp/sim/graph_coloring.py +8 -1
- warp/sim/import_mjcf.py +4 -3
- warp/sim/import_usd.py +11 -7
- warp/sim/integrator.py +5 -2
- warp/sim/integrator_euler.py +1 -1
- warp/sim/integrator_featherstone.py +1 -1
- warp/sim/integrator_vbd.py +761 -322
- warp/sim/integrator_xpbd.py +1 -1
- warp/sim/model.py +265 -260
- warp/sim/utils.py +10 -7
- warp/sparse.py +303 -166
- warp/tape.py +54 -51
- warp/tests/cuda/test_conditional_captures.py +1046 -0
- warp/tests/cuda/test_streams.py +1 -1
- warp/tests/geometry/test_volume.py +2 -2
- warp/tests/interop/test_dlpack.py +9 -9
- warp/tests/interop/test_jax.py +0 -1
- warp/tests/run_coverage_serial.py +1 -1
- warp/tests/sim/disabled_kinematics.py +2 -2
- warp/tests/sim/{test_vbd.py → test_cloth.py} +378 -112
- warp/tests/sim/test_collision.py +159 -51
- warp/tests/sim/test_coloring.py +91 -2
- warp/tests/test_array.py +254 -2
- warp/tests/test_array_reduce.py +2 -2
- warp/tests/test_assert.py +53 -0
- warp/tests/test_atomic_cas.py +312 -0
- warp/tests/test_codegen.py +142 -19
- warp/tests/test_conditional.py +47 -1
- warp/tests/test_ctypes.py +0 -20
- warp/tests/test_devices.py +8 -0
- warp/tests/test_fabricarray.py +4 -2
- warp/tests/test_fem.py +58 -25
- warp/tests/test_func.py +42 -1
- warp/tests/test_grad.py +1 -1
- warp/tests/test_lerp.py +1 -3
- warp/tests/test_map.py +481 -0
- warp/tests/test_mat.py +23 -24
- warp/tests/test_quat.py +28 -15
- warp/tests/test_rounding.py +10 -38
- warp/tests/test_runlength_encode.py +7 -7
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +83 -2
- warp/tests/test_spatial.py +507 -1
- warp/tests/test_static.py +48 -0
- warp/tests/test_struct.py +2 -2
- warp/tests/test_tape.py +38 -0
- warp/tests/test_tuple.py +265 -0
- warp/tests/test_types.py +2 -2
- warp/tests/test_utils.py +24 -18
- warp/tests/test_vec.py +38 -408
- warp/tests/test_vec_constructors.py +325 -0
- warp/tests/tile/test_tile.py +438 -131
- warp/tests/tile/test_tile_mathdx.py +518 -14
- warp/tests/tile/test_tile_matmul.py +179 -0
- warp/tests/tile/test_tile_reduce.py +307 -5
- warp/tests/tile/test_tile_shared_memory.py +136 -7
- warp/tests/tile/test_tile_sort.py +121 -0
- warp/tests/unittest_suites.py +14 -6
- warp/types.py +462 -308
- warp/utils.py +647 -86
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/METADATA +20 -6
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/RECORD +190 -176
- warp/stubs.py +0 -3381
- warp/tests/sim/test_xpbd.py +0 -399
- warp/tests/test_mlp.py +0 -282
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/WHEEL +0 -0
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/top_level.txt +0 -0
|
@@ -45,6 +45,7 @@ def tile_math_matmul_kernel(
|
|
|
45
45
|
wp.tile_store(gc, c, offset=(i * TILE_M, j * TILE_N))
|
|
46
46
|
|
|
47
47
|
|
|
48
|
+
@unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
|
|
48
49
|
def test_tile_math_matmul(test, device):
|
|
49
50
|
rng = np.random.default_rng(42)
|
|
50
51
|
|
|
@@ -128,51 +129,498 @@ def tile_math_cholesky(
|
|
|
128
129
|
gA: wp.array2d(dtype=wp.float64),
|
|
129
130
|
gD: wp.array1d(dtype=wp.float64),
|
|
130
131
|
gL: wp.array2d(dtype=wp.float64),
|
|
131
|
-
gx: wp.array1d(dtype=wp.float64),
|
|
132
132
|
gy: wp.array1d(dtype=wp.float64),
|
|
133
|
+
gx: wp.array1d(dtype=wp.float64),
|
|
133
134
|
):
|
|
134
135
|
i, j = wp.tid()
|
|
135
|
-
# Load A, D &
|
|
136
|
+
# Load A, D & y
|
|
136
137
|
a = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
|
|
137
138
|
d = wp.tile_load(gD, shape=TILE_M, storage="shared")
|
|
138
|
-
|
|
139
|
-
#
|
|
140
|
-
|
|
139
|
+
y = wp.tile_load(gy, shape=TILE_M, storage="shared")
|
|
140
|
+
# Ensure tile_diag_add() and tile_cholesky_solve() work with transposed matrices
|
|
141
|
+
a_t = wp.tile_transpose(a)
|
|
142
|
+
# Compute L st LL^T = A^T + diag(D)
|
|
143
|
+
b = wp.tile_diag_add(a_t, d)
|
|
141
144
|
l = wp.tile_cholesky(b)
|
|
142
|
-
# Solve for y in LL^T
|
|
143
|
-
|
|
145
|
+
# Solve for y in LL^T x = y
|
|
146
|
+
x = wp.tile_cholesky_solve(l, y)
|
|
144
147
|
# Store L & y
|
|
145
148
|
wp.tile_store(gL, l)
|
|
146
|
-
wp.tile_store(
|
|
149
|
+
wp.tile_store(gx, x)
|
|
147
150
|
|
|
148
151
|
|
|
152
|
+
@unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
|
|
149
153
|
def test_tile_math_cholesky(test, device):
|
|
150
154
|
A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
|
|
151
155
|
D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
|
|
152
156
|
L_h = np.zeros_like(A_h)
|
|
153
|
-
|
|
154
|
-
|
|
157
|
+
Y_h = np.arange(TILE_M, dtype=np.float64)
|
|
158
|
+
X_h = np.zeros_like(Y_h)
|
|
155
159
|
|
|
156
|
-
A_np = A_h + np.diag(D_h)
|
|
160
|
+
A_np = A_h.T + np.diag(D_h)
|
|
157
161
|
L_np = np.linalg.cholesky(A_np)
|
|
158
|
-
|
|
162
|
+
X_np = np.linalg.solve(A_np, Y_h)
|
|
159
163
|
|
|
160
164
|
A_wp = wp.array2d(A_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
161
165
|
D_wp = wp.array2d(D_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
162
166
|
L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
167
|
+
Y_wp = wp.array2d(Y_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
163
168
|
X_wp = wp.array2d(X_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
169
|
+
|
|
170
|
+
wp.launch_tiled(
|
|
171
|
+
tile_math_cholesky, dim=[1, 1], inputs=[A_wp, D_wp, L_wp, Y_wp, X_wp], block_dim=TILE_DIM, device=device
|
|
172
|
+
)
|
|
173
|
+
wp.synchronize_device(device)
|
|
174
|
+
|
|
175
|
+
np.testing.assert_allclose(X_wp.numpy(), X_np)
|
|
176
|
+
np.testing.assert_allclose(L_wp.numpy(), L_np)
|
|
177
|
+
|
|
178
|
+
# TODO: implement and test backward pass
|
|
179
|
+
|
|
180
|
+
|
|
181
|
+
@wp.kernel()
|
|
182
|
+
def tile_math_cholesky_multiple_rhs(
|
|
183
|
+
gA: wp.array2d(dtype=wp.float64),
|
|
184
|
+
gD: wp.array1d(dtype=wp.float64),
|
|
185
|
+
gL: wp.array2d(dtype=wp.float64),
|
|
186
|
+
gy: wp.array2d(dtype=wp.float64),
|
|
187
|
+
gx: wp.array2d(dtype=wp.float64),
|
|
188
|
+
gz: wp.array2d(dtype=wp.float64),
|
|
189
|
+
):
|
|
190
|
+
i, j = wp.tid()
|
|
191
|
+
# Load A, D & y
|
|
192
|
+
a = wp.tile_load(gA, shape=(TILE_M, TILE_M), storage="shared")
|
|
193
|
+
d = wp.tile_load(gD, shape=TILE_M, storage="shared")
|
|
194
|
+
y = wp.tile_load(gy, shape=(TILE_M, TILE_M), storage="shared")
|
|
195
|
+
# Ensure tile_diag_add() and tile_cholesky_solve() work with transposed matrices
|
|
196
|
+
a_t = wp.tile_transpose(a)
|
|
197
|
+
# Compute L st LL^T = A.T + diag(D)
|
|
198
|
+
b = wp.tile_diag_add(a_t, d)
|
|
199
|
+
l = wp.tile_cholesky(b)
|
|
200
|
+
# Solve for y in LL^T x = y.T
|
|
201
|
+
y_t = wp.tile_transpose(y)
|
|
202
|
+
x = wp.tile_cholesky_solve(l, y_t)
|
|
203
|
+
# Ensure matmul receives correct layout information
|
|
204
|
+
z = wp.tile_matmul(x, x)
|
|
205
|
+
# Store L & y
|
|
206
|
+
wp.tile_store(gL, l)
|
|
207
|
+
wp.tile_store(gx, x)
|
|
208
|
+
wp.tile_store(gz, z)
|
|
209
|
+
|
|
210
|
+
|
|
211
|
+
@unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
|
|
212
|
+
def test_tile_math_cholesky_multiple_rhs(test, device):
|
|
213
|
+
A_h = np.ones((TILE_M, TILE_M), dtype=np.float64)
|
|
214
|
+
D_h = 8.0 * np.ones(TILE_M, dtype=np.float64)
|
|
215
|
+
L_h = np.zeros_like(A_h)
|
|
216
|
+
Y_h = np.arange((TILE_M, TILE_M), dtype=np.float64)
|
|
217
|
+
X_h = np.zeros_like(Y_h)
|
|
218
|
+
Z_h = np.zeros_like(Y_h)
|
|
219
|
+
|
|
220
|
+
A_np = A_h.T + np.diag(D_h)
|
|
221
|
+
L_np = np.linalg.cholesky(A_np)
|
|
222
|
+
X_np = np.linalg.solve(A_np, Y_h.T)
|
|
223
|
+
Z_np = X_np @ X_np
|
|
224
|
+
|
|
225
|
+
A_wp = wp.array2d(A_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
226
|
+
D_wp = wp.array2d(D_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
227
|
+
L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
164
228
|
Y_wp = wp.array2d(Y_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
229
|
+
X_wp = wp.array2d(X_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
230
|
+
Z_wp = wp.array2d(Z_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
165
231
|
|
|
166
232
|
wp.launch_tiled(
|
|
167
|
-
|
|
233
|
+
tile_math_cholesky_multiple_rhs,
|
|
234
|
+
dim=[1, 1],
|
|
235
|
+
inputs=[A_wp, D_wp, L_wp, Y_wp, X_wp, Z_wp],
|
|
236
|
+
block_dim=TILE_DIM,
|
|
237
|
+
device=device,
|
|
238
|
+
)
|
|
239
|
+
wp.synchronize_device(device)
|
|
240
|
+
|
|
241
|
+
np.testing.assert_allclose(L_wp.numpy(), L_np)
|
|
242
|
+
np.testing.assert_allclose(X_wp.numpy(), X_np)
|
|
243
|
+
np.testing.assert_allclose(Z_wp.numpy(), Z_np)
|
|
244
|
+
|
|
245
|
+
# TODO: implement and test backward pass
|
|
246
|
+
|
|
247
|
+
|
|
248
|
+
@wp.kernel
|
|
249
|
+
def tile_math_forward_substitution(
|
|
250
|
+
gL: wp.array2d(dtype=wp.float64), gx: wp.array1d(dtype=wp.float64), gz: wp.array1d(dtype=wp.float64)
|
|
251
|
+
):
|
|
252
|
+
i, j = wp.tid()
|
|
253
|
+
# Load L & x
|
|
254
|
+
L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
|
|
255
|
+
x = wp.tile_load(gx, shape=TILE_M, storage="shared")
|
|
256
|
+
# Solve for z in Lz = x
|
|
257
|
+
# Transpose because we loaded an upper triangular matrix
|
|
258
|
+
z = wp.tile_lower_solve(wp.tile_transpose(L), x)
|
|
259
|
+
# Store z
|
|
260
|
+
wp.tile_store(gz, z)
|
|
261
|
+
|
|
262
|
+
|
|
263
|
+
@unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
|
|
264
|
+
def test_tile_math_forward_substitution(test, device):
|
|
265
|
+
# Create test data
|
|
266
|
+
rng = np.random.default_rng(42)
|
|
267
|
+
L_h = np.triu(rng.random((TILE_M, TILE_M))) # Upper triangular matrix
|
|
268
|
+
x_h = rng.random(TILE_M)
|
|
269
|
+
z_h = np.zeros_like(x_h)
|
|
270
|
+
|
|
271
|
+
# Compute reference solution using numpy
|
|
272
|
+
z_np = np.linalg.solve(L_h.T, x_h)
|
|
273
|
+
|
|
274
|
+
# Create Warp arrays
|
|
275
|
+
L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
276
|
+
x_wp = wp.array1d(x_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
277
|
+
z_wp = wp.array1d(z_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
278
|
+
|
|
279
|
+
# Run kernel
|
|
280
|
+
wp.launch_tiled(
|
|
281
|
+
tile_math_forward_substitution, dim=[1, 1], inputs=[L_wp, x_wp, z_wp], block_dim=TILE_DIM, device=device
|
|
282
|
+
)
|
|
283
|
+
wp.synchronize_device(device)
|
|
284
|
+
|
|
285
|
+
# Verify results
|
|
286
|
+
np.testing.assert_allclose(z_wp.numpy(), z_np)
|
|
287
|
+
|
|
288
|
+
# TODO: implement and test backward pass
|
|
289
|
+
|
|
290
|
+
|
|
291
|
+
@wp.kernel
|
|
292
|
+
def tile_math_back_substitution(
|
|
293
|
+
gL: wp.array2d(dtype=wp.float64), gx: wp.array1d(dtype=wp.float64), gz: wp.array1d(dtype=wp.float64)
|
|
294
|
+
):
|
|
295
|
+
i, j = wp.tid()
|
|
296
|
+
# Load L & x
|
|
297
|
+
L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
|
|
298
|
+
x = wp.tile_load(gx, shape=TILE_M, storage="shared")
|
|
299
|
+
# Solve for z in L^T z = x
|
|
300
|
+
# Transpose because we loaded a lower triangular matrix
|
|
301
|
+
z = wp.tile_upper_solve(wp.tile_transpose(L), x)
|
|
302
|
+
# Store z
|
|
303
|
+
wp.tile_store(gz, z)
|
|
304
|
+
|
|
305
|
+
|
|
306
|
+
@unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
|
|
307
|
+
def test_tile_math_back_substitution(test, device):
|
|
308
|
+
# Create test data
|
|
309
|
+
rng = np.random.default_rng(42)
|
|
310
|
+
L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
|
|
311
|
+
x_h = rng.random(TILE_M)
|
|
312
|
+
z_h = np.zeros_like(x_h)
|
|
313
|
+
|
|
314
|
+
# Compute reference solution using numpy
|
|
315
|
+
z_np = np.linalg.solve(L_h.T, x_h)
|
|
316
|
+
|
|
317
|
+
# Create Warp arrays
|
|
318
|
+
L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
319
|
+
x_wp = wp.array1d(x_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
320
|
+
z_wp = wp.array1d(z_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
321
|
+
|
|
322
|
+
# Run kernel
|
|
323
|
+
wp.launch_tiled(
|
|
324
|
+
tile_math_back_substitution, dim=[1, 1], inputs=[L_wp, x_wp, z_wp], block_dim=TILE_DIM, device=device
|
|
325
|
+
)
|
|
326
|
+
wp.synchronize_device(device)
|
|
327
|
+
|
|
328
|
+
# Verify results
|
|
329
|
+
np.testing.assert_allclose(z_wp.numpy(), z_np)
|
|
330
|
+
|
|
331
|
+
# TODO: implement and test backward pass
|
|
332
|
+
|
|
333
|
+
|
|
334
|
+
@wp.kernel
|
|
335
|
+
def tile_math_forward_substitution_multiple_rhs(
|
|
336
|
+
gL: wp.array2d(dtype=wp.float64),
|
|
337
|
+
gx: wp.array2d(dtype=wp.float64),
|
|
338
|
+
gz: wp.array2d(dtype=wp.float64),
|
|
339
|
+
gc: wp.array2d(dtype=wp.float64),
|
|
340
|
+
):
|
|
341
|
+
i, j = wp.tid()
|
|
342
|
+
# Load L & x
|
|
343
|
+
L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
|
|
344
|
+
x = wp.tile_load(gx, shape=(TILE_M, TILE_M), storage="shared")
|
|
345
|
+
# Solve for z in Lz = x.T
|
|
346
|
+
x_t = wp.tile_transpose(x)
|
|
347
|
+
z = wp.tile_lower_solve(L, x_t)
|
|
348
|
+
# Ensure matmul receives correct layout information
|
|
349
|
+
c = wp.tile_matmul(z, z)
|
|
350
|
+
# Store z and c
|
|
351
|
+
wp.tile_store(gz, z)
|
|
352
|
+
wp.tile_store(gc, c)
|
|
353
|
+
|
|
354
|
+
|
|
355
|
+
@unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
|
|
356
|
+
def test_tile_math_forward_substitution_multiple_rhs(test, device):
|
|
357
|
+
# Create test data
|
|
358
|
+
rng = np.random.default_rng(42)
|
|
359
|
+
L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
|
|
360
|
+
x_h = rng.random((TILE_M, TILE_M)) # Multiple right-hand sides
|
|
361
|
+
z_h = np.zeros_like(x_h)
|
|
362
|
+
c_h = np.zeros_like(x_h)
|
|
363
|
+
|
|
364
|
+
# Compute reference solution using numpy
|
|
365
|
+
z_np = np.linalg.solve(L_h, x_h.T)
|
|
366
|
+
c_np = z_np @ z_np
|
|
367
|
+
|
|
368
|
+
# Create Warp arrays
|
|
369
|
+
L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
370
|
+
x_wp = wp.array2d(x_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
371
|
+
z_wp = wp.array2d(z_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
372
|
+
c_wp = wp.array2d(c_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
373
|
+
|
|
374
|
+
# Run kernel
|
|
375
|
+
wp.launch_tiled(
|
|
376
|
+
tile_math_forward_substitution_multiple_rhs,
|
|
377
|
+
dim=[1, 1],
|
|
378
|
+
inputs=[L_wp, x_wp, z_wp, c_wp],
|
|
379
|
+
block_dim=TILE_DIM,
|
|
380
|
+
device=device,
|
|
168
381
|
)
|
|
169
382
|
wp.synchronize_device()
|
|
170
383
|
|
|
171
|
-
|
|
384
|
+
# Verify results
|
|
385
|
+
assert np.allclose(z_wp.numpy(), z_np)
|
|
386
|
+
assert np.allclose(c_wp.numpy(), c_np)
|
|
172
387
|
|
|
173
388
|
# TODO: implement and test backward pass
|
|
174
389
|
|
|
175
390
|
|
|
391
|
+
@wp.kernel
|
|
392
|
+
def tile_math_back_substitution_multiple_rhs(
|
|
393
|
+
gL: wp.array2d(dtype=wp.float64),
|
|
394
|
+
gx: wp.array2d(dtype=wp.float64),
|
|
395
|
+
gz: wp.array2d(dtype=wp.float64),
|
|
396
|
+
gc: wp.array2d(dtype=wp.float64),
|
|
397
|
+
):
|
|
398
|
+
i, j = wp.tid()
|
|
399
|
+
# Load L & x
|
|
400
|
+
L = wp.tile_load(gL, shape=(TILE_M, TILE_M), storage="shared")
|
|
401
|
+
x = wp.tile_load(gx, shape=(TILE_M, TILE_M), storage="shared")
|
|
402
|
+
# Solve for z in L^T z = x.T
|
|
403
|
+
x_t = wp.tile_transpose(x)
|
|
404
|
+
z = wp.tile_upper_solve(wp.tile_transpose(L), x_t)
|
|
405
|
+
# Ensure matmul receives correct layout information
|
|
406
|
+
c = wp.tile_matmul(z, z)
|
|
407
|
+
# Store z and c
|
|
408
|
+
wp.tile_store(gz, z)
|
|
409
|
+
wp.tile_store(gc, c)
|
|
410
|
+
|
|
411
|
+
|
|
412
|
+
@unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
|
|
413
|
+
def test_tile_math_back_substitution_multiple_rhs(test, device):
|
|
414
|
+
# Create test data
|
|
415
|
+
rng = np.random.default_rng(42)
|
|
416
|
+
L_h = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
|
|
417
|
+
x_h = rng.random((TILE_M, TILE_M)) # Multiple right-hand sides
|
|
418
|
+
z_h = np.zeros_like(x_h)
|
|
419
|
+
c_h = np.zeros_like(x_h)
|
|
420
|
+
|
|
421
|
+
# Compute reference solution using numpy
|
|
422
|
+
z_np = np.linalg.solve(L_h.T, x_h.T)
|
|
423
|
+
c_np = z_np @ z_np
|
|
424
|
+
|
|
425
|
+
# Create Warp arrays
|
|
426
|
+
L_wp = wp.array2d(L_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
427
|
+
x_wp = wp.array2d(x_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
428
|
+
z_wp = wp.array2d(z_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
429
|
+
c_wp = wp.array2d(c_h, requires_grad=True, dtype=wp.float64, device=device)
|
|
430
|
+
|
|
431
|
+
# Run kernel
|
|
432
|
+
wp.launch_tiled(
|
|
433
|
+
tile_math_back_substitution_multiple_rhs,
|
|
434
|
+
dim=[1, 1],
|
|
435
|
+
inputs=[L_wp, x_wp, z_wp, c_wp],
|
|
436
|
+
block_dim=TILE_DIM,
|
|
437
|
+
device=device,
|
|
438
|
+
)
|
|
439
|
+
wp.synchronize_device()
|
|
440
|
+
|
|
441
|
+
# Verify results
|
|
442
|
+
assert np.allclose(z_wp.numpy(), z_np)
|
|
443
|
+
assert np.allclose(c_wp.numpy(), c_np)
|
|
444
|
+
|
|
445
|
+
# TODO: implement and test backward pass
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
# tests a complex composition of most libmathdx calls
|
|
449
|
+
@unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
|
|
450
|
+
def test_tile_math_block_cholesky(test, device):
|
|
451
|
+
BLOCK_SIZE = wp.constant(TILE_M // 2)
|
|
452
|
+
|
|
453
|
+
@wp.kernel(module="unique")
|
|
454
|
+
def block_cholesky_kernel(
|
|
455
|
+
A: wp.array2d(dtype=float),
|
|
456
|
+
L: wp.array2d(dtype=float),
|
|
457
|
+
):
|
|
458
|
+
"""
|
|
459
|
+
Computes the Cholesky factorization of a symmetric positive definite matrix A in blocks.
|
|
460
|
+
It returns a lower-triangular matrix L such that A = L L^T.
|
|
461
|
+
"""
|
|
462
|
+
|
|
463
|
+
# Process the matrix in blocks along its leading dimension.
|
|
464
|
+
for k in range(0, TILE_M, BLOCK_SIZE):
|
|
465
|
+
end = k + BLOCK_SIZE
|
|
466
|
+
|
|
467
|
+
# Load current diagonal block A[k:end, k:end]
|
|
468
|
+
# and update with contributions from previously computed blocks.
|
|
469
|
+
A_kk_tile = wp.tile_load(A, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, k), storage="shared")
|
|
470
|
+
|
|
471
|
+
for j in range(0, k, BLOCK_SIZE):
|
|
472
|
+
L_block = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, j))
|
|
473
|
+
L_block_T = wp.tile_transpose(L_block)
|
|
474
|
+
L_L_T_block = wp.tile_matmul(L_block, L_block_T)
|
|
475
|
+
A_kk_tile -= L_L_T_block
|
|
476
|
+
|
|
477
|
+
# Compute the Cholesky factorization for the block
|
|
478
|
+
# print(A_kk_tile)
|
|
479
|
+
L_kk_tile = wp.tile_cholesky(A_kk_tile)
|
|
480
|
+
wp.tile_store(L, L_kk_tile, offset=(k, k))
|
|
481
|
+
|
|
482
|
+
# Process the blocks below the current block
|
|
483
|
+
for i in range(end, TILE_M, BLOCK_SIZE):
|
|
484
|
+
A_ik_tile = wp.tile_load(A, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, k), storage="shared")
|
|
485
|
+
|
|
486
|
+
for j in range(0, k, BLOCK_SIZE):
|
|
487
|
+
L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, j))
|
|
488
|
+
L_2_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(k, j))
|
|
489
|
+
L_T_tile = wp.tile_transpose(L_2_tile)
|
|
490
|
+
L_L_T_tile = wp.tile_matmul(L_tile, L_T_tile)
|
|
491
|
+
A_ik_tile -= L_L_T_tile
|
|
492
|
+
|
|
493
|
+
A_ik_T_tile = wp.tile_transpose(A_ik_tile)
|
|
494
|
+
sol_T_tile = wp.tile_lower_solve(L_kk_tile, A_ik_T_tile)
|
|
495
|
+
sol_tile = wp.tile_transpose(sol_T_tile)
|
|
496
|
+
|
|
497
|
+
wp.tile_store(L, sol_tile, offset=(i, k))
|
|
498
|
+
|
|
499
|
+
@wp.kernel(module="unique")
|
|
500
|
+
def block_cholesky_solve_kernel(
|
|
501
|
+
L: wp.array2d(dtype=float),
|
|
502
|
+
b: wp.array2d(dtype=float),
|
|
503
|
+
scratch: wp.array2d(dtype=float),
|
|
504
|
+
x: wp.array2d(dtype=float),
|
|
505
|
+
):
|
|
506
|
+
"""
|
|
507
|
+
Solves A x = b given the Cholesky factor L (A = L L^T) using
|
|
508
|
+
blocked forward and backward substitution.
|
|
509
|
+
"""
|
|
510
|
+
|
|
511
|
+
# Forward substitution: solve L y = b
|
|
512
|
+
for i in range(0, TILE_M, BLOCK_SIZE):
|
|
513
|
+
i_end = i + BLOCK_SIZE
|
|
514
|
+
rhs_tile = wp.tile_load(b, shape=(BLOCK_SIZE, 1), offset=(i, 0))
|
|
515
|
+
for j in range(0, i, BLOCK_SIZE):
|
|
516
|
+
L_block = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, j))
|
|
517
|
+
y_block = wp.tile_load(scratch, shape=(BLOCK_SIZE, 1), offset=(j, 0))
|
|
518
|
+
Ly_block = wp.tile_matmul(L_block, y_block)
|
|
519
|
+
rhs_tile -= Ly_block
|
|
520
|
+
L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i, i))
|
|
521
|
+
y_tile = wp.tile_lower_solve(L_tile, rhs_tile)
|
|
522
|
+
wp.tile_store(scratch, y_tile, offset=(i, 0))
|
|
523
|
+
|
|
524
|
+
# Backward substitution: solve L^T x = y
|
|
525
|
+
for i in range(TILE_M - BLOCK_SIZE, -1, -BLOCK_SIZE):
|
|
526
|
+
i_start = i
|
|
527
|
+
i_end = i_start + BLOCK_SIZE
|
|
528
|
+
rhs_tile = wp.tile_load(scratch, shape=(BLOCK_SIZE, 1), offset=(i_start, 0))
|
|
529
|
+
for j in range(i_end, TILE_M, BLOCK_SIZE):
|
|
530
|
+
L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(j, i_start))
|
|
531
|
+
L_T_tile = wp.tile_transpose(L_tile)
|
|
532
|
+
x_tile = wp.tile_load(x, shape=(BLOCK_SIZE, 1), offset=(j, 0))
|
|
533
|
+
L_T_x_tile = wp.tile_matmul(L_T_tile, x_tile)
|
|
534
|
+
rhs_tile -= L_T_x_tile
|
|
535
|
+
L_tile = wp.tile_load(L, shape=(BLOCK_SIZE, BLOCK_SIZE), offset=(i_start, i_start))
|
|
536
|
+
x_tile = wp.tile_upper_solve(wp.tile_transpose(L_tile), rhs_tile)
|
|
537
|
+
wp.tile_store(x, x_tile, offset=(i_start, 0))
|
|
538
|
+
|
|
539
|
+
# check block cholesky decomposition
|
|
540
|
+
|
|
541
|
+
rng = np.random.default_rng(42)
|
|
542
|
+
|
|
543
|
+
M = np.array(rng.random((TILE_M, TILE_M)), dtype=float)
|
|
544
|
+
|
|
545
|
+
A_np = M.T @ M + np.eye(TILE_M, TILE_M)
|
|
546
|
+
L_np = np.linalg.cholesky(A_np)
|
|
547
|
+
|
|
548
|
+
A_wp = wp.array2d(A_np, dtype=float, device=device)
|
|
549
|
+
L_wp = wp.zeros_like(A_wp)
|
|
550
|
+
|
|
551
|
+
wp.launch_tiled(block_cholesky_kernel, dim=1, inputs=[A_wp], outputs=[L_wp], block_dim=TILE_DIM, device=device)
|
|
552
|
+
|
|
553
|
+
# check block cholesky solve
|
|
554
|
+
|
|
555
|
+
assert_np_equal(L_wp.numpy(), L_np, tol=1e-6)
|
|
556
|
+
|
|
557
|
+
b_np = np.array(rng.random((TILE_M, 1)), dtype=float)
|
|
558
|
+
b_wp = wp.array(b_np, dtype=float, device=device)
|
|
559
|
+
|
|
560
|
+
scratch = wp.zeros_like(b_wp)
|
|
561
|
+
|
|
562
|
+
x_np = np.linalg.solve(L_np.T, np.linalg.solve(L_np, b_np))
|
|
563
|
+
x_wp = wp.zeros_like(b_wp)
|
|
564
|
+
|
|
565
|
+
wp.launch_tiled(
|
|
566
|
+
block_cholesky_solve_kernel,
|
|
567
|
+
dim=1,
|
|
568
|
+
inputs=[L_wp, b_wp, scratch],
|
|
569
|
+
outputs=[x_wp],
|
|
570
|
+
block_dim=TILE_DIM,
|
|
571
|
+
device=device,
|
|
572
|
+
)
|
|
573
|
+
|
|
574
|
+
assert_np_equal(x_wp.numpy(), x_np, tol=1e-6)
|
|
575
|
+
|
|
576
|
+
|
|
577
|
+
@wp.kernel
|
|
578
|
+
def test_tile_lower_solve(L: wp.array2d(dtype=float), y: wp.array(dtype=float), x: wp.array(dtype=float)):
|
|
579
|
+
L_tile = wp.tile_load(L, shape=(TILE_M, TILE_M))
|
|
580
|
+
y_tile = wp.tile_load(x, shape=(TILE_M,))
|
|
581
|
+
sol = wp.tile_lower_solve(L_tile, y_tile)
|
|
582
|
+
wp.tile_store(x, sol)
|
|
583
|
+
|
|
584
|
+
|
|
585
|
+
@wp.kernel
|
|
586
|
+
def test_tile_upper_solve(L: wp.array2d(dtype=float), y: wp.array(dtype=float), x: wp.array(dtype=float)):
|
|
587
|
+
L_tile = wp.tile_load(L, shape=(TILE_M, TILE_M))
|
|
588
|
+
y_tile = wp.tile_load(x, shape=(TILE_M,))
|
|
589
|
+
sol = wp.tile_upper_solve(L_tile, y_tile)
|
|
590
|
+
wp.tile_store(x, sol)
|
|
591
|
+
|
|
592
|
+
|
|
593
|
+
@unittest.skipUnless(wp.context.runtime.core.cuda_toolkit_version() >= 12060, "CUDA toolkit version is less than 12.6")
|
|
594
|
+
def test_tile_math_singular_matrices(test, device):
|
|
595
|
+
rng = np.random.default_rng(42)
|
|
596
|
+
L_np = np.tril(rng.random((TILE_M, TILE_M))) # Lower triangular matrix
|
|
597
|
+
L_np[-1, -1] = 0.0 # Make it singular
|
|
598
|
+
y_np = rng.random(TILE_M)
|
|
599
|
+
|
|
600
|
+
L_wp = wp.array2d(L_np, dtype=float, device=device)
|
|
601
|
+
y_wp = wp.array(y_np, dtype=float, device=device)
|
|
602
|
+
x_wp = wp.zeros_like(y_wp)
|
|
603
|
+
|
|
604
|
+
wp.launch_tiled(
|
|
605
|
+
test_tile_lower_solve, dim=1, inputs=[L_wp, y_wp], outputs=[x_wp], block_dim=TILE_DIM, device=device
|
|
606
|
+
)
|
|
607
|
+
|
|
608
|
+
assert np.isnan(x_wp.numpy()).any()
|
|
609
|
+
|
|
610
|
+
L_np = np.triu(rng.random((TILE_M, TILE_M))) # Upper triangular matrix
|
|
611
|
+
L_np[-1, -1] = 0.0 # Make it singular
|
|
612
|
+
|
|
613
|
+
L_wp = wp.array2d(L_np, dtype=float, device=device)
|
|
614
|
+
y_wp = wp.array(y_np, dtype=float, device=device)
|
|
615
|
+
x_wp = wp.zeros_like(y_wp)
|
|
616
|
+
|
|
617
|
+
wp.launch_tiled(
|
|
618
|
+
test_tile_upper_solve, dim=1, inputs=[L_wp, y_wp], outputs=[x_wp], block_dim=TILE_DIM, device=device
|
|
619
|
+
)
|
|
620
|
+
|
|
621
|
+
assert np.isnan(x_wp.numpy()).any()
|
|
622
|
+
|
|
623
|
+
|
|
176
624
|
all_devices = get_test_devices()
|
|
177
625
|
cuda_devices = get_cuda_test_devices()
|
|
178
626
|
|
|
@@ -188,6 +636,13 @@ add_function_test(
|
|
|
188
636
|
add_function_test(
|
|
189
637
|
TestTileMathDx, "test_tile_math_cholesky", test_tile_math_cholesky, devices=all_devices, check_output=False
|
|
190
638
|
)
|
|
639
|
+
add_function_test(
|
|
640
|
+
TestTileMathDx,
|
|
641
|
+
"tile_math_cholesky_multiple_rhs",
|
|
642
|
+
tile_math_cholesky_multiple_rhs,
|
|
643
|
+
devices=all_devices,
|
|
644
|
+
check_output=False,
|
|
645
|
+
)
|
|
191
646
|
add_function_test(
|
|
192
647
|
TestTileMathDx,
|
|
193
648
|
"test_tile_math_fft_vec2f",
|
|
@@ -203,6 +658,55 @@ add_function_test(
|
|
|
203
658
|
check_output=False,
|
|
204
659
|
)
|
|
205
660
|
|
|
661
|
+
add_function_test(
|
|
662
|
+
TestTileMathDx,
|
|
663
|
+
"test_tile_math_forward_substitution",
|
|
664
|
+
test_tile_math_forward_substitution,
|
|
665
|
+
devices=cuda_devices,
|
|
666
|
+
check_output=False,
|
|
667
|
+
)
|
|
668
|
+
|
|
669
|
+
add_function_test(
|
|
670
|
+
TestTileMathDx,
|
|
671
|
+
"test_tile_math_back_substitution",
|
|
672
|
+
test_tile_math_back_substitution,
|
|
673
|
+
devices=cuda_devices,
|
|
674
|
+
check_output=False,
|
|
675
|
+
)
|
|
676
|
+
|
|
677
|
+
add_function_test(
|
|
678
|
+
TestTileMathDx,
|
|
679
|
+
"test_tile_math_forward_substitution_multiple_rhs",
|
|
680
|
+
test_tile_math_forward_substitution_multiple_rhs,
|
|
681
|
+
devices=cuda_devices,
|
|
682
|
+
check_output=False,
|
|
683
|
+
)
|
|
684
|
+
|
|
685
|
+
add_function_test(
|
|
686
|
+
TestTileMathDx,
|
|
687
|
+
"test_tile_math_back_substitution_multiple_rhs",
|
|
688
|
+
test_tile_math_back_substitution_multiple_rhs,
|
|
689
|
+
devices=cuda_devices,
|
|
690
|
+
check_output=False,
|
|
691
|
+
)
|
|
692
|
+
|
|
693
|
+
add_function_test(
|
|
694
|
+
TestTileMathDx,
|
|
695
|
+
"test_tile_math_block_cholesky",
|
|
696
|
+
test_tile_math_block_cholesky,
|
|
697
|
+
devices=cuda_devices,
|
|
698
|
+
check_output=False,
|
|
699
|
+
)
|
|
700
|
+
|
|
701
|
+
add_function_test(
|
|
702
|
+
TestTileMathDx,
|
|
703
|
+
"test_tile_math_singular_matrices",
|
|
704
|
+
test_tile_math_singular_matrices,
|
|
705
|
+
devices=cuda_devices,
|
|
706
|
+
check_output=False,
|
|
707
|
+
)
|
|
708
|
+
|
|
709
|
+
|
|
206
710
|
if __name__ == "__main__":
|
|
207
711
|
wp.clear_kernel_cache()
|
|
208
712
|
unittest.main(verbosity=2, failfast=True)
|