warp-lang 1.7.2rc1__py3-none-manylinux_2_34_aarch64.whl → 1.8.1__py3-none-manylinux_2_34_aarch64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +3 -1
- warp/__init__.pyi +3489 -1
- warp/autograd.py +45 -122
- warp/bin/warp-clang.so +0 -0
- warp/bin/warp.so +0 -0
- warp/build.py +241 -252
- warp/build_dll.py +130 -26
- warp/builtins.py +1907 -384
- warp/codegen.py +272 -104
- warp/config.py +12 -1
- warp/constants.py +1 -1
- warp/context.py +770 -238
- warp/dlpack.py +1 -1
- warp/examples/benchmarks/benchmark_cloth.py +2 -2
- warp/examples/benchmarks/benchmark_tile_sort.py +155 -0
- warp/examples/core/example_sample_mesh.py +1 -1
- warp/examples/core/example_spin_lock.py +93 -0
- warp/examples/core/example_work_queue.py +118 -0
- warp/examples/fem/example_adaptive_grid.py +5 -5
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +1 -1
- warp/examples/fem/example_convection_diffusion.py +9 -6
- warp/examples/fem/example_darcy_ls_optimization.py +489 -0
- warp/examples/fem/example_deformed_geometry.py +1 -1
- warp/examples/fem/example_diffusion.py +2 -2
- warp/examples/fem/example_diffusion_3d.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_elastic_shape_optimization.py +387 -0
- warp/examples/fem/example_magnetostatics.py +5 -3
- warp/examples/fem/example_mixed_elasticity.py +5 -3
- warp/examples/fem/example_navier_stokes.py +11 -9
- warp/examples/fem/example_nonconforming_contact.py +5 -3
- warp/examples/fem/example_streamlines.py +8 -3
- warp/examples/fem/utils.py +9 -8
- warp/examples/interop/example_jax_callable.py +34 -4
- warp/examples/interop/example_jax_ffi_callback.py +2 -2
- warp/examples/interop/example_jax_kernel.py +27 -1
- warp/examples/optim/example_drone.py +1 -1
- warp/examples/sim/example_cloth.py +1 -1
- warp/examples/sim/example_cloth_self_contact.py +48 -54
- warp/examples/tile/example_tile_block_cholesky.py +502 -0
- warp/examples/tile/example_tile_cholesky.py +2 -1
- warp/examples/tile/example_tile_convolution.py +1 -1
- warp/examples/tile/example_tile_filtering.py +1 -1
- warp/examples/tile/example_tile_matmul.py +1 -1
- warp/examples/tile/example_tile_mlp.py +2 -0
- warp/fabric.py +7 -7
- warp/fem/__init__.py +5 -0
- warp/fem/adaptivity.py +1 -1
- warp/fem/cache.py +152 -63
- warp/fem/dirichlet.py +2 -2
- warp/fem/domain.py +136 -6
- warp/fem/field/field.py +141 -99
- warp/fem/field/nodal_field.py +85 -39
- warp/fem/field/virtual.py +99 -52
- warp/fem/geometry/adaptive_nanogrid.py +91 -86
- warp/fem/geometry/closest_point.py +13 -0
- warp/fem/geometry/deformed_geometry.py +102 -40
- warp/fem/geometry/element.py +56 -2
- warp/fem/geometry/geometry.py +323 -22
- warp/fem/geometry/grid_2d.py +157 -62
- warp/fem/geometry/grid_3d.py +116 -20
- warp/fem/geometry/hexmesh.py +86 -20
- warp/fem/geometry/nanogrid.py +166 -86
- warp/fem/geometry/partition.py +59 -25
- warp/fem/geometry/quadmesh.py +86 -135
- warp/fem/geometry/tetmesh.py +47 -119
- warp/fem/geometry/trimesh.py +77 -270
- warp/fem/integrate.py +181 -95
- warp/fem/linalg.py +25 -58
- warp/fem/operator.py +124 -27
- warp/fem/quadrature/pic_quadrature.py +36 -14
- warp/fem/quadrature/quadrature.py +40 -16
- warp/fem/space/__init__.py +1 -1
- warp/fem/space/basis_function_space.py +66 -46
- warp/fem/space/basis_space.py +17 -4
- warp/fem/space/dof_mapper.py +1 -1
- warp/fem/space/function_space.py +2 -2
- warp/fem/space/grid_2d_function_space.py +4 -1
- warp/fem/space/hexmesh_function_space.py +4 -2
- warp/fem/space/nanogrid_function_space.py +3 -1
- warp/fem/space/partition.py +11 -2
- warp/fem/space/quadmesh_function_space.py +4 -1
- warp/fem/space/restriction.py +5 -2
- warp/fem/space/shape/__init__.py +10 -8
- warp/fem/space/tetmesh_function_space.py +4 -1
- warp/fem/space/topology.py +52 -21
- warp/fem/space/trimesh_function_space.py +4 -1
- warp/fem/utils.py +53 -8
- warp/jax.py +1 -2
- warp/jax_experimental/ffi.py +210 -67
- warp/jax_experimental/xla_ffi.py +37 -24
- warp/math.py +171 -1
- warp/native/array.h +103 -4
- warp/native/builtin.h +182 -35
- warp/native/coloring.cpp +6 -2
- warp/native/cuda_util.cpp +1 -1
- warp/native/exports.h +118 -63
- warp/native/intersect.h +5 -5
- warp/native/mat.h +8 -13
- warp/native/mathdx.cpp +11 -5
- warp/native/matnn.h +1 -123
- warp/native/mesh.h +1 -1
- warp/native/quat.h +34 -6
- warp/native/rand.h +7 -7
- warp/native/sparse.cpp +121 -258
- warp/native/sparse.cu +181 -274
- warp/native/spatial.h +305 -17
- warp/native/svd.h +23 -8
- warp/native/tile.h +603 -73
- warp/native/tile_radix_sort.h +1112 -0
- warp/native/tile_reduce.h +239 -13
- warp/native/tile_scan.h +240 -0
- warp/native/tuple.h +189 -0
- warp/native/vec.h +10 -20
- warp/native/warp.cpp +36 -4
- warp/native/warp.cu +588 -52
- warp/native/warp.h +47 -74
- warp/optim/linear.py +5 -1
- warp/paddle.py +7 -8
- warp/py.typed +0 -0
- warp/render/render_opengl.py +110 -80
- warp/render/render_usd.py +124 -62
- warp/sim/__init__.py +9 -0
- warp/sim/collide.py +253 -80
- warp/sim/graph_coloring.py +8 -1
- warp/sim/import_mjcf.py +4 -3
- warp/sim/import_usd.py +11 -7
- warp/sim/integrator.py +5 -2
- warp/sim/integrator_euler.py +1 -1
- warp/sim/integrator_featherstone.py +1 -1
- warp/sim/integrator_vbd.py +761 -322
- warp/sim/integrator_xpbd.py +1 -1
- warp/sim/model.py +265 -260
- warp/sim/utils.py +10 -7
- warp/sparse.py +303 -166
- warp/tape.py +54 -51
- warp/tests/cuda/test_conditional_captures.py +1046 -0
- warp/tests/cuda/test_streams.py +1 -1
- warp/tests/geometry/test_volume.py +2 -2
- warp/tests/interop/test_dlpack.py +9 -9
- warp/tests/interop/test_jax.py +0 -1
- warp/tests/run_coverage_serial.py +1 -1
- warp/tests/sim/disabled_kinematics.py +2 -2
- warp/tests/sim/{test_vbd.py → test_cloth.py} +378 -112
- warp/tests/sim/test_collision.py +159 -51
- warp/tests/sim/test_coloring.py +91 -2
- warp/tests/test_array.py +254 -2
- warp/tests/test_array_reduce.py +2 -2
- warp/tests/test_assert.py +53 -0
- warp/tests/test_atomic_cas.py +312 -0
- warp/tests/test_codegen.py +142 -19
- warp/tests/test_conditional.py +47 -1
- warp/tests/test_ctypes.py +0 -20
- warp/tests/test_devices.py +8 -0
- warp/tests/test_fabricarray.py +4 -2
- warp/tests/test_fem.py +58 -25
- warp/tests/test_func.py +42 -1
- warp/tests/test_grad.py +1 -1
- warp/tests/test_lerp.py +1 -3
- warp/tests/test_map.py +481 -0
- warp/tests/test_mat.py +23 -24
- warp/tests/test_quat.py +28 -15
- warp/tests/test_rounding.py +10 -38
- warp/tests/test_runlength_encode.py +7 -7
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +83 -2
- warp/tests/test_spatial.py +507 -1
- warp/tests/test_static.py +48 -0
- warp/tests/test_struct.py +2 -2
- warp/tests/test_tape.py +38 -0
- warp/tests/test_tuple.py +265 -0
- warp/tests/test_types.py +2 -2
- warp/tests/test_utils.py +24 -18
- warp/tests/test_vec.py +38 -408
- warp/tests/test_vec_constructors.py +325 -0
- warp/tests/tile/test_tile.py +438 -131
- warp/tests/tile/test_tile_mathdx.py +518 -14
- warp/tests/tile/test_tile_matmul.py +179 -0
- warp/tests/tile/test_tile_reduce.py +307 -5
- warp/tests/tile/test_tile_shared_memory.py +136 -7
- warp/tests/tile/test_tile_sort.py +121 -0
- warp/tests/unittest_suites.py +14 -6
- warp/types.py +462 -308
- warp/utils.py +647 -86
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/METADATA +20 -6
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/RECORD +190 -176
- warp/stubs.py +0 -3381
- warp/tests/sim/test_xpbd.py +0 -399
- warp/tests/test_mlp.py +0 -282
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/WHEEL +0 -0
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.7.2rc1.dist-info → warp_lang-1.8.1.dist-info}/top_level.txt +0 -0
|
@@ -0,0 +1,502 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
###########################################################################
|
|
17
|
+
# Example Tile Block Cholesky
|
|
18
|
+
#
|
|
19
|
+
# Shows how to write a kernel computing a blocked Cholesky factorization
|
|
20
|
+
# of a symmetric positive definite matrix using Warp Tile APIs.
|
|
21
|
+
#
|
|
22
|
+
###########################################################################
|
|
23
|
+
|
|
24
|
+
from functools import lru_cache
|
|
25
|
+
|
|
26
|
+
import numpy as np
|
|
27
|
+
|
|
28
|
+
import warp as wp
|
|
29
|
+
|
|
30
|
+
wp.set_module_options({"enable_backward": False})
|
|
31
|
+
|
|
32
|
+
|
|
33
|
+
@lru_cache(maxsize=None)
def create_blocked_cholesky_kernel(block_size: int):
    """Create the blocked Cholesky factorization kernel specialized for ``block_size``.

    The returned kernel closes over ``block_size`` and uses it as the edge length of
    every square tile it loads and stores. Results are memoized by ``lru_cache``, so
    requesting the same block size again returns the same kernel object.
    """

    @wp.kernel
    def blocked_cholesky_kernel(
        A: wp.array(dtype=float, ndim=2),
        L: wp.array(dtype=float, ndim=2),
        active_matrix_size_arr: wp.array(dtype=int, ndim=1),
    ):
        """
        Blocked Cholesky factorization A = L L^T of a symmetric positive definite matrix.

        The system size is read from active_matrix_size_arr[0]; the lower-triangular
        factor is written into L one block_size x block_size tile at a time.

        A is assumed to support reads of whole tiles.
        """
        block_index, thread_in_block = wp.tid()
        threads_per_block = wp.block_dim()

        active_size = active_matrix_size_arr[0]

        # Active size rounded up to a whole number of blocks
        n = ((active_size + block_size - 1) // block_size) * block_size

        # Walk down the diagonal, one block column per iteration
        for k in range(0, n, block_size):
            end = k + block_size

            # Diagonal block A[k:end, k:end]; loaded into shared storage because
            # individual entries are patched below before the block is updated.
            diag_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(k, k), storage="shared")
            # Block sticks out past the active size: replace the out-of-range
            # entries by identity entries (1 on the diagonal, 0 elsewhere).
            if k + block_size > active_size:
                tile_elems = block_size * block_size
                num_sweeps = (tile_elems + threads_per_block - 1) // threads_per_block

                for i in range(num_sweeps):
                    flat = thread_in_block + i * threads_per_block
                    # Wrap so that surplus threads land on a valid tile entry
                    flat = flat % tile_elems
                    row = flat // block_size
                    col = flat % block_size
                    entry = diag_tile[row, col]
                    if k + row >= active_size or k + col >= active_size:
                        entry = wp.where(row == col, float(1), float(0))
                    diag_tile[row, col] = entry

            # Remove the contributions of the block columns factorized so far
            if k > 0:
                for j in range(0, k, block_size):
                    L_kj_diag = wp.tile_load(L, shape=(block_size, block_size), offset=(k, j))
                    L_kj_diag_T = wp.tile_transpose(L_kj_diag)
                    diag_update = wp.tile_matmul(L_kj_diag, L_kj_diag_T)
                    diag_tile -= diag_update

            # Factorize the updated diagonal block and publish it
            L_kk_tile = wp.tile_cholesky(diag_tile)
            wp.tile_store(L, L_kk_tile, offset=(k, k))

            # Blocks of column k that lie below the diagonal block
            for i in range(end, n, block_size):
                panel_tile = wp.tile_load(A, shape=(block_size, block_size), offset=(i, k), storage="shared")
                # Same padding treatment as for the diagonal block
                if i + block_size > active_size or k + block_size > active_size:
                    tile_elems = block_size * block_size
                    num_sweeps = (tile_elems + threads_per_block - 1) // threads_per_block

                    for ii in range(num_sweeps):
                        flat = thread_in_block + ii * threads_per_block
                        flat = flat % tile_elems
                        row = flat // block_size
                        col = flat % block_size
                        entry = panel_tile[row, col]
                        if i + row >= active_size or k + col >= active_size:
                            entry = wp.where(i + row == k + col, float(1), float(0))
                        panel_tile[row, col] = entry

                if k > 0:
                    for j in range(0, k, block_size):
                        L_ij = wp.tile_load(L, shape=(block_size, block_size), offset=(i, j))
                        L_kj = wp.tile_load(L, shape=(block_size, block_size), offset=(k, j))
                        L_kj_T = wp.tile_transpose(L_kj)
                        panel_update = wp.tile_matmul(L_ij, L_kj_T)
                        panel_tile -= panel_update

                # L_ik satisfies L_ik L_kk^T = panel, i.e. L_kk L_ik^T = panel^T:
                # transpose, solve against the lower-triangular L_kk, transpose back.
                panel_T = wp.tile_transpose(panel_tile)
                solved_T = wp.tile_lower_solve(L_kk_tile, panel_T)
                sol_tile = wp.tile_transpose(solved_T)

                wp.tile_store(L, sol_tile, offset=(i, k))

    return blocked_cholesky_kernel
|
|
121
|
+
|
|
122
|
+
|
|
123
|
+
@lru_cache(maxsize=None)
def create_blocked_cholesky_solve_kernel(block_size: int):
    """Create the blocked triangular-solve kernel specialized for ``block_size``.

    The returned kernel closes over ``block_size`` as its tile edge length. Results are
    memoized by ``lru_cache``, so the same block size always yields the same kernel.
    """

    @wp.kernel
    def blocked_cholesky_solve_kernel(
        L: wp.array(dtype=float, ndim=2),
        b: wp.array(dtype=float, ndim=2),
        x: wp.array(dtype=float, ndim=2),
        y: wp.array(dtype=float, ndim=2),
        active_matrix_size_arr: wp.array(dtype=int, ndim=1),
    ):
        """
        Solves A x = b given the Cholesky factor L (A = L L^T) using
        blocked forward and backward substitution.

        The system size is read from active_matrix_size_arr[0]. Only column 0 of
        b, x and y is accessed: every right-hand-side tile has shape (block_size, 1),
        so a single right-hand side is solved per launch. y receives the intermediate
        solution of L y = b.
        """

        active_matrix_size = active_matrix_size_arr[0]

        # Round up active_matrix_size to next multiple of block_size
        n = ((active_matrix_size + block_size - 1) // block_size) * block_size

        # Forward substitution: solve L y = b
        for i in range(0, n, block_size):
            rhs_tile = wp.tile_load(b, shape=(block_size, 1), offset=(i, 0))
            # Subtract the contributions of the y blocks already solved
            if i > 0:
                for j in range(0, i, block_size):
                    L_block = wp.tile_load(L, shape=(block_size, block_size), offset=(i, j))
                    y_block = wp.tile_load(y, shape=(block_size, 1), offset=(j, 0))
                    Ly_block = wp.tile_matmul(L_block, y_block)
                    rhs_tile -= Ly_block
            L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i, i))
            y_tile = wp.tile_lower_solve(L_tile, rhs_tile)
            wp.tile_store(y, y_tile, offset=(i, 0))

        # Backward substitution: solve L^T x = y, starting from the last block row
        for i in range(n - block_size, -1, -block_size):
            i_start = i
            i_end = i_start + block_size
            rhs_tile = wp.tile_load(y, shape=(block_size, 1), offset=(i_start, 0))
            # Subtract the contributions of the x blocks below, which are already solved
            if i_end < n:
                for j in range(i_end, n, block_size):
                    # Block (i, j) of L^T is the transpose of block (j, i) of L
                    L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(j, i_start))
                    L_T_tile = wp.tile_transpose(L_tile)
                    x_tile = wp.tile_load(x, shape=(block_size, 1), offset=(j, 0))
                    L_T_x_tile = wp.tile_matmul(L_T_tile, x_tile)
                    rhs_tile -= L_T_x_tile
            L_tile = wp.tile_load(L, shape=(block_size, block_size), offset=(i_start, i_start))
            x_tile = wp.tile_upper_solve(wp.tile_transpose(L_tile), rhs_tile)
            wp.tile_store(x, x_tile, offset=(i_start, 0))

    return blocked_cholesky_solve_kernel
|
|
176
|
+
|
|
177
|
+
|
|
178
|
+
# TODO: Add batching support to solve multiple equation systems at once (one per thread block)
|
|
179
|
+
class BlockCholeskySolver:
    """
    Solves symmetric positive definite linear systems with a blocked Cholesky factorization.

    The solver owns the factor ``L`` and a scratch column ``y``, both sized for
    ``max_num_equations`` rounded up to a multiple of ``block_size``. ``factorize`` or
    ``factorize_dynamic`` fills ``L``; ``solve`` then runs the forward and backward
    substitution for one right-hand-side column.
    """

    def __init__(self, max_num_equations: int, block_size=16, device="cuda"):
        """
        Allocate the kernels and workspace.

        Args:
            max_num_equations: Largest system size to support; rounded up to the next
                multiple of ``block_size`` before the workspace is allocated.
            block_size: Edge length of the square tiles used by both kernels.
            device: Device on which the workspace is allocated and the kernels are launched.
        """
        self.block_size = block_size

        # Workspace dimensions are always a whole number of blocks
        self.max_num_equations = self._padded_size(max_num_equations)
        self.device = device

        self.num_threads_per_block_factorize = 128
        self.num_threads_per_block_solve = 64
        # Host-side copy of the active size; -1 means the size is only known on the device
        self.active_matrix_size_int = -1

        self.cholesky_kernel = create_blocked_cholesky_kernel(block_size)
        self.solve_kernel = create_blocked_cholesky_solve_kernel(block_size)

        # Allocate workspace arrays for factorization and solve
        self.L = wp.zeros(shape=(self.max_num_equations, self.max_num_equations), dtype=float, device=self.device)
        self.y = wp.zeros(shape=(self.max_num_equations, 1), dtype=float, device=self.device)  # temp memory
        self.active_matrix_size = wp.zeros(
            shape=(1,), dtype=int, device=self.device
        )  # array to hold active matrix size
        # Size array supplied by the caller of factorize_dynamic, if any
        self.active_matrix_size_external = None

    def _padded_size(self, num_equations: int) -> int:
        """Return ``num_equations`` rounded up to the next multiple of ``self.block_size``."""
        return ((num_equations + self.block_size - 1) // self.block_size) * self.block_size

    def factorize(self, A: wp.array(dtype=float, ndim=2), num_active_equations: int):
        """
        Computes the blocked Cholesky factorization of the symmetric positive definite
        matrix A for a system size known on the host.

        The lower-triangular factor (A = L L^T) is stored in ``self.L``; nothing is returned.

        Args:
            A: Square matrix with at least ``num_active_equations`` rounded up to a
                multiple of ``block_size`` rows.
            num_active_equations: Number of leading rows/columns of A forming the system.

        Raises:
            AssertionError: If the size exceeds the solver capacity or A has an unsuitable shape.
        """

        assert num_active_equations <= self.max_num_equations, (
            f"Number of active equations ({num_active_equations}) exceeds maximum allowed ({self.max_num_equations})"
        )

        padded_n = self._padded_size(num_active_equations)

        # Verify input dimensions
        assert A.shape[0] == A.shape[1], "Matrix A must be square"
        assert A.shape[0] >= padded_n, f"Matrix A must be at least {padded_n}x{padded_n} to accommodate padding"

        # Write the size straight into the device-side slot; no temporary array is needed
        self.active_matrix_size.fill_(num_active_equations)

        self.factorize_dynamic(A, self.active_matrix_size)

        # factorize_dynamic recorded our own array as "external" and reset the host-side
        # size; undo both so that solve() can run its shape checks.
        self.active_matrix_size_external = None
        self.active_matrix_size_int = num_active_equations

    def factorize_dynamic(self, A: wp.array(dtype=float, ndim=2), num_active_equations: wp.array(dtype=int, ndim=1)):
        """
        Computes the blocked Cholesky factorization of the symmetric positive definite
        matrix A for a system size stored in an array.

        The lower-triangular factor (A = L L^T) is stored in ``self.L``; nothing is returned.
        No shape checks are performed because the size is not read back to the host.

        Args:
            A: Matrix to factorize.
            num_active_equations: Array whose first element holds the system size. The same
                array is passed to the kernel again by a subsequent ``solve``.
        """

        self.active_matrix_size_external = num_active_equations
        self.active_matrix_size_int = -1

        wp.launch_tiled(
            self.cholesky_kernel,
            dim=1,
            inputs=[A, self.L, num_active_equations],
            block_dim=self.num_threads_per_block_factorize,
            device=self.device,
        )

    def solve(self, rhs: wp.array(dtype=float, ndim=2), result: wp.array(dtype=float, ndim=2)):
        """
        Solves A x = b given the Cholesky factor L (A = L L^T) using
        blocked forward and backward substitution.

        Args:
            rhs: Right-hand side b as a single column (shape ``(rows, 1)``).
            result: Output column that receives x (shape ``(rows, 1)``).

        Raises:
            AssertionError: If the system size is known on the host and ``rhs`` or
                ``result`` is not a column with enough rows for the padded size.
        """

        # Do safety checks but they can only be done if the matrix size is known on the host
        if self.active_matrix_size_int > 0:
            n = self.active_matrix_size_int
            padded_n = self._padded_size(n)

            # Verify input dimensions
            assert rhs.shape[1] == 1, "Matrix b must be a column vector"
            assert rhs.shape[0] >= padded_n, f"Matrix b must be at least {padded_n}x1 to accommodate padding"

            assert result.shape[1] == 1, "Matrix result must be a column vector"
            assert result.shape[0] >= padded_n, f"Matrix result must be at least {padded_n}x1 to accommodate padding"

        # Use the size array of the most recent factorization
        if self.active_matrix_size_external is not None:
            matrix_size = self.active_matrix_size_external
        else:
            matrix_size = self.active_matrix_size

        # Then solve the system using blocked_cholesky_solve kernel
        wp.launch_tiled(
            self.solve_kernel,
            dim=1,
            inputs=[self.L, rhs, result, self.y, matrix_size],
            block_dim=self.num_threads_per_block_solve,
            device=self.device,
        )
|
|
281
|
+
|
|
282
|
+
|
|
283
|
+
class CholeskySolverNumPy:
    """
    NumPy counterpart of the blocked solver: Cholesky factorization plus two
    triangular solves, operating on the leading sub-system of preallocated buffers.
    """

    def __init__(self, max_num_equations: int):
        """Allocate zero-filled buffers for systems of up to ``max_num_equations`` unknowns."""
        capacity = max_num_equations

        self.max_num_equations = capacity
        # Size of the most recently factorized system (0 until factorize is called)
        self.num_active_equations = 0

        # Workspace: the factor and the intermediate solution of L y = b
        self.L = np.zeros((capacity, capacity))
        self.y = np.zeros((capacity, 1))

    def factorize(self, A: np.ndarray, num_active_equations: int):
        """
        Factorize the leading ``num_active_equations`` x ``num_active_equations`` block of
        the symmetric positive definite matrix A.

        The lower-triangular factor (A = L L^T) is written into the matching leading
        block of ``self.L``; nothing is returned.
        """
        assert num_active_equations <= self.max_num_equations, (
            f"Number of active equations ({num_active_equations}) exceeds maximum allowed ({self.max_num_equations})"
        )

        self.num_active_equations = num_active_equations

        # Shape checks on the input matrix
        assert A.shape[0] == A.shape[1], "Matrix A must be square"
        assert A.shape[0] >= num_active_equations, (
            f"Matrix A must be at least {num_active_equations}x{num_active_equations}"
        )

        # Factorize only the active leading block
        active = slice(num_active_equations)
        self.L[active, active] = np.linalg.cholesky(A[active, active])

    def solve(self, rhs: np.ndarray, result: np.ndarray):
        """
        Solve A x = b for the most recently factorized system.

        ``rhs`` and ``result`` must be single-column 2-D arrays with at least as many
        rows as the active system; the first ``num_active_equations`` rows of
        ``result`` are overwritten with x, the remaining rows are left untouched.
        """
        assert self.num_active_equations <= self.max_num_equations, (
            f"Number of active equations ({self.num_active_equations}) exceeds maximum allowed ({self.max_num_equations})"
        )

        n = self.num_active_equations

        # Shape checks on the right-hand side ...
        assert rhs.shape[1] == 1, "Matrix b must be a column vector"
        assert rhs.shape[0] >= n, f"Matrix b must be at least {n}x1"

        # ... and on the output buffer
        assert result.shape[1] == 1, "Matrix result must be a column vector"
        assert result.shape[0] >= n, f"Matrix result must be at least {n}x1"

        lower = self.L[:n, :n]

        # Forward step: L y = b
        self.y[:n] = np.linalg.solve(lower, rhs[:n])

        # Backward step: L^T x = y
        result[:n] = np.linalg.solve(lower.T, self.y[:n])
|
|
343
|
+
|
|
344
|
+
|
|
345
|
+
def test_cholesky_solver(n, warp_solver: BlockCholeskySolver, device: str = "cuda"):
    """Solve one random SPD system of size ``n`` with NumPy and with ``warp_solver`` and print the errors.

    The inputs handed to Warp are padded to a multiple of the solver's block size and
    the padding is filled with random values, while all reported errors are measured
    on the unpadded ``n`` x ``n`` system only.
    """
    rng = np.random.default_rng(0)

    # Random SPD matrix: G G^T shifted by n on the diagonal
    A_full = rng.standard_normal((n, n))
    A_full = A_full @ A_full.T + n * np.eye(n)  # ensure SPD

    # Size after rounding up to a whole number of blocks
    block_size = warp_solver.block_size
    padded_n = ((n + block_size - 1) // block_size) * block_size
    padding = padded_n - n

    if padding > 0:
        # Embed A_full in the top-left corner of a larger matrix filled with random values
        A_padded = rng.standard_normal((padded_n, padded_n))
        A_padded[:n, :n] = A_full

        # Bottom-right corner: a small SPD block of its own
        padding_block = rng.standard_normal((padding, padding))
        padding_block = padding_block @ padding_block.T + padding * np.eye(padding)
        A_padded[n:, n:] = padding_block

        # Off-diagonal corners mirror each other so the padded matrix stays symmetric
        A_padded[n:, :n] = rng.standard_normal((padding, n))
        A_padded[:n, n:] = A_padded[n:, :n].T
    else:
        A_padded = A_full

    # Right-hand side, also followed by random padding
    b = rng.standard_normal(n)
    b_padded = rng.standard_normal(padded_n)
    b_padded[:n] = b

    print("\nSolving with NumPy:")
    # Reference solution and factor
    x = np.linalg.solve(A_full, b)
    L_full = np.linalg.cholesky(A_full)

    # Quality of the reference results
    err = np.linalg.norm(A_full - L_full @ L_full.T)
    res_norm = np.linalg.norm(b - A_full @ x)
    print(f"Cholesky factorization error: {err:.3e}")
    print(f"Solution residual norm: {res_norm:.3e}")

    print("\nSolving with Warp kernels:")
    # Upload the padded system in single precision
    A_wp = wp.array(A_padded, dtype=wp.float32, device=device)
    b_wp = wp.array(b_padded, dtype=wp.float32, device=device).reshape((padded_n, 1))
    x_wp = wp.zeros_like(b_wp)

    # Factorize, solve, and wait for the device to finish
    warp_solver.factorize(A_wp, n)
    warp_solver.solve(b_wp, x_wp)
    wp.synchronize()

    # Bring the results back to the host, dropping the padding of the solution
    x_warp = x_wp.numpy()[:n].squeeze()
    L_warp = warp_solver.L.numpy()

    # Same error measures as for the reference, plus the distance between both solutions
    L_active = L_warp[:n, :n]
    err_warp = np.linalg.norm(A_full - L_active @ L_active.T)
    res_norm_warp = np.linalg.norm(b - A_full @ x_warp)
    diff_norm = np.linalg.norm(x - x_warp)

    print(f"Warp Cholesky factorization error: {err_warp:.3e}")
    print(f"Warp solution residual norm: {res_norm_warp:.3e}")
    print(f"Difference between CPU and GPU solutions: {diff_norm:.3e}")
|
|
407
|
+
|
|
408
|
+
|
|
409
|
+
@wp.kernel
def assign_int_kernel(arr: wp.array(dtype=int, ndim=1), value: int):
    """Store ``value`` in ``arr[0]``; no other element is touched."""
    arr[0] = value
|
|
413
|
+
|
|
414
|
+
|
|
415
|
+
def test_cholesky_solver_graph_capture():
    """Capture factorize + solve for every system size from 1 to 1000 into one CUDA graph and launch it.

    The system size lives in a device array that a small kernel rewrites before each
    factorization, so the whole sequence can be recorded without host round trips.
    """
    wp.clear_kernel_cache()

    max_equations = 1000

    # Random SPD matrix and right-hand side on the host
    rng = np.random.default_rng(42)
    A_np = rng.standard_normal((max_equations, max_equations))
    A_np = A_np @ A_np.T + np.eye(max_equations) * max_equations  # Make SPD
    b_np = rng.standard_normal((max_equations, 1))

    device = "cuda"

    with wp.ScopedDevice(device):
        warp_solver = BlockCholeskySolver(max_equations, block_size=32)

        # Array dimensions must be a whole number of blocks
        block_size = warp_solver.block_size
        padded_n = ((max_equations + block_size - 1) // block_size) * block_size

        # Zero-initialized host buffers with the data in the leading part
        A_padded = np.zeros((padded_n, padded_n), dtype=np.float32)
        A_padded[:max_equations, :max_equations] = A_np

        b_padded = np.zeros((padded_n, 1), dtype=np.float32)
        b_padded[:max_equations, :] = b_np

        # Device copies of the padded system
        A_wp = wp.array(A_padded, dtype=wp.float32, ndim=2)
        b_wp = wp.array(b_padded, dtype=wp.float32, ndim=2)

        # Solution buffer and the one-element array carrying the system size
        x_wp = wp.zeros_like(b_wp)
        n_wp = wp.array([1], dtype=wp.int32)

        # Dedicated stream on which the graph is captured and replayed
        stream = wp.Stream(device)

        with wp.ScopedStream(stream):
            wp.capture_begin()
            try:
                # Record one size update, factorization and solve per system size
                for n in range(1, max_equations + 1):
                    wp.launch(assign_int_kernel, dim=1, inputs=[n_wp, n])

                    warp_solver.factorize_dynamic(A_wp, n_wp)

                    warp_solver.solve(b_wp, x_wp)

            finally:
                # Always close the capture, even if recording failed
                graph = wp.capture_end()

            # Replay everything that was recorded
            with wp.ScopedTimer("Launch graph", cuda_filter=wp.TIMING_GRAPH):
                wp.capture_launch(graph, stream=stream)

        wp.synchronize()
        print("Finished!")
|
|
481
|
+
|
|
482
|
+
|
|
483
|
+
if __name__ == "__main__":
    wp.clear_kernel_cache()

    # Flip to True to run the CUDA graph capture variant instead of the size sweep
    test_graph_capture = False

    if not test_graph_capture:
        device = "cpu"

        # System sizes to try; several are deliberately not multiples of the block size
        sizes = [32, 70, 128, 192, 257, 320, 401, 1000]

        # One solver, sized for the largest system, shared by all runs
        warp_solver = BlockCholeskySolver(max(sizes), block_size=16, device=device)

        for n in sizes:
            print(f"\nTesting system size n = {n}")
            test_cholesky_solver(n, warp_solver, device)

    else:
        test_cholesky_solver_graph_capture()
|
|
@@ -82,6 +82,7 @@ if __name__ == "__main__":
|
|
|
82
82
|
print("A\\n (Warp):\n", Y_wp.numpy())
|
|
83
83
|
print("A\\x (Numpy):\n", Y_np)
|
|
84
84
|
|
|
85
|
-
|
|
85
|
+
np.testing.assert_allclose(Y_wp.numpy(), Y_np)
|
|
86
|
+
np.testing.assert_allclose(L_wp.numpy(), L_np)
|
|
86
87
|
|
|
87
88
|
print("Example Tile Cholesky passed")
|
warp/fabric.py
CHANGED
|
@@ -22,12 +22,12 @@ from warp.types import *
|
|
|
22
22
|
|
|
23
23
|
|
|
24
24
|
class fabricbucket_t(ctypes.Structure):
|
|
25
|
-
_fields_ =
|
|
25
|
+
_fields_ = (
|
|
26
26
|
("index_start", ctypes.c_size_t),
|
|
27
27
|
("index_end", ctypes.c_size_t),
|
|
28
28
|
("ptr", ctypes.c_void_p),
|
|
29
29
|
("lengths", ctypes.c_void_p),
|
|
30
|
-
|
|
30
|
+
)
|
|
31
31
|
|
|
32
32
|
def __init__(self, index_start=0, index_end=0, ptr=None, lengths=None):
|
|
33
33
|
self.index_start = index_start
|
|
@@ -37,11 +37,11 @@ class fabricbucket_t(ctypes.Structure):
|
|
|
37
37
|
|
|
38
38
|
|
|
39
39
|
class fabricarray_t(ctypes.Structure):
|
|
40
|
-
_fields_ =
|
|
40
|
+
_fields_ = (
|
|
41
41
|
("buckets", ctypes.c_void_p), # array of fabricbucket_t on the correct device
|
|
42
42
|
("nbuckets", ctypes.c_size_t),
|
|
43
43
|
("size", ctypes.c_size_t),
|
|
44
|
-
|
|
44
|
+
)
|
|
45
45
|
|
|
46
46
|
def __init__(self, buckets=None, nbuckets=0, size=0):
|
|
47
47
|
self.buckets = ctypes.c_void_p(buckets)
|
|
@@ -50,11 +50,11 @@ class fabricarray_t(ctypes.Structure):
|
|
|
50
50
|
|
|
51
51
|
|
|
52
52
|
class indexedfabricarray_t(ctypes.Structure):
|
|
53
|
-
_fields_ =
|
|
53
|
+
_fields_ = (
|
|
54
54
|
("fa", fabricarray_t),
|
|
55
55
|
("indices", ctypes.c_void_p),
|
|
56
56
|
("size", ctypes.c_size_t),
|
|
57
|
-
|
|
57
|
+
)
|
|
58
58
|
|
|
59
59
|
def __init__(self, fa=None, indices=None):
|
|
60
60
|
if fa is None:
|
|
@@ -121,7 +121,7 @@ class fabricarray(noncontiguous_array_base[T]):
|
|
|
121
121
|
_vars = None
|
|
122
122
|
|
|
123
123
|
def __new__(cls, *args, **kwargs):
|
|
124
|
-
instance = super(
|
|
124
|
+
instance = super().__new__(cls)
|
|
125
125
|
instance.deleter = None
|
|
126
126
|
return instance
|
|
127
127
|
|
warp/fem/__init__.py
CHANGED
|
@@ -55,6 +55,8 @@ from .operator import (
|
|
|
55
55
|
degree,
|
|
56
56
|
div,
|
|
57
57
|
div_outer,
|
|
58
|
+
element_closest_point,
|
|
59
|
+
element_coordinates,
|
|
58
60
|
grad,
|
|
59
61
|
grad_average,
|
|
60
62
|
grad_jump,
|
|
@@ -65,8 +67,11 @@ from .operator import (
|
|
|
65
67
|
lookup,
|
|
66
68
|
measure,
|
|
67
69
|
measure_ratio,
|
|
70
|
+
node_count,
|
|
71
|
+
node_index,
|
|
68
72
|
normal,
|
|
69
73
|
outer,
|
|
74
|
+
partition_lookup,
|
|
70
75
|
position,
|
|
71
76
|
to_cell_side,
|
|
72
77
|
to_inner_cell,
|
warp/fem/adaptivity.py
CHANGED
|
@@ -50,7 +50,7 @@ def adaptive_nanogrid_from_hierarchy(
|
|
|
50
50
|
# Concatenate voxels for each grid
|
|
51
51
|
voxel_counts = [grid.get_voxel_count() for grid in grids]
|
|
52
52
|
|
|
53
|
-
voxel_offsets = np.cumsum(np.array([0
|
|
53
|
+
voxel_offsets = np.cumsum(np.array([0, *voxel_counts]))
|
|
54
54
|
merged_ijks = cache.borrow_temporary(temporary_store, dtype=wp.vec3i, shape=int(voxel_offsets[-1]), device=device)
|
|
55
55
|
for l in range(level_count):
|
|
56
56
|
voxel_count = voxel_counts[l]
|