warp-lang 1.9.0__py3-none-win_amd64.whl → 1.10.0rc2__py3-none-win_amd64.whl
This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.
Potentially problematic release.
This version of warp-lang might be problematic. Click here for more details.
- warp/__init__.py +301 -287
- warp/__init__.pyi +2220 -313
- warp/_src/__init__.py +14 -0
- warp/_src/autograd.py +1075 -0
- warp/_src/build.py +618 -0
- warp/_src/build_dll.py +640 -0
- warp/{builtins.py → _src/builtins.py} +1497 -226
- warp/_src/codegen.py +4359 -0
- warp/{config.py → _src/config.py} +178 -169
- warp/_src/constants.py +57 -0
- warp/_src/context.py +8294 -0
- warp/_src/dlpack.py +462 -0
- warp/_src/fabric.py +355 -0
- warp/_src/fem/__init__.py +14 -0
- warp/_src/fem/adaptivity.py +508 -0
- warp/_src/fem/cache.py +687 -0
- warp/_src/fem/dirichlet.py +188 -0
- warp/{fem → _src/fem}/domain.py +40 -30
- warp/_src/fem/field/__init__.py +131 -0
- warp/_src/fem/field/field.py +701 -0
- warp/{fem → _src/fem}/field/nodal_field.py +30 -15
- warp/{fem → _src/fem}/field/restriction.py +1 -1
- warp/{fem → _src/fem}/field/virtual.py +53 -27
- warp/_src/fem/geometry/__init__.py +32 -0
- warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +77 -163
- warp/_src/fem/geometry/closest_point.py +97 -0
- warp/{fem → _src/fem}/geometry/deformed_geometry.py +14 -22
- warp/{fem → _src/fem}/geometry/element.py +32 -10
- warp/{fem → _src/fem}/geometry/geometry.py +48 -20
- warp/{fem → _src/fem}/geometry/grid_2d.py +12 -23
- warp/{fem → _src/fem}/geometry/grid_3d.py +12 -23
- warp/{fem → _src/fem}/geometry/hexmesh.py +40 -63
- warp/{fem → _src/fem}/geometry/nanogrid.py +255 -248
- warp/{fem → _src/fem}/geometry/partition.py +121 -63
- warp/{fem → _src/fem}/geometry/quadmesh.py +26 -45
- warp/{fem → _src/fem}/geometry/tetmesh.py +40 -63
- warp/{fem → _src/fem}/geometry/trimesh.py +26 -45
- warp/{fem → _src/fem}/integrate.py +164 -158
- warp/_src/fem/linalg.py +383 -0
- warp/_src/fem/operator.py +396 -0
- warp/_src/fem/polynomial.py +229 -0
- warp/{fem → _src/fem}/quadrature/pic_quadrature.py +15 -20
- warp/{fem → _src/fem}/quadrature/quadrature.py +95 -47
- warp/_src/fem/space/__init__.py +248 -0
- warp/{fem → _src/fem}/space/basis_function_space.py +20 -11
- warp/_src/fem/space/basis_space.py +679 -0
- warp/{fem → _src/fem}/space/dof_mapper.py +3 -3
- warp/{fem → _src/fem}/space/function_space.py +14 -13
- warp/{fem → _src/fem}/space/grid_2d_function_space.py +4 -7
- warp/{fem → _src/fem}/space/grid_3d_function_space.py +4 -4
- warp/{fem → _src/fem}/space/hexmesh_function_space.py +4 -10
- warp/{fem → _src/fem}/space/nanogrid_function_space.py +3 -9
- warp/{fem → _src/fem}/space/partition.py +117 -60
- warp/{fem → _src/fem}/space/quadmesh_function_space.py +4 -10
- warp/{fem → _src/fem}/space/restriction.py +66 -33
- warp/_src/fem/space/shape/__init__.py +152 -0
- warp/{fem → _src/fem}/space/shape/cube_shape_function.py +9 -9
- warp/{fem → _src/fem}/space/shape/shape_function.py +8 -9
- warp/{fem → _src/fem}/space/shape/square_shape_function.py +6 -6
- warp/{fem → _src/fem}/space/shape/tet_shape_function.py +3 -3
- warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +3 -3
- warp/{fem → _src/fem}/space/tetmesh_function_space.py +3 -9
- warp/_src/fem/space/topology.py +459 -0
- warp/{fem → _src/fem}/space/trimesh_function_space.py +3 -9
- warp/_src/fem/types.py +112 -0
- warp/_src/fem/utils.py +486 -0
- warp/_src/jax.py +186 -0
- warp/_src/jax_experimental/__init__.py +14 -0
- warp/_src/jax_experimental/custom_call.py +387 -0
- warp/_src/jax_experimental/ffi.py +1284 -0
- warp/_src/jax_experimental/xla_ffi.py +656 -0
- warp/_src/marching_cubes.py +708 -0
- warp/_src/math.py +414 -0
- warp/_src/optim/__init__.py +14 -0
- warp/_src/optim/adam.py +163 -0
- warp/_src/optim/linear.py +1606 -0
- warp/_src/optim/sgd.py +112 -0
- warp/_src/paddle.py +406 -0
- warp/_src/render/__init__.py +14 -0
- warp/_src/render/imgui_manager.py +289 -0
- warp/_src/render/render_opengl.py +3636 -0
- warp/_src/render/render_usd.py +937 -0
- warp/_src/render/utils.py +160 -0
- warp/_src/sparse.py +2716 -0
- warp/_src/tape.py +1206 -0
- warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
- warp/_src/torch.py +391 -0
- warp/_src/types.py +5870 -0
- warp/_src/utils.py +1693 -0
- warp/autograd.py +12 -1054
- warp/bin/warp-clang.dll +0 -0
- warp/bin/warp.dll +0 -0
- warp/build.py +8 -588
- warp/build_dll.py +6 -471
- warp/codegen.py +6 -4246
- warp/constants.py +6 -39
- warp/context.py +12 -7851
- warp/dlpack.py +6 -444
- warp/examples/distributed/example_jacobi_mpi.py +4 -5
- warp/examples/fem/example_adaptive_grid.py +1 -1
- warp/examples/fem/example_apic_fluid.py +1 -1
- warp/examples/fem/example_burgers.py +8 -8
- warp/examples/fem/example_diffusion.py +1 -1
- warp/examples/fem/example_distortion_energy.py +1 -1
- warp/examples/fem/example_mixed_elasticity.py +2 -2
- warp/examples/fem/example_navier_stokes.py +1 -1
- warp/examples/fem/example_nonconforming_contact.py +7 -7
- warp/examples/fem/example_stokes.py +1 -1
- warp/examples/fem/example_stokes_transfer.py +1 -1
- warp/examples/fem/utils.py +2 -2
- warp/examples/interop/example_jax_callable.py +1 -1
- warp/examples/interop/example_jax_ffi_callback.py +1 -1
- warp/examples/interop/example_jax_kernel.py +3 -2
- warp/examples/tile/example_tile_mcgp.py +191 -0
- warp/fabric.py +6 -337
- warp/fem/__init__.py +159 -97
- warp/fem/adaptivity.py +7 -489
- warp/fem/cache.py +9 -648
- warp/fem/dirichlet.py +6 -184
- warp/fem/field/__init__.py +8 -109
- warp/fem/field/field.py +7 -652
- warp/fem/geometry/__init__.py +7 -18
- warp/fem/geometry/closest_point.py +11 -77
- warp/fem/linalg.py +18 -366
- warp/fem/operator.py +11 -369
- warp/fem/polynomial.py +9 -209
- warp/fem/space/__init__.py +5 -211
- warp/fem/space/basis_space.py +6 -662
- warp/fem/space/shape/__init__.py +41 -118
- warp/fem/space/topology.py +6 -437
- warp/fem/types.py +6 -81
- warp/fem/utils.py +11 -444
- warp/jax.py +8 -165
- warp/jax_experimental/__init__.py +14 -1
- warp/jax_experimental/custom_call.py +8 -342
- warp/jax_experimental/ffi.py +17 -853
- warp/jax_experimental/xla_ffi.py +5 -596
- warp/marching_cubes.py +5 -689
- warp/math.py +16 -393
- warp/native/array.h +385 -37
- warp/native/builtin.h +316 -39
- warp/native/bvh.cpp +43 -9
- warp/native/bvh.cu +62 -27
- warp/native/bvh.h +310 -309
- warp/native/clang/clang.cpp +102 -97
- warp/native/coloring.cpp +0 -1
- warp/native/crt.h +208 -0
- warp/native/exports.h +156 -0
- warp/native/hashgrid.cu +2 -0
- warp/native/intersect.h +24 -1
- warp/native/intersect_tri.h +44 -35
- warp/native/mat.h +1456 -276
- warp/native/mesh.cpp +4 -4
- warp/native/mesh.cu +4 -2
- warp/native/mesh.h +176 -61
- warp/native/quat.h +0 -52
- warp/native/scan.cu +2 -0
- warp/native/sort.cu +22 -13
- warp/native/sort.h +2 -0
- warp/native/sparse.cu +7 -3
- warp/native/spatial.h +12 -0
- warp/native/tile.h +837 -70
- warp/native/tile_radix_sort.h +1 -1
- warp/native/tile_reduce.h +394 -46
- warp/native/tile_scan.h +4 -4
- warp/native/vec.h +469 -53
- warp/native/version.h +23 -0
- warp/native/volume.cpp +1 -1
- warp/native/volume.cu +1 -0
- warp/native/volume.h +1 -1
- warp/native/volume_builder.cu +2 -0
- warp/native/warp.cpp +60 -32
- warp/native/warp.cu +313 -201
- warp/native/warp.h +14 -11
- warp/optim/__init__.py +6 -3
- warp/optim/adam.py +6 -145
- warp/optim/linear.py +14 -1585
- warp/optim/sgd.py +6 -94
- warp/paddle.py +6 -388
- warp/render/__init__.py +8 -4
- warp/render/imgui_manager.py +7 -267
- warp/render/render_opengl.py +6 -3616
- warp/render/render_usd.py +6 -918
- warp/render/utils.py +6 -142
- warp/sparse.py +37 -2563
- warp/tape.py +6 -1188
- warp/tests/__main__.py +1 -1
- warp/tests/cuda/test_async.py +4 -4
- warp/tests/cuda/test_conditional_captures.py +1 -1
- warp/tests/cuda/test_multigpu.py +1 -1
- warp/tests/cuda/test_streams.py +58 -1
- warp/tests/geometry/test_bvh.py +157 -22
- warp/tests/geometry/test_hash_grid.py +38 -0
- warp/tests/geometry/test_marching_cubes.py +0 -1
- warp/tests/geometry/test_mesh.py +5 -3
- warp/tests/geometry/test_mesh_query_aabb.py +5 -12
- warp/tests/geometry/test_mesh_query_point.py +5 -2
- warp/tests/geometry/test_mesh_query_ray.py +15 -3
- warp/tests/geometry/test_volume_write.py +5 -5
- warp/tests/interop/test_dlpack.py +14 -14
- warp/tests/interop/test_jax.py +1382 -79
- warp/tests/interop/test_paddle.py +1 -1
- warp/tests/test_adam.py +0 -1
- warp/tests/test_arithmetic.py +9 -9
- warp/tests/test_array.py +529 -100
- warp/tests/test_array_reduce.py +3 -3
- warp/tests/test_atomic.py +12 -8
- warp/tests/test_atomic_bitwise.py +209 -0
- warp/tests/test_atomic_cas.py +4 -4
- warp/tests/test_bool.py +2 -2
- warp/tests/test_builtins_resolution.py +5 -571
- warp/tests/test_codegen.py +34 -15
- warp/tests/test_conditional.py +1 -1
- warp/tests/test_context.py +6 -6
- warp/tests/test_copy.py +242 -161
- warp/tests/test_ctypes.py +3 -3
- warp/tests/test_devices.py +24 -2
- warp/tests/test_examples.py +16 -84
- warp/tests/test_fabricarray.py +35 -35
- warp/tests/test_fast_math.py +0 -2
- warp/tests/test_fem.py +60 -14
- warp/tests/test_fixedarray.py +3 -3
- warp/tests/test_func.py +8 -5
- warp/tests/test_generics.py +1 -1
- warp/tests/test_indexedarray.py +24 -24
- warp/tests/test_intersect.py +39 -9
- warp/tests/test_large.py +1 -1
- warp/tests/test_lerp.py +3 -1
- warp/tests/test_linear_solvers.py +1 -1
- warp/tests/test_map.py +49 -4
- warp/tests/test_mat.py +52 -62
- warp/tests/test_mat_constructors.py +4 -5
- warp/tests/test_mat_lite.py +1 -1
- warp/tests/test_mat_scalar_ops.py +121 -121
- warp/tests/test_math.py +34 -0
- warp/tests/test_module_aot.py +4 -4
- warp/tests/test_modules_lite.py +28 -2
- warp/tests/test_print.py +11 -11
- warp/tests/test_quat.py +93 -58
- warp/tests/test_runlength_encode.py +1 -1
- warp/tests/test_scalar_ops.py +38 -10
- warp/tests/test_smoothstep.py +1 -1
- warp/tests/test_sparse.py +126 -15
- warp/tests/test_spatial.py +105 -87
- warp/tests/test_special_values.py +6 -6
- warp/tests/test_static.py +7 -7
- warp/tests/test_struct.py +13 -2
- warp/tests/test_triangle_closest_point.py +48 -1
- warp/tests/test_tuple.py +96 -0
- warp/tests/test_types.py +82 -9
- warp/tests/test_utils.py +52 -52
- warp/tests/test_vec.py +29 -29
- warp/tests/test_vec_constructors.py +5 -5
- warp/tests/test_vec_scalar_ops.py +97 -97
- warp/tests/test_version.py +75 -0
- warp/tests/tile/test_tile.py +239 -0
- warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
- warp/tests/tile/test_tile_cholesky.py +7 -4
- warp/tests/tile/test_tile_load.py +26 -2
- warp/tests/tile/test_tile_mathdx.py +3 -3
- warp/tests/tile/test_tile_matmul.py +1 -1
- warp/tests/tile/test_tile_mlp.py +2 -4
- warp/tests/tile/test_tile_reduce.py +214 -13
- warp/tests/unittest_suites.py +6 -14
- warp/tests/unittest_utils.py +10 -9
- warp/tests/walkthrough_debug.py +3 -1
- warp/torch.py +6 -373
- warp/types.py +29 -5750
- warp/utils.py +10 -1659
- {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/METADATA +47 -103
- warp_lang-1.10.0rc2.dist-info/RECORD +468 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
- warp_lang-1.10.0rc2.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
- warp/examples/assets/cartpole.urdf +0 -110
- warp/examples/assets/crazyflie.usd +0 -0
- warp/examples/assets/nv_ant.xml +0 -92
- warp/examples/assets/nv_humanoid.xml +0 -183
- warp/examples/assets/quadruped.urdf +0 -268
- warp/examples/optim/example_bounce.py +0 -266
- warp/examples/optim/example_cloth_throw.py +0 -228
- warp/examples/optim/example_drone.py +0 -870
- warp/examples/optim/example_inverse_kinematics.py +0 -182
- warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
- warp/examples/optim/example_softbody_properties.py +0 -400
- warp/examples/optim/example_spring_cage.py +0 -245
- warp/examples/optim/example_trajectory.py +0 -227
- warp/examples/sim/example_cartpole.py +0 -143
- warp/examples/sim/example_cloth.py +0 -225
- warp/examples/sim/example_cloth_self_contact.py +0 -316
- warp/examples/sim/example_granular.py +0 -130
- warp/examples/sim/example_granular_collision_sdf.py +0 -202
- warp/examples/sim/example_jacobian_ik.py +0 -244
- warp/examples/sim/example_particle_chain.py +0 -124
- warp/examples/sim/example_quadruped.py +0 -203
- warp/examples/sim/example_rigid_chain.py +0 -203
- warp/examples/sim/example_rigid_contact.py +0 -195
- warp/examples/sim/example_rigid_force.py +0 -133
- warp/examples/sim/example_rigid_gyroscopic.py +0 -115
- warp/examples/sim/example_rigid_soft_contact.py +0 -140
- warp/examples/sim/example_soft_body.py +0 -196
- warp/examples/tile/example_tile_walker.py +0 -327
- warp/sim/__init__.py +0 -74
- warp/sim/articulation.py +0 -793
- warp/sim/collide.py +0 -2570
- warp/sim/graph_coloring.py +0 -307
- warp/sim/import_mjcf.py +0 -791
- warp/sim/import_snu.py +0 -227
- warp/sim/import_urdf.py +0 -579
- warp/sim/import_usd.py +0 -898
- warp/sim/inertia.py +0 -357
- warp/sim/integrator.py +0 -245
- warp/sim/integrator_euler.py +0 -2000
- warp/sim/integrator_featherstone.py +0 -2101
- warp/sim/integrator_vbd.py +0 -2487
- warp/sim/integrator_xpbd.py +0 -3295
- warp/sim/model.py +0 -4821
- warp/sim/particles.py +0 -121
- warp/sim/render.py +0 -431
- warp/sim/utils.py +0 -431
- warp/tests/sim/disabled_kinematics.py +0 -244
- warp/tests/sim/test_cloth.py +0 -863
- warp/tests/sim/test_collision.py +0 -743
- warp/tests/sim/test_coloring.py +0 -347
- warp/tests/sim/test_inertia.py +0 -161
- warp/tests/sim/test_model.py +0 -226
- warp/tests/sim/test_sim_grad.py +0 -287
- warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
- warp/tests/sim/test_sim_kinematics.py +0 -98
- warp/thirdparty/__init__.py +0 -0
- warp_lang-1.9.0.dist-info/RECORD +0 -456
- /warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
- /warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
- /warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
- /warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
- {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/WHEEL +0 -0
- {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/licenses/LICENSE.md +0 -0
- {warp_lang-1.9.0.dist-info → warp_lang-1.10.0rc2.dist-info}/top_level.txt +0 -0
warp/_src/build.py
ADDED
|
@@ -0,0 +1,618 @@
|
|
|
1
|
+
# SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
|
|
2
|
+
# SPDX-License-Identifier: Apache-2.0
|
|
3
|
+
#
|
|
4
|
+
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
5
|
+
# you may not use this file except in compliance with the License.
|
|
6
|
+
# You may obtain a copy of the License at
|
|
7
|
+
#
|
|
8
|
+
# http://www.apache.org/licenses/LICENSE-2.0
|
|
9
|
+
#
|
|
10
|
+
# Unless required by applicable law or agreed to in writing, software
|
|
11
|
+
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
12
|
+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
13
|
+
# See the License for the specific language governing permissions and
|
|
14
|
+
# limitations under the License.
|
|
15
|
+
|
|
16
|
+
import ctypes
|
|
17
|
+
import errno
|
|
18
|
+
import hashlib
|
|
19
|
+
import json
|
|
20
|
+
import os
|
|
21
|
+
import time
|
|
22
|
+
from pathlib import Path
|
|
23
|
+
|
|
24
|
+
import warp._src.config
|
|
25
|
+
from warp._src.thirdparty import appdirs
|
|
26
|
+
from warp._src.types import *
|
|
27
|
+
|
|
28
|
+
# Input-kind codes handed to the native linker; the values come from nvJitLink.h.
nvJitLink_input_type = {
    "cubin": 1,
    "ptx": 2,
    "ltoir": 3,
    "fatbin": 4,
    "object": 5,
    "library": 6,
}

# Parent of the directory that contains this file, with symlinks resolved.
warp_home = os.path.realpath(os.path.join(os.path.dirname(__file__), os.pardir))
|
|
32
|
+
|
|
33
|
+
|
|
34
|
+
# builds cuda source to PTX or CUBIN using NVRTC (output type determined by output_path extension)
|
|
35
|
+
def build_cuda(
    cu_path,
    arch,
    output_path,
    config="release",
    verify_fp=False,
    fast_math=False,
    fuse_fp=True,
    lineinfo=False,
    compile_time_trace=False,
    ltoirs=None,
    fatbins=None,
) -> None:
    """Compile the CUDA source file at ``cu_path`` and write the result to ``output_path``.

    When ``warp._src.config.llvm_cuda`` is set, the source is handed to the LLVM
    backend (``wp_compile_cuda``); in that path only the source, the paths and a
    literal ``False`` are forwarded, and the call's return value is not inspected.
    Otherwise the source is handed to ``wp_cuda_compile_program`` together with
    the optional LTO-IR / fatbin blobs to link.

    Args:
        cu_path: Path of the CUDA source file (read as bytes).
        arch: Target architecture, forwarded unchanged to the native compiler.
        output_path: Destination path for the compiled output.
        config: Build configuration; ``"debug"`` enables the native debug flag.
        verify_fp: Forwarded to the native compiler.
        fast_math: Forwarded to the native compiler.
        fuse_fp: Forwarded to the native compiler.
        lineinfo: Forwarded to the native compiler.
        compile_time_trace: Forwarded to the native compiler.
        ltoirs: Optional collection of LTO-IR byte blobs to link.
        fatbins: Optional collection of fatbin byte blobs to link.

    Raises:
        Exception: If ``wp_cuda_compile_program`` returns a non-zero error code.
    """
    with open(cu_path, "rb") as cu_file:
        source = cu_file.read()

    encoded_cu_path = cu_path.encode("utf-8")
    program_name = os.path.basename(cu_path).encode("utf-8")
    include_dir = os.path.join(warp_home, "native").encode("utf-8")
    encoded_output = output_path.encode("utf-8")

    if warp._src.config.llvm_cuda:
        warp._src.context.runtime.llvm.wp_compile_cuda(source, encoded_cu_path, include_dir, encoded_output, False)
        return

    lto_blobs = [] if ltoirs is None else ltoirs
    fatbin_blobs = [] if fatbins is None else fatbins

    # LTO-IR blobs come first, fatbins second; the type codes below follow the same order.
    blobs = [*lto_blobs, *fatbin_blobs]
    blob_count = len(blobs)
    type_codes = [nvJitLink_input_type["ltoir"]] * len(lto_blobs)
    type_codes += [nvJitLink_input_type["fatbin"]] * len(fatbin_blobs)

    blob_array = (ctypes.c_char_p * blob_count)(*blobs)
    blob_sizes = (ctypes.c_size_t * blob_count)(*map(len, blobs))
    blob_types = (ctypes.c_int * blob_count)(*type_codes)

    status = warp._src.context.runtime.core.wp_cuda_compile_program(
        source,
        program_name,
        arch,
        include_dir,
        0,
        None,
        config == "debug",
        warp._src.config.verbose,
        verify_fp,
        fast_math,
        fuse_fp,
        lineinfo,
        compile_time_trace,
        encoded_output,
        blob_count,
        blob_array,
        blob_sizes,
        blob_types,
    )
    if status != 0:
        raise Exception(f"CUDA kernel build failed with error code {status}")
|
|
94
|
+
|
|
95
|
+
|
|
96
|
+
# load PTX or CUBIN as a CUDA runtime module (input type determined by input_path extension)
|
|
97
|
+
def load_cuda(input_path, device):
    """Load a compiled CUDA binary from ``input_path`` as a module on ``device``.

    Args:
        input_path: Path of the compiled file; it is UTF-8 encoded before being
            passed to the native loader.
        device: Device object exposing ``is_cuda`` and ``context``.

    Returns:
        Whatever ``wp_cuda_load_module`` returns for the device context and path.

    Raises:
        RuntimeError: If ``device.is_cuda`` is false.
    """
    if device.is_cuda:
        load_module = warp._src.context.runtime.core.wp_cuda_load_module
        return load_module(device.context, input_path.encode("utf-8"))

    raise RuntimeError("Not a CUDA device")
|
|
102
|
+
|
|
103
|
+
|
|
104
|
+
def build_cpu(obj_path, cpp_path, mode="release", verify_fp=False, fast_math=False, fuse_fp=True):
    """Compile the C++ source file at ``cpp_path`` into the object file ``obj_path``.

    The source is read as bytes and handed to the LLVM backend
    (``wp_compile_cpp``) together with the ``native`` include directory.

    Args:
        obj_path: Destination path of the object file.
        cpp_path: Path of the C++ source file.
        mode: Build mode; ``"debug"`` enables the native debug flag.
        verify_fp: Forwarded to the native compiler.
        fast_math: Accepted for signature compatibility; it is not forwarded to
            the native compiler by this function.
        fuse_fp: Forwarded to the native compiler.

    Raises:
        Exception: If ``wp_compile_cpp`` returns a non-zero error code.
    """
    with open(cpp_path, "rb") as cpp:
        src = cpp.read()

    cpp_path = cpp_path.encode("utf-8")
    inc_path = os.path.join(warp_home, "native").encode("utf-8")
    obj_path = obj_path.encode("utf-8")

    err = warp._src.context.runtime.llvm.wp_compile_cpp(
        src,
        cpp_path,
        inc_path,
        obj_path,
        mode == "debug",
        verify_fp,
        fuse_fp,
        # Read the setting through warp._src.config like every other config
        # access in this module; that is the module this file imports.
        # NOTE(review): assumes warp.config is the same module object — confirm.
        warp._src.config.enable_tiles_in_stack_memory,
    )
    if err != 0:
        raise Exception(f"CPU kernel build failed with error code {err}")
|
|
123
|
+
|
|
124
|
+
|
|
125
|
+
def init_kernel_cache(path=None):
    """Select and create the kernel cache directory.

    The directory is chosen from, in order: the ``path`` argument, the
    ``WARP_CACHE_PATH`` environment variable, or a per-user cache location
    obtained from ``appdirs`` for the current Warp version. The result is stored
    in ``warp._src.config.kernel_cache_dir`` and the directory is created if it
    does not exist.

    This function is used during Warp initialization, but it can also be called
    directly to change the cache location. To change the default cache location,
    set warp.config.kernel_cache_dir before calling warp.init().

    Args:
        path: Optional cache directory; resolved with ``os.path.realpath``.
    """
    if path is not None:
        root = os.path.realpath(path)
    elif "WARP_CACHE_PATH" in os.environ:
        root = os.path.realpath(os.environ.get("WARP_CACHE_PATH"))
    else:
        root = appdirs.user_cache_dir(appname="warp", appauthor="NVIDIA", version=warp._src.config.version)

    long_path_prefix = "\\\\?\\"
    wants_prefix = os.name == "nt" and os.path.isabs(root) and not root.startswith(long_path_prefix)
    if wants_prefix:
        # Add the Windows long-path prefix, accounting for UNC shares.
        if root.startswith("\\\\"):
            # UNC path \\server\share\... -> \\?\UNC\server\share\...
            root = long_path_prefix + "UNC\\" + root.lstrip("\\")
        else:
            # Drive-letter path C:\... -> \\?\C:\...
            root = long_path_prefix + root

    warp._src.config.kernel_cache_dir = root
    os.makedirs(warp._src.config.kernel_cache_dir, exist_ok=True)
|
|
153
|
+
|
|
154
|
+
|
|
155
|
+
def clear_kernel_cache() -> None:
    """Delete generated kernel sources and compiler artifacts from the kernel cache.

    Only sub-directories of the cache directory whose names start with ``wp_``
    are removed, so the ``lto`` sub-directory (LTO artifacts) is left alone.
    Only the cache of the current Warp version is affected. Removal errors are
    ignored.
    """
    warp._src.context.init()

    import shutil

    runtime_ready = warp._src.context.runtime is not None
    assert runtime_ready, "The kernel cache directory is not configured; wp.init() has not been called yet or failed."

    for entry in os.listdir(warp._src.config.kernel_cache_dir):
        if not entry.startswith("wp_"):
            continue

        candidate = os.path.join(warp._src.config.kernel_cache_dir, entry)
        if os.path.isdir(candidate):
            # Remove the directory together with everything inside it.
            shutil.rmtree(candidate, ignore_errors=True)
|
|
175
|
+
|
|
176
|
+
|
|
177
|
+
def clear_lto_cache() -> None:
    """Delete previously generated LTO code from the LTO cache.

    The LTO cache is the ``lto`` sub-directory of the kernel cache directory.
    Only the cache of the current Warp version is affected. Removal errors are
    ignored, and nothing happens when the directory does not exist.
    """
    warp._src.context.init()

    import shutil

    runtime_ready = warp._src.context.runtime is not None
    assert runtime_ready, "The kernel cache directory is not configured; wp.init() has not been called yet or failed."

    lto_dir = os.path.join(warp._src.config.kernel_cache_dir, "lto")
    if not os.path.isdir(lto_dir):
        return

    # Remove the lto directory together with everything inside it.
    shutil.rmtree(lto_dir, ignore_errors=True)
|
|
195
|
+
|
|
196
|
+
|
|
197
|
+
def safe_rename(src, dst, attempts=5, delay=0.1):
    """Rename ``src`` to ``dst``, tolerating an existing destination and retrying on other errors.

    Args:
        src: Path to rename.
        dst: Target path.
        attempts: Maximum number of rename attempts.
        delay: Seconds to sleep between failed attempts.

    Returns:
        ``None``. The function returns quietly when the rename succeeds, when
        the destination already exists (``FileExistsError``), or when it is a
        non-empty directory (``ENOTEMPTY``).

    Raises:
        OSError: The error of the final attempt, after a message has been
            printed, when every attempt failed for another reason.
    """
    final_attempt = attempts - 1

    for attempt in range(attempts):
        try:
            os.rename(src, dst)
        except FileExistsError:
            return
        except OSError as e:
            if e.errno == errno.ENOTEMPTY:
                # The destination directory is already populated: assume another
                # process got there first. The caller copies its outputs into
                # that directory in a second step.
                return

            # Anything else is treated as a transient failure (e.g. access
            # denied). On Windows a rename occasionally fails because some
            # process holds a lock on a file being moved, so retry after a delay.
            if attempt >= final_attempt:
                print(
                    f"Could not update Warp cache with compiled binaries, trying to rename {src} to {dst}, error {e}"
                )
                raise e

            time.sleep(delay)
        else:
            return
|
|
222
|
+
|
|
223
|
+
|
|
224
|
+
def hash_symbol(symbol):
    """Return the hexadecimal SHA-256 digest of ``symbol`` encoded as UTF-8."""
    return hashlib.sha256(symbol.encode("utf-8")).hexdigest()
|
|
228
|
+
|
|
229
|
+
|
|
230
|
+
def get_lto_cache_dir():
    """Return the path of the ``lto`` sub-directory of the configured kernel cache directory."""
    return os.path.join(warp._src.config.kernel_cache_dir, "lto")
|
|
233
|
+
|
|
234
|
+
|
|
235
|
+
def get_cached_lto(path):
    """Read a cached LTO file.

    Args:
        path: Path of the cached file.

    Returns:
        The file contents as ``bytes``, or ``None`` when the file does not exist.
    """
    if not os.path.exists(path):
        return None

    try:
        with open(path, "rb") as f:
            return f.read()
    except FileNotFoundError:
        # The file vanished between the existence check and the open (for
        # example because another process cleared the cache): report a miss.
        return None
|
|
242
|
+
|
|
243
|
+
|
|
244
|
+
def get_cached_lto_meta(path, symbol):
    """Look up the metadata stored for ``symbol`` in a cached JSON file.

    Args:
        path: Path of the JSON metadata file.
        symbol: Key to look up in the top-level JSON object.

    Returns:
        The value stored under ``symbol``, or ``None`` when the file does not
        exist or holds no entry for ``symbol``; both cases count as a cache miss.
    """
    if not os.path.exists(path):
        return None

    try:
        with open(path, encoding="utf-8") as f:
            keys = json.load(f)
    except FileNotFoundError:
        # The file vanished between the existence check and the open.
        return None

    # A file without an entry for this symbol is a miss, not an error.
    return keys.get(symbol)
|
|
252
|
+
|
|
253
|
+
|
|
254
|
+
def _build_lto_base(lto_symbol, compile_func, builder, extra_files=None):
    """Generic LTO build function that handles caching, file operations and process management.

    The cache file names are derived from the first seven hex characters of the
    hash of ``lto_symbol``. When the ``.lto`` file and every extra file that has
    a getter are already cached, their contents are returned without compiling.
    Otherwise ``compile_func`` is run against a process-specific temporary
    directory whose outputs are then moved into the LTO cache directory.

    Args:
        lto_symbol: Unique identifier for the LTO operation.
        compile_func: Function to compile the specific LTO. It receives a
            dictionary mapping file extension to temporary output path and
            returns ``(success, outputs)`` where ``outputs`` maps extension to
            data.
        builder: Builder object to store results. It is not used by this
            function itself.
        extra_files: Dictionary of additional file types to handle (e.g.,
            ``{".meta": None, ".fatbin": None}``). Values are the functions to
            get the cached file data. The dictionary is not modified.

    Returns:
        Tuple where the first element is a success flag (``bool``). The second
        element is the LTO code as bytes (or ``None`` on failure).
        If ``extra_files`` is provided, additional elements follow in the same
        order as the keys in ``extra_files``:
          - ``".meta"``: int (shared memory bytes).
          - ``"_fatbin.lto"``: bytes (universal fatbin).
    """
    if extra_files is None:
        extra_files = {}

    # Hash symbol and set up the cache paths of the LTO file and the extra files
    stem = hash_symbol(lto_symbol)[:7]
    lto_dir = get_lto_cache_dir()
    lto_path = os.path.join(lto_dir, f"{stem}.lto")

    file_paths = {".lto": lto_path}
    for ext in extra_files:
        file_paths[ext] = os.path.join(lto_dir, f"{stem}{ext}")

    # Check if already built and cached on disk
    lto_code_data = get_cached_lto(lto_path)
    if lto_code_data is not None:
        # Collect cached data in a private copy so the caller's getters are
        # never overwritten; entries without a getter keep their original value.
        cached_extras = dict(extra_files)
        all_files_cached = True

        for ext, getter in extra_files.items():
            if not getter:
                continue

            cached_data = getter(file_paths[ext]) if os.path.exists(file_paths[ext]) else None
            if cached_data is None:
                # Missing or unusable extra file: fall through to a rebuild
                all_files_cached = False
                break

            cached_extras[ext] = cached_data

        if all_files_cached:
            if not extra_files:
                return (True, lto_code_data)
            return (True, lto_code_data, *[cached_extras[ext] for ext in extra_files])

    # Create process-dependent temporary build directory
    build_dir = f"{lto_dir}_p{os.getpid()}"
    Path(build_dir).mkdir(parents=True, exist_ok=True)

    # Set up temporary paths for the build outputs
    temp_file_paths = {ext: os.path.join(build_dir, os.path.basename(path)) for ext, path in file_paths.items()}

    # Compile LTO with the specialized function
    result, outputs = compile_func(temp_file_paths)

    if not result:
        # Clean up and fail
        for path in temp_file_paths.values():
            if Path(path).exists():
                Path(path).unlink()

        outputs[".lto"] = None
        for ext in extra_files:
            outputs[ext] = None
    else:
        # Move outputs to cache
        safe_rename(build_dir, lto_dir)

        # If build_dir couldn't be moved by a rename, move the outputs one-by-one to lto_dir
        if os.path.exists(lto_dir):
            for ext, path in file_paths.items():
                if not os.path.exists(path):
                    try:
                        # move output file to the destination lto dir
                        os.rename(temp_file_paths[ext], path)
                    except OSError:
                        # Deliberate best effort: another process likely
                        # updated the lto dir first (FileExistsError is an OSError)
                        pass

    # Clean up the temporary build directory
    import shutil

    shutil.rmtree(build_dir, ignore_errors=True)

    if not extra_files:
        return (result, outputs[".lto"])
    return (result, outputs[".lto"], *[outputs[ext] for ext in extra_files])
|
|
358
|
+
|
|
359
|
+
|
|
360
|
+
def build_lto_dot(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout, arch, num_threads, builder):
    """Build (or fetch) the LTO code for a tile matrix multiply and register it on ``builder``.

    Args:
        M: Forwarded to the native compiler and embedded in the symbol name.
        N: Forwarded to the native compiler and embedded in the symbol name.
        K: Forwarded to the native compiler and embedded in the symbol name.
        adtype: Element type of A; one of ``float16``, ``float32``, ``float64``,
            ``vec2h``, ``vec2f``, ``vec2d``.
        bdtype: Element type of B (same choices as ``adtype``).
        cdtype: Element type of C (same choices as ``adtype``).
        alayout: ``"colmajor"`` or ``"rowmajor"`` arrangement of A.
        blayout: ``"colmajor"`` or ``"rowmajor"`` arrangement of B.
        clayout: ``"colmajor"`` or ``"rowmajor"`` arrangement of C.
        arch: Target architecture; values above 121 are replaced by 120.
        num_threads: Forwarded to the native compiler and embedded in the symbol name.
        builder: Object with ``ltoirs`` and ``ltoirs_decl`` mappings; it is
            consulted as an in-module cache and updated after a successful build.

    Returns:
        Tuple ``(lto_symbol, lto_code_data)``.

    Raises:
        TypeError: For an unsupported dtype, or when the three dtypes do not
            share the same element-type code.
        ValueError: For an unsupported layout.
        RuntimeError: If the LTO compilation fails.
    """
    if arch > 121:
        arch = 120

    # Maps Python/Warp types to (C++ type name, precision code, element-type code)
    def describe_dtype(dtype):
        supported = (
            (float16, ("wp::float16", 3, 0)),
            (float32, ("wp::float32", 5, 0)),
            (float64, ("wp::float64", 6, 0)),
            (vec2h, ("wp::vec2h", 3, 1)),
            (vec2f, ("wp::vec2f", 5, 1)),
            (vec2d, ("wp::vec2d", 6, 1)),
        )
        for candidate, description in supported:
            if dtype == candidate:
                return description
        raise TypeError("Unsupported input type in tile_matmul")

    # Maps a layout name to its arrangement code
    def arrangement_code(layout):
        codes = (
            ("colmajor", 0),  # CUBLASDX_ARRANGEMENT_COL_MAJOR
            ("rowmajor", 1),  # CUBLASDX_ARRANGEMENT_ROW_MAJOR
        )
        for name, code in codes:
            if layout == name:
                return code
        raise ValueError("Unsupported layout in tile_matmul")

    a_dtype, a_prec, a_type = describe_dtype(adtype)
    b_dtype, b_prec, b_type = describe_dtype(bdtype)
    c_dtype, c_prec, c_type = describe_dtype(cdtype)
    a_arrangement = arrangement_code(alayout)
    b_arrangement = arrangement_code(blayout)
    c_arrangement = arrangement_code(clayout)

    if not (a_type == b_type and a_type == c_type):
        raise TypeError("tile_matmul(A, B, C) requires all inputs to be real or complex")

    element_type = a_type

    lto_symbol = f"dot_{M}_{N}_{K}_{arch}_{num_threads}_{a_arrangement}_{b_arrangement}_{c_arrangement}_{a_prec}_{b_prec}_{c_prec}_{element_type}"

    def compile_into(temp_paths):
        succeeded = warp._src.context.runtime.core.wp_cuda_compile_dot(
            temp_paths[".lto"].encode("utf-8"),
            lto_symbol.encode("utf-8"),
            0,
            None,
            None,
            arch,
            M,
            N,
            K,
            a_prec,
            b_prec,
            c_prec,
            element_type,
            a_arrangement,
            b_arrangement,
            c_arrangement,
            num_threads,
        )
        if not succeeded:
            return False, {}

        with open(temp_paths[".lto"], "rb") as lto_file:
            compiled = lto_file.read()
        return True, {".lto": compiled}

    # Early out if already cached in module
    if lto_symbol in builder.ltoirs:
        return lto_symbol, builder.ltoirs[lto_symbol]

    result, lto_code_data = _build_lto_base(lto_symbol, compile_into, builder, {})
    if not result:
        raise RuntimeError(
            f"Failed to compile LTO '{lto_symbol}'. "
            "Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details."
        )

    # Update builder
    builder.ltoirs[lto_symbol] = lto_code_data
    builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}({c_dtype}*, {a_dtype}*, {b_dtype}*, {c_dtype}*, {c_dtype}*);"

    return lto_symbol, lto_code_data
|
|
446
|
+
|
|
447
|
+
|
|
448
|
+
def build_lto_solver(
    M,
    N,
    NRHS,
    solver,
    solver_enum,
    side_enum,
    diag_enum,
    alayout,
    blayout,
    fill_mode,
    arch,
    precision_enum,
    num_threads,
    parameter_list,
    builder,
    smem_estimate_bytes=None,
):
    """Return the LTO object for a tile solver kernel, compiling it if needed.

    A symbol name is derived from the solver name and every parameter that
    goes into the compilation. If ``builder.ltoirs`` already holds that
    symbol it is returned as is; otherwise the kernel is compiled through
    ``wp_cuda_compile_solver`` and the LTO object, its declaration and the
    accompanying fatbin are stored on ``builder``.

    Args:
        M, N, NRHS: Problem dimensions forwarded to the compiler.
        solver: Solver name, used as the symbol prefix.
        solver_enum, side_enum, diag_enum, precision_enum, fill_mode: Integer
            codes forwarded to the compiler. Negative ``side_enum`` /
            ``diag_enum`` values appear as ``x`` in the symbol name.
        alayout, blayout: ``"colmajor"`` or ``"rowmajor"``.
        arch: Target architecture; values above 121 are compiled as 120.
        num_threads: Thread count forwarded to the compiler.
        parameter_list: Parenthesised C parameter list appended to the symbol
            name to form the declaration stored in ``builder.ltoirs_decl``.
        builder: Object exposing the ``ltoirs``, ``ltoirs_decl`` and
            ``fatbins`` mappings.
        smem_estimate_bytes: Optional shared-memory estimate; when given and
            compilation fails, it is compared against the device limit to add
            a hint to the error message.

    Returns:
        Tuple ``(lto_symbol, lto_code_data)``.

    Raises:
        ValueError: If a layout is unsupported.
        RuntimeError: If the LTO compilation fails.
    """
    arch = 120 if arch > 121 else arch

    def cusolverdx_arrangement_map(layout):
        if layout == "colmajor":
            return 0  # CUSOLVERDX_ARRANGEMENT_COL_MAJOR
        if layout == "rowmajor":
            return 1  # CUSOLVERDX_ARRANGEMENT_ROW_MAJOR
        # Name the solver and the offending value instead of tile_matmul.
        raise ValueError(f"Unsupported layout '{layout}' for solver '{solver}'")

    a_arrangement = cusolverdx_arrangement_map(alayout)
    b_arrangement = cusolverdx_arrangement_map(blayout)

    lto_symbol = f"{solver}_{M}_{N}_{NRHS}_{arch}_{num_threads}_{a_arrangement}_{b_arrangement}_{precision_enum}_{side_enum if side_enum >= 0 else 'x'}_{diag_enum if diag_enum >= 0 else 'x'}_{fill_mode}"

    def compile_lto_solver(temp_paths):
        # compile LTO
        result = warp._src.context.runtime.core.wp_cuda_compile_solver(
            temp_paths["_fatbin.lto"].encode("utf-8"),
            temp_paths[".lto"].encode("utf-8"),
            lto_symbol.encode("utf-8"),
            0,
            None,
            None,
            arch,
            M,
            N,
            NRHS,
            solver_enum,
            side_enum,
            diag_enum,
            precision_enum,
            a_arrangement,
            b_arrangement,
            fill_mode,
            num_threads,
        )

        if result:
            with open(temp_paths[".lto"], "rb") as f:
                lto_code_data = f.read()
            with open(temp_paths["_fatbin.lto"], "rb") as f:
                universal_fatbin_code_data = f.read()
            return True, {".lto": lto_code_data, "_fatbin.lto": universal_fatbin_code_data}
        return False, {}

    # Early out if already cached in module
    if lto_symbol in builder.ltoirs:
        lto_code_data = builder.ltoirs[lto_symbol]
    else:
        (result, lto_code_data, universal_fatbin_code_data) = _build_lto_base(
            lto_symbol, compile_lto_solver, builder, {"_fatbin.lto": get_cached_lto}
        )

        if not result:
            # The toolkit check comes first: its message does not use the
            # shared-memory hint, so there is no need to query devices for it.
            if warp._src.context.runtime.toolkit_version < (12, 6):
                raise RuntimeError(
                    "cuSolverDx requires CUDA Toolkit 12.6.3 or later. This version of Warp was built against CUDA Toolkit "
                    f"{warp._src.context.runtime.toolkit_version[0]}.{warp._src.context.runtime.toolkit_version[1]}. "
                    "Upgrade your CUDA Toolkit and rebuild Warp, or install a Warp wheel built with CUDA >= 12.6.3."
                )

            hint = ""
            if smem_estimate_bytes:
                # Fallback limit, reported as an estimate unless a device of
                # the target architecture supplies its own value below.
                max_smem_bytes = 232448
                max_smem_is_estimate = True
                for d in warp.get_cuda_devices():
                    if d.arch == arch:
                        # We can directly query the max shared memory for this device
                        queried_bytes = warp._src.context.runtime.core.wp_cuda_get_max_shared_memory(d.context)
                        if queried_bytes > 0:
                            max_smem_bytes = queried_bytes
                            max_smem_is_estimate = False
                            break
                if smem_estimate_bytes > max_smem_bytes:
                    source = "estimated limit" if max_smem_is_estimate else "device-reported limit"
                    hint = (
                        f"Estimated shared memory requirement is {smem_estimate_bytes}B, "
                        f"but the {source} is {max_smem_bytes}B. "
                        "The tile size(s) may be too large for this device."
                    )

            # Only add the separator when there is a hint, so the message
            # never contains a double space.
            hint_text = f"{hint} " if hint else ""
            raise RuntimeError(
                f"Failed to compile LTO '{lto_symbol}'. {hint_text}"
                "Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details."
            )

        # Update builder
        builder.ltoirs[lto_symbol] = lto_code_data
        builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}{parameter_list};"
        builder.fatbins[lto_symbol] = universal_fatbin_code_data

    return lto_symbol, lto_code_data
|
|
558
|
+
|
|
559
|
+
|
|
560
|
+
def build_lto_fft(arch, size, ept, direction, dir, precision, builder):
    """Return the LTO object for a tile FFT kernel, compiling it if needed.

    If ``builder`` already holds both the LTO object and the shared-memory
    size for the derived symbol, those are returned as is. Otherwise the
    kernel is compiled through ``wp_cuda_compile_fft``, the shared-memory size
    it reports is rounded with ``tile.round_up`` and written to a ``.meta``
    JSON file, and both values are stored on ``builder``.

    Args:
        arch: Target architecture; values above 121 are compiled as 120.
        size: FFT size forwarded to the compiler.
        ept: Forwarded to the compiler and included in the symbol name.
        direction: Used only in the symbol name.
        dir: Direction value forwarded to the compiler.
        precision: Precision code forwarded to the compiler.
        builder: Object exposing the ``ltoirs`` and ``shared_memory_bytes``
            mappings.

    Returns:
        Tuple ``(lto_symbol, lto_code_data, shared_memory_bytes)``.

    Raises:
        RuntimeError: If the LTO compilation fails.
    """
    arch = 120 if arch > 121 else arch

    lto_symbol = f"fft_{size}_{ept}_{arch}_{direction}_{precision}"

    def compile_lto_fft(temp_paths):
        # Filled in by the compiler with the kernel's shared-memory size.
        shared_memory_size = ctypes.c_int(0)

        result = warp._src.context.runtime.core.wp_cuda_compile_fft(
            temp_paths[".lto"].encode("utf-8"),
            lto_symbol.encode("utf-8"),
            0,
            None,
            None,
            arch,
            size,
            ept,
            dir,
            precision,
            ctypes.byref(shared_memory_size),
        )

        if result:
            with open(temp_paths[".lto"], "rb") as f:
                lto_code_data = f.read()

            shared_memory_bytes = tile.round_up(shared_memory_size.value)

            # output meta file with shared memory requirements for this lto_symbol
            meta = {}
            meta[lto_symbol] = shared_memory_bytes

            with open(temp_paths[".meta"], "w") as meta_file:
                json.dump(meta, meta_file)

            return True, {".lto": lto_code_data, ".meta": shared_memory_bytes}

        return False, {}

    # Early out if already cached in module
    if lto_symbol in builder.ltoirs and lto_symbol in builder.shared_memory_bytes:
        lto_code_data = builder.ltoirs[lto_symbol]
        shared_memory_bytes = builder.shared_memory_bytes[lto_symbol]
    else:
        (result, lto_code_data, shared_memory_bytes) = _build_lto_base(
            lto_symbol, compile_lto_fft, builder, {".meta": lambda path: get_cached_lto_meta(path, lto_symbol)}
        )

        if not result:
            # The trailing space keeps the two sentences from running together.
            raise RuntimeError(
                f"Failed to compile LTO '{lto_symbol}'. "
                "Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details."
            )

        # Update builder
        builder.ltoirs[lto_symbol] = lto_code_data
        builder.shared_memory_bytes[lto_symbol] = shared_memory_bytes

    return lto_symbol, lto_code_data, shared_memory_bytes
|