warp-lang 1.9.1__py3-none-manylinux_2_34_aarch64.whl → 1.10.0rc2__py3-none-manylinux_2_34_aarch64.whl

This diff represents the content of publicly available package versions that have been released to one of the supported registries. The information contained in this diff is provided for informational purposes only and reflects changes between package versions as they appear in their respective public registries.

Potentially problematic release.


This version of warp-lang might be problematic. Click here for more details.

Files changed (346) hide show
  1. warp/__init__.py +301 -287
  2. warp/__init__.pyi +794 -305
  3. warp/_src/__init__.py +14 -0
  4. warp/_src/autograd.py +1075 -0
  5. warp/_src/build.py +618 -0
  6. warp/_src/build_dll.py +640 -0
  7. warp/{builtins.py → _src/builtins.py} +1382 -377
  8. warp/_src/codegen.py +4359 -0
  9. warp/{config.py → _src/config.py} +178 -169
  10. warp/_src/constants.py +57 -0
  11. warp/_src/context.py +8294 -0
  12. warp/_src/dlpack.py +462 -0
  13. warp/_src/fabric.py +355 -0
  14. warp/_src/fem/__init__.py +14 -0
  15. warp/_src/fem/adaptivity.py +508 -0
  16. warp/_src/fem/cache.py +687 -0
  17. warp/_src/fem/dirichlet.py +188 -0
  18. warp/{fem → _src/fem}/domain.py +40 -30
  19. warp/_src/fem/field/__init__.py +131 -0
  20. warp/_src/fem/field/field.py +701 -0
  21. warp/{fem → _src/fem}/field/nodal_field.py +30 -15
  22. warp/{fem → _src/fem}/field/restriction.py +1 -1
  23. warp/{fem → _src/fem}/field/virtual.py +53 -27
  24. warp/_src/fem/geometry/__init__.py +32 -0
  25. warp/{fem → _src/fem}/geometry/adaptive_nanogrid.py +77 -163
  26. warp/_src/fem/geometry/closest_point.py +97 -0
  27. warp/{fem → _src/fem}/geometry/deformed_geometry.py +14 -22
  28. warp/{fem → _src/fem}/geometry/element.py +32 -10
  29. warp/{fem → _src/fem}/geometry/geometry.py +48 -20
  30. warp/{fem → _src/fem}/geometry/grid_2d.py +12 -23
  31. warp/{fem → _src/fem}/geometry/grid_3d.py +12 -23
  32. warp/{fem → _src/fem}/geometry/hexmesh.py +40 -63
  33. warp/{fem → _src/fem}/geometry/nanogrid.py +255 -248
  34. warp/{fem → _src/fem}/geometry/partition.py +121 -63
  35. warp/{fem → _src/fem}/geometry/quadmesh.py +26 -45
  36. warp/{fem → _src/fem}/geometry/tetmesh.py +40 -63
  37. warp/{fem → _src/fem}/geometry/trimesh.py +26 -45
  38. warp/{fem → _src/fem}/integrate.py +164 -158
  39. warp/_src/fem/linalg.py +383 -0
  40. warp/_src/fem/operator.py +396 -0
  41. warp/_src/fem/polynomial.py +229 -0
  42. warp/{fem → _src/fem}/quadrature/pic_quadrature.py +15 -20
  43. warp/{fem → _src/fem}/quadrature/quadrature.py +95 -47
  44. warp/_src/fem/space/__init__.py +248 -0
  45. warp/{fem → _src/fem}/space/basis_function_space.py +20 -11
  46. warp/_src/fem/space/basis_space.py +679 -0
  47. warp/{fem → _src/fem}/space/dof_mapper.py +3 -3
  48. warp/{fem → _src/fem}/space/function_space.py +14 -13
  49. warp/{fem → _src/fem}/space/grid_2d_function_space.py +4 -7
  50. warp/{fem → _src/fem}/space/grid_3d_function_space.py +4 -4
  51. warp/{fem → _src/fem}/space/hexmesh_function_space.py +4 -10
  52. warp/{fem → _src/fem}/space/nanogrid_function_space.py +3 -9
  53. warp/{fem → _src/fem}/space/partition.py +117 -60
  54. warp/{fem → _src/fem}/space/quadmesh_function_space.py +4 -10
  55. warp/{fem → _src/fem}/space/restriction.py +66 -33
  56. warp/_src/fem/space/shape/__init__.py +152 -0
  57. warp/{fem → _src/fem}/space/shape/cube_shape_function.py +9 -9
  58. warp/{fem → _src/fem}/space/shape/shape_function.py +8 -9
  59. warp/{fem → _src/fem}/space/shape/square_shape_function.py +6 -6
  60. warp/{fem → _src/fem}/space/shape/tet_shape_function.py +3 -3
  61. warp/{fem → _src/fem}/space/shape/triangle_shape_function.py +3 -3
  62. warp/{fem → _src/fem}/space/tetmesh_function_space.py +3 -9
  63. warp/_src/fem/space/topology.py +459 -0
  64. warp/{fem → _src/fem}/space/trimesh_function_space.py +3 -9
  65. warp/_src/fem/types.py +112 -0
  66. warp/_src/fem/utils.py +486 -0
  67. warp/_src/jax.py +186 -0
  68. warp/_src/jax_experimental/__init__.py +14 -0
  69. warp/_src/jax_experimental/custom_call.py +387 -0
  70. warp/_src/jax_experimental/ffi.py +1284 -0
  71. warp/_src/jax_experimental/xla_ffi.py +656 -0
  72. warp/_src/marching_cubes.py +708 -0
  73. warp/_src/math.py +414 -0
  74. warp/_src/optim/__init__.py +14 -0
  75. warp/_src/optim/adam.py +163 -0
  76. warp/_src/optim/linear.py +1606 -0
  77. warp/_src/optim/sgd.py +112 -0
  78. warp/_src/paddle.py +406 -0
  79. warp/_src/render/__init__.py +14 -0
  80. warp/_src/render/imgui_manager.py +289 -0
  81. warp/_src/render/render_opengl.py +3636 -0
  82. warp/_src/render/render_usd.py +937 -0
  83. warp/_src/render/utils.py +160 -0
  84. warp/_src/sparse.py +2716 -0
  85. warp/_src/tape.py +1206 -0
  86. warp/{thirdparty → _src/thirdparty}/unittest_parallel.py +9 -2
  87. warp/_src/torch.py +391 -0
  88. warp/_src/types.py +5870 -0
  89. warp/_src/utils.py +1693 -0
  90. warp/autograd.py +12 -1054
  91. warp/bin/warp-clang.so +0 -0
  92. warp/bin/warp.so +0 -0
  93. warp/build.py +8 -588
  94. warp/build_dll.py +6 -721
  95. warp/codegen.py +6 -4251
  96. warp/constants.py +6 -39
  97. warp/context.py +12 -8062
  98. warp/dlpack.py +6 -444
  99. warp/examples/distributed/example_jacobi_mpi.py +4 -5
  100. warp/examples/fem/example_adaptive_grid.py +1 -1
  101. warp/examples/fem/example_apic_fluid.py +1 -1
  102. warp/examples/fem/example_burgers.py +8 -8
  103. warp/examples/fem/example_diffusion.py +1 -1
  104. warp/examples/fem/example_distortion_energy.py +1 -1
  105. warp/examples/fem/example_mixed_elasticity.py +2 -2
  106. warp/examples/fem/example_navier_stokes.py +1 -1
  107. warp/examples/fem/example_nonconforming_contact.py +7 -7
  108. warp/examples/fem/example_stokes.py +1 -1
  109. warp/examples/fem/example_stokes_transfer.py +1 -1
  110. warp/examples/fem/utils.py +2 -2
  111. warp/examples/interop/example_jax_callable.py +1 -1
  112. warp/examples/interop/example_jax_ffi_callback.py +1 -1
  113. warp/examples/interop/example_jax_kernel.py +1 -1
  114. warp/examples/tile/example_tile_mcgp.py +191 -0
  115. warp/fabric.py +6 -337
  116. warp/fem/__init__.py +159 -97
  117. warp/fem/adaptivity.py +7 -489
  118. warp/fem/cache.py +9 -648
  119. warp/fem/dirichlet.py +6 -184
  120. warp/fem/field/__init__.py +8 -109
  121. warp/fem/field/field.py +7 -652
  122. warp/fem/geometry/__init__.py +7 -18
  123. warp/fem/geometry/closest_point.py +11 -77
  124. warp/fem/linalg.py +18 -366
  125. warp/fem/operator.py +11 -369
  126. warp/fem/polynomial.py +9 -209
  127. warp/fem/space/__init__.py +5 -211
  128. warp/fem/space/basis_space.py +6 -662
  129. warp/fem/space/shape/__init__.py +41 -118
  130. warp/fem/space/topology.py +6 -437
  131. warp/fem/types.py +6 -81
  132. warp/fem/utils.py +11 -444
  133. warp/jax.py +8 -165
  134. warp/jax_experimental/__init__.py +14 -1
  135. warp/jax_experimental/custom_call.py +8 -365
  136. warp/jax_experimental/ffi.py +17 -873
  137. warp/jax_experimental/xla_ffi.py +5 -605
  138. warp/marching_cubes.py +5 -689
  139. warp/math.py +16 -393
  140. warp/native/array.h +385 -37
  141. warp/native/builtin.h +314 -37
  142. warp/native/bvh.cpp +43 -9
  143. warp/native/bvh.cu +62 -27
  144. warp/native/bvh.h +310 -309
  145. warp/native/clang/clang.cpp +102 -97
  146. warp/native/coloring.cpp +0 -1
  147. warp/native/crt.h +208 -0
  148. warp/native/exports.h +156 -0
  149. warp/native/hashgrid.cu +2 -0
  150. warp/native/intersect.h +24 -1
  151. warp/native/intersect_tri.h +44 -35
  152. warp/native/mat.h +1456 -276
  153. warp/native/mesh.cpp +4 -4
  154. warp/native/mesh.cu +4 -2
  155. warp/native/mesh.h +176 -61
  156. warp/native/quat.h +0 -52
  157. warp/native/scan.cu +2 -0
  158. warp/native/sparse.cu +7 -3
  159. warp/native/spatial.h +12 -0
  160. warp/native/tile.h +681 -89
  161. warp/native/tile_radix_sort.h +1 -1
  162. warp/native/tile_reduce.h +394 -46
  163. warp/native/tile_scan.h +4 -4
  164. warp/native/vec.h +469 -0
  165. warp/native/version.h +23 -0
  166. warp/native/volume.cpp +1 -1
  167. warp/native/volume.cu +1 -0
  168. warp/native/volume.h +1 -1
  169. warp/native/volume_builder.cu +2 -0
  170. warp/native/warp.cpp +57 -29
  171. warp/native/warp.cu +253 -171
  172. warp/native/warp.h +11 -8
  173. warp/optim/__init__.py +6 -3
  174. warp/optim/adam.py +6 -145
  175. warp/optim/linear.py +14 -1585
  176. warp/optim/sgd.py +6 -94
  177. warp/paddle.py +6 -388
  178. warp/render/__init__.py +8 -4
  179. warp/render/imgui_manager.py +7 -267
  180. warp/render/render_opengl.py +6 -3618
  181. warp/render/render_usd.py +6 -919
  182. warp/render/utils.py +6 -142
  183. warp/sparse.py +37 -2563
  184. warp/tape.py +6 -1188
  185. warp/tests/__main__.py +1 -1
  186. warp/tests/cuda/test_async.py +4 -4
  187. warp/tests/cuda/test_conditional_captures.py +1 -1
  188. warp/tests/cuda/test_multigpu.py +1 -1
  189. warp/tests/cuda/test_streams.py +58 -1
  190. warp/tests/geometry/test_bvh.py +157 -22
  191. warp/tests/geometry/test_marching_cubes.py +0 -1
  192. warp/tests/geometry/test_mesh.py +5 -3
  193. warp/tests/geometry/test_mesh_query_aabb.py +5 -12
  194. warp/tests/geometry/test_mesh_query_point.py +5 -2
  195. warp/tests/geometry/test_mesh_query_ray.py +15 -3
  196. warp/tests/geometry/test_volume_write.py +5 -5
  197. warp/tests/interop/test_dlpack.py +14 -14
  198. warp/tests/interop/test_jax.py +772 -49
  199. warp/tests/interop/test_paddle.py +1 -1
  200. warp/tests/test_adam.py +0 -1
  201. warp/tests/test_arithmetic.py +9 -9
  202. warp/tests/test_array.py +527 -100
  203. warp/tests/test_array_reduce.py +3 -3
  204. warp/tests/test_atomic.py +12 -8
  205. warp/tests/test_atomic_bitwise.py +209 -0
  206. warp/tests/test_atomic_cas.py +4 -4
  207. warp/tests/test_bool.py +2 -2
  208. warp/tests/test_builtins_resolution.py +5 -571
  209. warp/tests/test_codegen.py +33 -14
  210. warp/tests/test_conditional.py +1 -1
  211. warp/tests/test_context.py +6 -6
  212. warp/tests/test_copy.py +242 -161
  213. warp/tests/test_ctypes.py +3 -3
  214. warp/tests/test_devices.py +24 -2
  215. warp/tests/test_examples.py +16 -84
  216. warp/tests/test_fabricarray.py +35 -35
  217. warp/tests/test_fast_math.py +0 -2
  218. warp/tests/test_fem.py +56 -10
  219. warp/tests/test_fixedarray.py +3 -3
  220. warp/tests/test_func.py +8 -5
  221. warp/tests/test_generics.py +1 -1
  222. warp/tests/test_indexedarray.py +24 -24
  223. warp/tests/test_intersect.py +39 -9
  224. warp/tests/test_large.py +1 -1
  225. warp/tests/test_lerp.py +3 -1
  226. warp/tests/test_linear_solvers.py +1 -1
  227. warp/tests/test_map.py +35 -4
  228. warp/tests/test_mat.py +52 -62
  229. warp/tests/test_mat_constructors.py +4 -5
  230. warp/tests/test_mat_lite.py +1 -1
  231. warp/tests/test_mat_scalar_ops.py +121 -121
  232. warp/tests/test_math.py +34 -0
  233. warp/tests/test_module_aot.py +4 -4
  234. warp/tests/test_modules_lite.py +28 -2
  235. warp/tests/test_print.py +11 -11
  236. warp/tests/test_quat.py +93 -58
  237. warp/tests/test_runlength_encode.py +1 -1
  238. warp/tests/test_scalar_ops.py +38 -10
  239. warp/tests/test_smoothstep.py +1 -1
  240. warp/tests/test_sparse.py +126 -15
  241. warp/tests/test_spatial.py +105 -87
  242. warp/tests/test_special_values.py +6 -6
  243. warp/tests/test_static.py +7 -7
  244. warp/tests/test_struct.py +13 -2
  245. warp/tests/test_triangle_closest_point.py +48 -1
  246. warp/tests/test_types.py +27 -15
  247. warp/tests/test_utils.py +52 -52
  248. warp/tests/test_vec.py +29 -29
  249. warp/tests/test_vec_constructors.py +5 -5
  250. warp/tests/test_vec_scalar_ops.py +97 -97
  251. warp/tests/test_version.py +75 -0
  252. warp/tests/tile/test_tile.py +178 -0
  253. warp/tests/tile/test_tile_atomic_bitwise.py +403 -0
  254. warp/tests/tile/test_tile_cholesky.py +7 -4
  255. warp/tests/tile/test_tile_load.py +26 -2
  256. warp/tests/tile/test_tile_mathdx.py +3 -3
  257. warp/tests/tile/test_tile_matmul.py +1 -1
  258. warp/tests/tile/test_tile_mlp.py +2 -4
  259. warp/tests/tile/test_tile_reduce.py +214 -13
  260. warp/tests/unittest_suites.py +6 -14
  261. warp/tests/unittest_utils.py +10 -9
  262. warp/tests/walkthrough_debug.py +3 -1
  263. warp/torch.py +6 -373
  264. warp/types.py +29 -5764
  265. warp/utils.py +10 -1659
  266. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/METADATA +46 -99
  267. warp_lang-1.10.0rc2.dist-info/RECORD +468 -0
  268. warp_lang-1.10.0rc2.dist-info/licenses/licenses/Gaia-LICENSE.txt +6 -0
  269. warp_lang-1.10.0rc2.dist-info/licenses/licenses/appdirs-LICENSE.txt +22 -0
  270. warp_lang-1.10.0rc2.dist-info/licenses/licenses/asset_pixel_jpg-LICENSE.txt +3 -0
  271. warp_lang-1.10.0rc2.dist-info/licenses/licenses/cuda-LICENSE.txt +1582 -0
  272. warp_lang-1.10.0rc2.dist-info/licenses/licenses/dlpack-LICENSE.txt +201 -0
  273. warp_lang-1.10.0rc2.dist-info/licenses/licenses/fp16-LICENSE.txt +28 -0
  274. warp_lang-1.10.0rc2.dist-info/licenses/licenses/libmathdx-LICENSE.txt +220 -0
  275. warp_lang-1.10.0rc2.dist-info/licenses/licenses/llvm-LICENSE.txt +279 -0
  276. warp_lang-1.10.0rc2.dist-info/licenses/licenses/moller-LICENSE.txt +16 -0
  277. warp_lang-1.10.0rc2.dist-info/licenses/licenses/nanovdb-LICENSE.txt +2 -0
  278. warp_lang-1.10.0rc2.dist-info/licenses/licenses/nvrtc-LICENSE.txt +1592 -0
  279. warp_lang-1.10.0rc2.dist-info/licenses/licenses/svd-LICENSE.txt +23 -0
  280. warp_lang-1.10.0rc2.dist-info/licenses/licenses/unittest_parallel-LICENSE.txt +21 -0
  281. warp_lang-1.10.0rc2.dist-info/licenses/licenses/usd-LICENSE.txt +213 -0
  282. warp_lang-1.10.0rc2.dist-info/licenses/licenses/windingnumber-LICENSE.txt +21 -0
  283. warp/examples/assets/cartpole.urdf +0 -110
  284. warp/examples/assets/crazyflie.usd +0 -0
  285. warp/examples/assets/nv_ant.xml +0 -92
  286. warp/examples/assets/nv_humanoid.xml +0 -183
  287. warp/examples/assets/quadruped.urdf +0 -268
  288. warp/examples/optim/example_bounce.py +0 -266
  289. warp/examples/optim/example_cloth_throw.py +0 -228
  290. warp/examples/optim/example_drone.py +0 -870
  291. warp/examples/optim/example_inverse_kinematics.py +0 -182
  292. warp/examples/optim/example_inverse_kinematics_torch.py +0 -191
  293. warp/examples/optim/example_softbody_properties.py +0 -400
  294. warp/examples/optim/example_spring_cage.py +0 -245
  295. warp/examples/optim/example_trajectory.py +0 -227
  296. warp/examples/sim/example_cartpole.py +0 -143
  297. warp/examples/sim/example_cloth.py +0 -225
  298. warp/examples/sim/example_cloth_self_contact.py +0 -316
  299. warp/examples/sim/example_granular.py +0 -130
  300. warp/examples/sim/example_granular_collision_sdf.py +0 -202
  301. warp/examples/sim/example_jacobian_ik.py +0 -244
  302. warp/examples/sim/example_particle_chain.py +0 -124
  303. warp/examples/sim/example_quadruped.py +0 -203
  304. warp/examples/sim/example_rigid_chain.py +0 -203
  305. warp/examples/sim/example_rigid_contact.py +0 -195
  306. warp/examples/sim/example_rigid_force.py +0 -133
  307. warp/examples/sim/example_rigid_gyroscopic.py +0 -115
  308. warp/examples/sim/example_rigid_soft_contact.py +0 -140
  309. warp/examples/sim/example_soft_body.py +0 -196
  310. warp/examples/tile/example_tile_walker.py +0 -327
  311. warp/sim/__init__.py +0 -74
  312. warp/sim/articulation.py +0 -793
  313. warp/sim/collide.py +0 -2570
  314. warp/sim/graph_coloring.py +0 -307
  315. warp/sim/import_mjcf.py +0 -791
  316. warp/sim/import_snu.py +0 -227
  317. warp/sim/import_urdf.py +0 -579
  318. warp/sim/import_usd.py +0 -898
  319. warp/sim/inertia.py +0 -357
  320. warp/sim/integrator.py +0 -245
  321. warp/sim/integrator_euler.py +0 -2000
  322. warp/sim/integrator_featherstone.py +0 -2101
  323. warp/sim/integrator_vbd.py +0 -2487
  324. warp/sim/integrator_xpbd.py +0 -3295
  325. warp/sim/model.py +0 -4821
  326. warp/sim/particles.py +0 -121
  327. warp/sim/render.py +0 -431
  328. warp/sim/utils.py +0 -431
  329. warp/tests/sim/disabled_kinematics.py +0 -244
  330. warp/tests/sim/test_cloth.py +0 -863
  331. warp/tests/sim/test_collision.py +0 -743
  332. warp/tests/sim/test_coloring.py +0 -347
  333. warp/tests/sim/test_inertia.py +0 -161
  334. warp/tests/sim/test_model.py +0 -226
  335. warp/tests/sim/test_sim_grad.py +0 -287
  336. warp/tests/sim/test_sim_grad_bounce_linear.py +0 -212
  337. warp/tests/sim/test_sim_kinematics.py +0 -98
  338. warp/thirdparty/__init__.py +0 -0
  339. warp_lang-1.9.1.dist-info/RECORD +0 -456
  340. /warp/{fem → _src/fem}/quadrature/__init__.py +0 -0
  341. /warp/{tests/sim → _src/thirdparty}/__init__.py +0 -0
  342. /warp/{thirdparty → _src/thirdparty}/appdirs.py +0 -0
  343. /warp/{thirdparty → _src/thirdparty}/dlpack.py +0 -0
  344. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/WHEEL +0 -0
  345. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/licenses/LICENSE.md +0 -0
  346. {warp_lang-1.9.1.dist-info → warp_lang-1.10.0rc2.dist-info}/top_level.txt +0 -0
warp/_src/build.py ADDED
@@ -0,0 +1,618 @@
1
+ # SPDX-FileCopyrightText: Copyright (c) 2022 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2
+ # SPDX-License-Identifier: Apache-2.0
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import ctypes
17
+ import errno
18
+ import hashlib
19
+ import json
20
+ import os
21
+ import time
22
+ from pathlib import Path
23
+
24
+ import warp._src.config
25
+ from warp._src.thirdparty import appdirs
26
+ from warp._src.types import *
27
+
28
# Numeric codes for each kind of link input (values taken from nvJitLink.h)
nvJitLink_input_type = {
    "cubin": 1,
    "ptx": 2,
    "ltoir": 3,
    "fatbin": 4,
    "object": 5,
    "library": 6,
}

# Parent of the directory that contains this file, with symlinks resolved
warp_home = os.path.realpath(os.path.join(os.path.dirname(__file__), os.pardir))
32
+
33
+
34
def build_cuda(
    cu_path,
    arch,
    output_path,
    config="release",
    verify_fp=False,
    fast_math=False,
    fuse_fp=True,
    lineinfo=False,
    compile_time_trace=False,
    ltoirs=None,
    fatbins=None,
) -> None:
    """Build a CUDA source file to PTX or CUBIN using NVRTC.

    The output type is determined by the extension of ``output_path``.

    Args:
        cu_path: Path of the CUDA source file to read (``str``).
        arch: Target architecture value forwarded to the native compiler.
        output_path: Path of the file to produce (``str``).
        config: Build configuration; ``"debug"`` enables the debug flag of
            the native call, any other value disables it.
        verify_fp: Forwarded unchanged to the native compiler.
        fast_math: Forwarded unchanged to the native compiler.
        fuse_fp: Forwarded unchanged to the native compiler.
        lineinfo: Forwarded unchanged to the native compiler.
        compile_time_trace: Forwarded unchanged to the native compiler.
        ltoirs: Optional collection of byte blobs passed as ``ltoir`` link inputs.
        fatbins: Optional collection of byte blobs passed as ``fatbin`` link inputs.

    Raises:
        Exception: If ``wp_cuda_compile_program`` returns a non-zero error code.
    """
    with open(cu_path, "rb") as cu_file:
        source_bytes = cu_file.read()

    encoded_cu_path = cu_path.encode("utf-8")
    encoded_program_name = os.path.basename(cu_path).encode("utf-8")
    encoded_include_dir = os.path.join(warp_home, "native").encode("utf-8")
    encoded_output_path = output_path.encode("utf-8")

    if warp._src.config.llvm_cuda:
        # NOTE(review): the result of this call is not inspected here
        warp._src.context.runtime.llvm.wp_compile_cuda(
            source_bytes, encoded_cu_path, encoded_include_dir, encoded_output_path, False
        )
        return

    if ltoirs is None:
        ltoirs = []
    if fatbins is None:
        fatbins = []

    # LTOIR blobs come first, fatbins second; the type tags below follow the same order
    blobs = [*ltoirs, *fatbins]
    blob_count = len(blobs)
    blob_kinds = [nvJitLink_input_type["ltoir"]] * len(ltoirs)
    blob_kinds += [nvJitLink_input_type["fatbin"]] * len(fatbins)

    c_blobs = (ctypes.c_char_p * blob_count)(*blobs)
    c_blob_sizes = (ctypes.c_size_t * blob_count)(*map(len, blobs))
    c_blob_kinds = (ctypes.c_int * blob_count)(*blob_kinds)

    status = warp._src.context.runtime.core.wp_cuda_compile_program(
        source_bytes,
        encoded_program_name,
        arch,
        encoded_include_dir,
        0,
        None,
        config == "debug",
        warp._src.config.verbose,
        verify_fp,
        fast_math,
        fuse_fp,
        lineinfo,
        compile_time_trace,
        encoded_output_path,
        blob_count,
        c_blobs,
        c_blob_sizes,
        c_blob_kinds,
    )
    if status != 0:
        raise Exception(f"CUDA kernel build failed with error code {status}")
94
+
95
+
96
def load_cuda(input_path, device):
    """Load a PTX or CUBIN file as a CUDA runtime module.

    The input type is determined by the extension of ``input_path``.

    Args:
        input_path: Path of the compiled module to load (``str``).
        device: Device object; must report ``is_cuda`` and expose ``context``.

    Returns:
        Whatever ``wp_cuda_load_module`` returns for the loaded module.

    Raises:
        RuntimeError: If ``device`` is not a CUDA device.
    """
    if device.is_cuda:
        encoded_path = input_path.encode("utf-8")
        return warp._src.context.runtime.core.wp_cuda_load_module(device.context, encoded_path)

    raise RuntimeError("Not a CUDA device")
102
+
103
+
104
def build_cpu(obj_path, cpp_path, mode="release", verify_fp=False, fast_math=False, fuse_fp=True):
    """Compile a C++ source file to an object file through the LLVM runtime.

    Args:
        obj_path: Path of the object file to produce (``str``).
        cpp_path: Path of the C++ source file to read (``str``).
        mode: Build mode; ``"debug"`` enables the debug flag of the native
            call, any other value disables it.
        verify_fp: Forwarded unchanged to the native compiler.
        fast_math: Accepted but not used by this function.
        fuse_fp: Forwarded unchanged to the native compiler.

    Raises:
        Exception: If ``wp_compile_cpp`` returns a non-zero error code.
    """
    with open(cpp_path, "rb") as source_file:
        source_bytes = source_file.read()

    encoded_cpp_path = cpp_path.encode("utf-8")
    encoded_include_dir = os.path.join(warp_home, "native").encode("utf-8")
    encoded_obj_path = obj_path.encode("utf-8")
    debug_build = mode == "debug"

    status = warp._src.context.runtime.llvm.wp_compile_cpp(
        source_bytes,
        encoded_cpp_path,
        encoded_include_dir,
        encoded_obj_path,
        debug_build,
        verify_fp,
        fuse_fp,
        # NOTE(review): read from warp.config while the rest of this file uses warp._src.config - confirm intended
        warp.config.enable_tiles_in_stack_memory,
    )
    if status != 0:
        raise Exception(f"CPU kernel build failed with error code {status}")
123
+
124
+
125
def init_kernel_cache(path=None):
    """Initialize the kernel cache directory.

    Warp calls this during initialization; it may also be called directly to
    move the cache somewhere else. The location is chosen in this order:

    1. ``path``, when given.
    2. The ``WARP_CACHE_PATH`` environment variable, when set.
    3. An OS-specific default user cache directory.

    To change the default cache location, set warp.config.kernel_cache_dir before calling warp.init().

    Args:
        path: Optional directory to use as the cache root.
    """
    env_override = os.environ.get("WARP_CACHE_PATH")

    if path is not None:
        root = os.path.realpath(path)
    elif env_override is not None:
        root = os.path.realpath(env_override)
    else:
        root = appdirs.user_cache_dir(appname="warp", appauthor="NVIDIA", version=warp._src.config.version)

    long_path_prefix = "\\\\?\\"
    on_windows = os.name == "nt"
    if on_windows and os.path.isabs(root) and not root.startswith(long_path_prefix):
        # Add the Windows long-path prefix, accounting for UNC shares
        if root.startswith("\\\\"):
            # UNC path: \\server\share\... becomes \\?\UNC\server\share\...
            root = long_path_prefix + "UNC\\" + root.lstrip("\\")
        else:
            # Drive-letter path: C:\... becomes \\?\C:\...
            root = long_path_prefix + root

    # Publish the chosen location, then make sure it exists on disk
    warp._src.config.kernel_cache_dir = root
    os.makedirs(root, exist_ok=True)
153
+
154
+
155
def clear_kernel_cache() -> None:
    """Clear the kernel cache directory of previously generated source code and compiler artifacts.

    Only directories whose names begin with ``wp_`` are deleted.
    Only the cache of the current Warp version is cleared.
    LTO artifacts are left untouched.
    """
    import shutil

    warp._src.context.init()

    assert (
        warp._src.context.runtime is not None
    ), "The kernel cache directory is not configured; wp.init() has not been called yet or failed."

    cache_dir = warp._src.config.kernel_cache_dir
    for entry in os.listdir(cache_dir):
        if not entry.startswith("wp_"):
            continue

        entry_path = os.path.join(cache_dir, entry)
        if os.path.isdir(entry_path):
            # Best-effort removal of the directory and everything below it
            shutil.rmtree(entry_path, ignore_errors=True)
175
+
176
+
177
def clear_lto_cache() -> None:
    """Clear the LTO cache directory of previously generated LTO code.

    The LTO cache lives in the ``lto`` subdirectory of the kernel cache directory.
    Only the cache of the current Warp version is cleared.
    """
    import shutil

    warp._src.context.init()

    assert (
        warp._src.context.runtime is not None
    ), "The kernel cache directory is not configured; wp.init() has not been called yet or failed."

    lto_dir = os.path.join(warp._src.config.kernel_cache_dir, "lto")
    if not os.path.isdir(lto_dir):
        return

    # Best-effort removal of the lto directory and everything below it
    shutil.rmtree(lto_dir, ignore_errors=True)
195
+
196
+
197
def safe_rename(src, dst, attempts=5, delay=0.1):
    """Rename ``src`` to ``dst``, retrying on transient failures.

    An already existing or non-empty destination is treated as "another
    process got there first" and the function returns quietly; the caller is
    expected to copy its outputs into the destination in a second step.

    Args:
        src: Path to rename.
        dst: Target path.
        attempts: Maximum number of rename attempts.
        delay: Seconds to sleep between failed attempts.

    Raises:
        OSError: If the final attempt fails for any reason other than the
            destination already existing or being non-empty.
    """
    final_attempt = attempts - 1

    for attempt in range(attempts):
        try:
            os.rename(src, dst)
        except FileExistsError:
            return
        except OSError as err:
            if err.errno == errno.ENOTEMPTY:
                # The destination directory is already populated, presumably by another process
                return

            # Any other failure (e.g. access denied): on Windows a rename can fail
            # occasionally because some process holds a lock on a file being moved,
            # so retry after a short pause before giving up
            if attempt >= final_attempt:
                print(
                    f"Could not update Warp cache with compiled binaries, trying to rename {src} to {dst}, error {err}"
                )
                raise err

            time.sleep(delay)
        else:
            return
222
+
223
+
224
def hash_symbol(symbol):
    """Return the SHA-256 hex digest of ``symbol`` encoded as UTF-8."""
    return hashlib.sha256(symbol.encode("utf-8")).hexdigest()
228
+
229
+
230
def get_lto_cache_dir():
    """Return the path of the ``lto`` subdirectory of the configured kernel cache directory."""
    return os.path.join(warp._src.config.kernel_cache_dir, "lto")
233
+
234
+
235
def get_cached_lto(path):
    """Read a cached LTO blob from disk.

    Args:
        path: Path of the cached file.

    Returns:
        The file contents as ``bytes``, or ``None`` when the file does not exist.
    """
    if not os.path.exists(path):
        return None

    try:
        with open(path, "rb") as f:
            return f.read()
    except FileNotFoundError:
        # The file was removed between the existence check and the open
        # (e.g. the cache was cleared by another process): report a cache miss
        return None
242
+
243
+
244
def get_cached_lto_meta(path, symbol):
    """Look up the cached metadata value stored for ``symbol``.

    The file at ``path`` is expected to hold a JSON object keyed by symbol name.
    Anything that prevents a value from being found is reported as a cache
    miss, so the caller can rebuild instead of failing.

    Args:
        path: Path of the JSON metadata file.
        symbol: Key to look up in the JSON object.

    Returns:
        The value stored under ``symbol``, or ``None`` when the file is
        missing, cannot be parsed as a JSON object, or has no entry for ``symbol``.
    """
    if not os.path.exists(path):
        return None

    try:
        with open(path) as f:
            keys = json.load(f)
    except FileNotFoundError:
        # Removed between the existence check and the open
        return None
    except ValueError:
        # Truncated or corrupt file (json.JSONDecodeError and decoding errors are ValueErrors)
        return None

    if not isinstance(keys, dict):
        return None

    # A file written for a different symbol simply has no entry for this one
    return keys.get(symbol)
252
+
253
+
254
def _build_lto_base(lto_symbol, compile_func, builder, extra_files=None):
    """Generic LTO build routine: cache lookup, compilation and cache update.

    Args:
        lto_symbol: Unique identifier for the LTO operation; its hash names the cache files.
        compile_func: Callable that compiles the specific LTO. It receives a
            dictionary mapping file extension to temporary output path and
            returns ``(success, outputs)`` where ``outputs`` is a dictionary
            keyed by the same extensions.
        builder: Builder object (not used inside this function).
        extra_files: Dictionary of additional file types to handle (e.g.,
            ``{".meta": None, ".fatbin": None}``). Each value is the function
            used to read the cached file, or ``None``. On a cache hit the
            values are overwritten in place with the cached data.

    Returns:
        Tuple whose first element is a success flag (``bool``) and whose second
        element is the LTO code as bytes (or ``None`` on failure). If
        ``extra_files`` is provided, additional elements follow in the same
        order as the keys in ``extra_files``:
            - ``".meta"``: int (shared memory bytes).
            - ``"_fatbin.lto"``: bytes (universal fatbin).
    """
    if extra_files is None:
        extra_files = {}

    # Cache file names are the first 7 hex digits of the symbol hash plus the extension
    name_stem = hash_symbol(lto_symbol)[:7]
    cache_dir = get_lto_cache_dir()

    final_paths = {".lto": os.path.join(cache_dir, f"{name_stem}.lto")}
    for ext in extra_files:
        final_paths[ext] = os.path.join(cache_dir, f"{name_stem}{ext}")

    # Serve everything from the on-disk cache when all required files are present
    cached_lto = get_cached_lto(final_paths[".lto"])
    if cached_lto is not None:
        cache_complete = True
        for ext, getter in extra_files.items():
            if not getter:
                # No reader for this extension: nothing to fetch
                continue
            if not os.path.exists(final_paths[ext]):
                cache_complete = False
                break
            cached_extra = getter(final_paths[ext])
            if cached_extra is None:
                cache_complete = False
                break
            extra_files[ext] = cached_extra

        if cache_complete:
            return (True, cached_lto, *extra_files.values())

    # Build into a process-dependent temporary directory
    build_dir = f"{cache_dir}_p{os.getpid()}"
    Path(build_dir).mkdir(parents=True, exist_ok=True)

    staged_paths = {ext: os.path.join(build_dir, os.path.basename(final)) for ext, final in final_paths.items()}

    # Compile the LTO with the specialized function
    result, outputs = compile_func(staged_paths)

    if result:
        # Move the whole build directory into place
        safe_rename(build_dir, cache_dir)

        # If the directory could not be moved by a rename, move the outputs one by one
        if os.path.exists(cache_dir):
            for ext, final in final_paths.items():
                if os.path.exists(final):
                    continue
                try:
                    os.rename(staged_paths[ext], final)
                except OSError:
                    # another process likely updated the lto dir first
                    pass
    else:
        # Remove partial outputs and report every requested output as missing
        for staged in staged_paths.values():
            staged_file = Path(staged)
            if staged_file.exists():
                staged_file.unlink()

        outputs[".lto"] = None
        for ext in extra_files:
            outputs[ext] = None

    # Clean up the temporary build directory
    import shutil

    shutil.rmtree(build_dir, ignore_errors=True)

    return (result, outputs[".lto"], *[outputs[ext] for ext in extra_files])
358
+
359
+
360
def build_lto_dot(M, N, K, adtype, bdtype, cdtype, alayout, blayout, clayout, arch, num_threads, builder):
    """Build (or fetch from cache) the LTO code for a tile matrix multiply.

    Args:
        M, N, K: Problem dimensions forwarded to the native compile call.
        adtype, bdtype, cdtype: Element types of A, B and C; each must be one of
            ``float16``, ``float32``, ``float64``, ``vec2h``, ``vec2f``, ``vec2d``.
        alayout, blayout, clayout: ``"colmajor"`` or ``"rowmajor"`` for A, B and C.
        arch: Target architecture; values above 121 are replaced by 120.
        num_threads: Thread count forwarded to the native compile call.
        builder: Object whose ``ltoirs`` and ``ltoirs_decl`` dictionaries are
            consulted and updated for the generated symbol.

    Returns:
        Tuple ``(lto_symbol, lto_code_data)``.

    Raises:
        TypeError: If a dtype is unsupported, or the inputs are not all real or all complex.
        ValueError: If a layout is unsupported.
        RuntimeError: If the LTO compilation fails.
    """
    if arch > 121:
        arch = 120

    # Maps Python/Warp types to (C++ type name, precision enum, type enum)
    def cublasdx_type_map(dtype):
        supported = (
            (float16, ("wp::float16", 3, 0)),
            (float32, ("wp::float32", 5, 0)),
            (float64, ("wp::float64", 6, 0)),
            (vec2h, ("wp::vec2h", 3, 1)),
            (vec2f, ("wp::vec2f", 5, 1)),
            (vec2d, ("wp::vec2d", 6, 1)),
        )
        for candidate, mapping in supported:
            if dtype == candidate:
                return mapping
        raise TypeError("Unsupported input type in tile_matmul")

    def cublasdx_arrangement_map(layout):
        if layout == "rowmajor":
            return 1  # CUBLASDX_ARRANGEMENT_ROW_MAJOR
        if layout == "colmajor":
            return 0  # CUBLASDX_ARRANGEMENT_COL_MAJOR
        raise ValueError("Unsupported layout in tile_matmul")

    a_dtype, a_prec, a_type = cublasdx_type_map(adtype)
    b_dtype, b_prec, b_type = cublasdx_type_map(bdtype)
    c_dtype, c_prec, c_type = cublasdx_type_map(cdtype)
    a_arrangement = cublasdx_arrangement_map(alayout)
    b_arrangement = cublasdx_arrangement_map(blayout)
    c_arrangement = cublasdx_arrangement_map(clayout)

    if not (a_type == b_type and a_type == c_type):
        raise TypeError("tile_matmul(A, B, C) requires all inputs to be real or complex")

    element_type = a_type

    lto_symbol = f"dot_{M}_{N}_{K}_{arch}_{num_threads}_{a_arrangement}_{b_arrangement}_{c_arrangement}_{a_prec}_{b_prec}_{c_prec}_{element_type}"

    def compile_lto_dot(temp_paths):
        lto_out_path = temp_paths[".lto"]
        succeeded = warp._src.context.runtime.core.wp_cuda_compile_dot(
            lto_out_path.encode("utf-8"),
            lto_symbol.encode("utf-8"),
            0,
            None,
            None,
            arch,
            M,
            N,
            K,
            a_prec,
            b_prec,
            c_prec,
            element_type,
            a_arrangement,
            b_arrangement,
            c_arrangement,
            num_threads,
        )
        if not succeeded:
            return False, {}

        # Read back what the native call wrote to the temporary path
        with open(lto_out_path, "rb") as f:
            return True, {".lto": f.read()}

    if lto_symbol not in builder.ltoirs:
        (result, lto_code_data) = _build_lto_base(lto_symbol, compile_lto_dot, builder, {})

        if not result:
            raise RuntimeError(
                f"Failed to compile LTO '{lto_symbol}'. "
                "Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details."
            )
    else:
        # Early out: already cached in the module
        lto_code_data = builder.ltoirs[lto_symbol]

    # Update builder
    builder.ltoirs[lto_symbol] = lto_code_data
    builder.ltoirs_decl[lto_symbol] = (
        f"void {lto_symbol}({c_dtype}*, {a_dtype}*, {b_dtype}*, {c_dtype}*, {c_dtype}*);"
    )

    return lto_symbol, lto_code_data
446
+
447
+
448
def build_lto_solver(
    M,
    N,
    NRHS,
    solver,
    solver_enum,
    side_enum,
    diag_enum,
    alayout,
    blayout,
    fill_mode,
    arch,
    precision_enum,
    num_threads,
    parameter_list,
    builder,
    smem_estimate_bytes=None,
):
    """Build, or fetch from the module cache, the LTO code for a cuSolverDx tile solver.

    The solver configuration is encoded into a unique symbol name. If that symbol is
    already present in ``builder.ltoirs`` the cached code is returned immediately;
    otherwise the code is produced through ``_build_lto_base`` and registered on the
    builder (``ltoirs``, ``ltoirs_decl`` and ``fatbins``).

    Args:
        M, N, NRHS: Problem dimensions, forwarded to ``wp_cuda_compile_solver`` and
            embedded in the symbol name.
        solver: Solver name used as the symbol prefix.
        solver_enum: Solver selector forwarded to the native compile call.
        side_enum: Side selector forwarded to the native compile call; a negative
            value is rendered as ``x`` in the symbol name.
        diag_enum: Diagonal selector forwarded to the native compile call; a negative
            value is rendered as ``x`` in the symbol name.
        alayout, blayout: ``"colmajor"`` or ``"rowmajor"``.
        fill_mode: Fill-mode selector forwarded to the native compile call.
        arch: Target architecture; values above 121 are compiled as 120.
        precision_enum: Precision selector forwarded to the native compile call.
        num_threads: Thread count forwarded to the native compile call.
        parameter_list: Text appended to ``void <symbol>`` to form the declaration
            stored in ``builder.ltoirs_decl``.
        builder: Object exposing the ``ltoirs``, ``ltoirs_decl`` and ``fatbins`` mappings.
        smem_estimate_bytes: Optional shared-memory estimate, used only to enrich the
            error message when compilation fails.

    Returns:
        A ``(lto_symbol, lto_code_data)`` tuple.

    Raises:
        ValueError: If ``alayout`` or ``blayout`` is not a supported layout.
        RuntimeError: If the LTO compilation fails.
    """
    # NOTE(review): architectures above 121 are clamped to 120, presumably because
    # the library has no newer target yet -- confirm.
    arch = 120 if arch > 121 else arch

    def cusolverdx_arrangement_map(layout):
        if layout == "colmajor":
            return 0  # CUSOLVERDX_ARRANGEMENT_COL_MAJOR
        if layout == "rowmajor":
            return 1  # CUSOLVERDX_ARRANGEMENT_ROW_MAJOR
        # Name the solver rather than tile_matmul so the error points at the right operation.
        raise ValueError(f"Unsupported layout {layout!r} in tile solver '{solver}'")

    a_arrangement = cusolverdx_arrangement_map(alayout)
    b_arrangement = cusolverdx_arrangement_map(blayout)

    lto_symbol = f"{solver}_{M}_{N}_{NRHS}_{arch}_{num_threads}_{a_arrangement}_{b_arrangement}_{precision_enum}_{side_enum if side_enum >= 0 else 'x'}_{diag_enum if diag_enum >= 0 else 'x'}_{fill_mode}"

    def compile_lto_solver(temp_paths):
        # compile LTO
        result = warp._src.context.runtime.core.wp_cuda_compile_solver(
            temp_paths["_fatbin.lto"].encode("utf-8"),
            temp_paths[".lto"].encode("utf-8"),
            lto_symbol.encode("utf-8"),
            0,
            None,
            None,
            arch,
            M,
            N,
            NRHS,
            solver_enum,
            side_enum,
            diag_enum,
            precision_enum,
            a_arrangement,
            b_arrangement,
            fill_mode,
            num_threads,
        )

        if not result:
            return False, {}

        # Read back both artifacts written by the native compile call.
        with open(temp_paths[".lto"], "rb") as f:
            lto_code_data = f.read()
        with open(temp_paths["_fatbin.lto"], "rb") as f:
            universal_fatbin_code_data = f.read()
        return True, {".lto": lto_code_data, "_fatbin.lto": universal_fatbin_code_data}

    # Early out if already cached in module
    if lto_symbol in builder.ltoirs:
        return lto_symbol, builder.ltoirs[lto_symbol]

    (result, lto_code_data, universal_fatbin_code_data) = _build_lto_base(
        lto_symbol, compile_lto_solver, builder, {"_fatbin.lto": get_cached_lto}
    )

    if not result:
        toolkit_version = warp._src.context.runtime.toolkit_version
        # Check the toolkit first: that error never uses the hint, so there is no
        # reason to enumerate devices before raising it.
        if toolkit_version < (12, 6):
            raise RuntimeError(
                "cuSolverDx requires CUDA Toolkit 12.6.3 or later. This version of Warp was built against CUDA Toolkit "
                f"{toolkit_version[0]}.{toolkit_version[1]}. "
                "Upgrade your CUDA Toolkit and rebuild Warp, or install a Warp wheel built with CUDA >= 12.6.3."
            )

        # Join the parts so an empty hint does not leave a double space in the message.
        message_parts = [f"Failed to compile LTO '{lto_symbol}'."]
        hint = _solver_smem_hint(arch, smem_estimate_bytes)
        if hint:
            message_parts.append(hint)
        message_parts.append("Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details.")
        raise RuntimeError(" ".join(message_parts))

    # Update builder
    builder.ltoirs[lto_symbol] = lto_code_data
    builder.ltoirs_decl[lto_symbol] = f"void {lto_symbol}{parameter_list};"
    builder.fatbins[lto_symbol] = universal_fatbin_code_data

    return lto_symbol, lto_code_data


def _solver_smem_hint(arch, smem_estimate_bytes):
    """Return a shared-memory hint for a failed solver compile, or ``""`` if none applies."""
    if not smem_estimate_bytes:
        return ""

    # Fallback limit used when no matching device reports a value (232448 B = 227 KiB).
    max_smem_bytes = 232448
    max_smem_is_estimate = True
    for d in warp.get_cuda_devices():
        if d.arch == arch:
            # We can directly query the max shared memory for this device
            queried_bytes = warp._src.context.runtime.core.wp_cuda_get_max_shared_memory(d.context)
            if queried_bytes > 0:
                max_smem_bytes = queried_bytes
                max_smem_is_estimate = False
                break

    if smem_estimate_bytes <= max_smem_bytes:
        return ""

    source = "estimated limit" if max_smem_is_estimate else "device-reported limit"
    return (
        f"Estimated shared memory requirement is {smem_estimate_bytes}B, "
        f"but the {source} is {max_smem_bytes}B. "
        "The tile size(s) may be too large for this device."
    )
558
+
559
+
560
def build_lto_fft(arch, size, ept, direction, dir, precision, builder):
    """Build, or fetch from the module cache, the LTO code for a tile FFT.

    The configuration is encoded into a unique symbol name. If the symbol is already
    present in both ``builder.ltoirs`` and ``builder.shared_memory_bytes`` the cached
    values are returned; otherwise the code is produced through ``_build_lto_base``
    and registered on the builder.

    Args:
        arch: Target architecture; values above 121 are compiled as 120.
        size: FFT size, forwarded to ``wp_cuda_compile_fft`` and embedded in the symbol name.
        ept: Forwarded to ``wp_cuda_compile_fft`` and embedded in the symbol name
            (NOTE(review): presumably elements per thread -- confirm).
        direction: Direction label used only in the symbol name.
        dir: Direction value forwarded to the native compile call.
        precision: Precision selector, forwarded to the native compile call and
            embedded in the symbol name.
        builder: Object exposing the ``ltoirs`` and ``shared_memory_bytes`` mappings.

    Returns:
        A ``(lto_symbol, lto_code_data, shared_memory_bytes)`` tuple.

    Raises:
        RuntimeError: If the LTO compilation fails.
    """
    # NOTE(review): architectures above 121 are clamped to 120, presumably because
    # the library has no newer target yet -- confirm.
    arch = 120 if arch > 121 else arch

    lto_symbol = f"fft_{size}_{ept}_{arch}_{direction}_{precision}"

    def compile_lto_fft(temp_paths):
        # Out-parameter filled in by the native compile call.
        shared_memory_size = ctypes.c_int(0)

        result = warp._src.context.runtime.core.wp_cuda_compile_fft(
            temp_paths[".lto"].encode("utf-8"),
            lto_symbol.encode("utf-8"),
            0,
            None,
            None,
            arch,
            size,
            ept,
            dir,
            precision,
            ctypes.byref(shared_memory_size),
        )

        if not result:
            return False, {}

        with open(temp_paths[".lto"], "rb") as f:
            lto_code_data = f.read()

        shared_memory_bytes = tile.round_up(shared_memory_size.value)

        # output meta file with shared memory requirements for this lto_symbol
        meta = {lto_symbol: shared_memory_bytes}
        with open(temp_paths[".meta"], "w") as meta_file:
            json.dump(meta, meta_file)

        return True, {".lto": lto_code_data, ".meta": shared_memory_bytes}

    # Early out if already cached in module
    if lto_symbol in builder.ltoirs and lto_symbol in builder.shared_memory_bytes:
        return lto_symbol, builder.ltoirs[lto_symbol], builder.shared_memory_bytes[lto_symbol]

    (result, lto_code_data, shared_memory_bytes) = _build_lto_base(
        lto_symbol, compile_lto_fft, builder, {".meta": lambda path: get_cached_lto_meta(path, lto_symbol)}
    )

    if not result:
        # The two sentences need a separating space; without it the message read "'.Set the ...".
        raise RuntimeError(
            f"Failed to compile LTO '{lto_symbol}'. "
            "Set the environment variable LIBMATHDX_LOG_LEVEL=5 and rerun for more details."
        )

    # Update builder
    builder.ltoirs[lto_symbol] = lto_code_data
    builder.shared_memory_bytes[lto_symbol] = shared_memory_bytes

    return lto_symbol, lto_code_data, shared_memory_bytes